summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
commite2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
treeb97a90170c45211bcc437761653aa8016c34afcd /drivers
parentabc36be236358162202e86ad88616ff95a755101 (diff)
parenta04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
downloadop-kernel-dev-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.zip
op-kernel-dev-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.tar.gz
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe: "This is the main pull request for block storage for 4.15-rc1. Nothing out of the ordinary in here, and no API changes or anything like that. Just various new features for drivers, core changes, etc. In particular, this pull request contains: - A patch series from Bart, closing the whole on blk/scsi-mq queue quescing. - A series from Christoph, building towards hidden gendisks (for multipath) and ability to move bio chains around. - NVMe - Support for native multipath for NVMe (Christoph). - Userspace notifications for AENs (Keith). - Command side-effects support (Keith). - SGL support (Chaitanya Kulkarni) - FC fixes and improvements (James Smart) - Lots of fixes and tweaks (Various) - bcache - New maintainer (Michael Lyle) - Writeback control improvements (Michael) - Various fixes (Coly, Elena, Eric, Liang, et al) - lightnvm updates, mostly centered around the pblk interface (Javier, Hans, and Rakesh). - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph) - Writeback series that fix the much discussed hundreds of millions of sync-all units. This goes all the way, as discussed previously (me). - Fix for missing wakeup on writeback timer adjustments (Yafang Shao). - Fix laptop mode on blk-mq (me). - {mq,name} tupple lookup for IO schedulers, allowing us to have alias names. This means you can use 'deadline' on both !mq and on mq (where it's called mq-deadline). (me). - blktrace race fix, oopsing on sg load (me). - blk-mq optimizations (me). - Obscure waitqueue race fix for kyber (Omar). - NBD fixes (Josef). - Disable writeback throttling by default on bfq, like we do on cfq (Luca Miccio). - Series from Ming that enable us to treat flush requests on blk-mq like any other request. This is a really nice cleanup. - Series from Ming that improves merging on blk-mq with schedulers, getting us closer to flipping the switch on scsi-mq again. - BFQ updates (Paolo). - blk-mq atomic flags memory ordering fixes (Peter Z). - Loop cgroup support (Shaohua). - Lots of minor fixes from lots of different folks, both for core and driver code" * 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits) nvme: fix visibility of "uuid" ns attribute blk-mq: fixup some comment typos and lengths ide: ide-atapi: fix compile error with defining macro DEBUG blk-mq: improve tag waiting setup for non-shared tags brd: remove unused brd_mutex blk-mq: only run the hardware queue if IO is pending block: avoid null pointer dereference on null disk fs: guard_bio_eod() needs to consider partitions xtensa/simdisk: fix compile error nvme: expose subsys attribute to sysfs nvme: create 'slaves' and 'holders' entries for hidden controllers block: create 'slaves' and 'holders' entries for hidden gendisks nvme: also expose the namespace identification sysfs files for mpath nodes nvme: implement multipath access to nvme subsystems nvme: track shared namespaces nvme: introduce a nvme_ns_ids structure nvme: track subsystems block, nvme: Introduce blk_mq_req_flags_t block, scsi: Make SCSI quiesce and resume work reliably block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/Kconfig5
-rw-r--r--drivers/block/brd.c1
-rw-r--r--drivers/block/cryptoloop.c2
-rw-r--r--drivers/block/loop.c13
-rw-r--r--drivers/block/loop.h1
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c7
-rw-r--r--drivers/block/nbd.c26
-rw-r--r--drivers/block/null_blk.c10
-rw-r--r--drivers/block/paride/Kconfig1
-rw-r--r--drivers/block/skd_main.c3
-rw-r--r--drivers/cdrom/Makefile15
-rw-r--r--drivers/ide/Kconfig2
-rw-r--r--drivers/ide/ide-atapi.c6
-rw-r--r--drivers/ide/ide-pm.c4
-rw-r--r--drivers/lightnvm/Kconfig3
-rw-r--r--drivers/lightnvm/core.c176
-rw-r--r--drivers/lightnvm/pblk-cache.c24
-rw-r--r--drivers/lightnvm/pblk-core.c512
-rw-r--r--drivers/lightnvm/pblk-gc.c289
-rw-r--r--drivers/lightnvm/pblk-init.c197
-rw-r--r--drivers/lightnvm/pblk-map.c28
-rw-r--r--drivers/lightnvm/pblk-rb.c30
-rw-r--r--drivers/lightnvm/pblk-read.c274
-rw-r--r--drivers/lightnvm/pblk-recovery.c129
-rw-r--r--drivers/lightnvm/pblk-rl.c43
-rw-r--r--drivers/lightnvm/pblk-sysfs.c2
-rw-r--r--drivers/lightnvm/pblk-write.c229
-rw-r--r--drivers/lightnvm/pblk.h132
-rw-r--r--drivers/md/bcache/alloc.c15
-rw-r--r--drivers/md/bcache/bcache.h19
-rw-r--r--drivers/md/bcache/btree.c17
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/closure.h6
-rw-r--r--drivers/md/bcache/request.c36
-rw-r--r--drivers/md/bcache/super.c52
-rw-r--r--drivers/md/bcache/sysfs.c28
-rw-r--r--drivers/md/bcache/util.c10
-rw-r--r--drivers/md/bcache/util.h4
-rw-r--r--drivers/md/bcache/writeback.c117
-rw-r--r--drivers/md/bcache/writeback.h6
-rw-r--r--drivers/md/bitmap.c2
-rw-r--r--drivers/md/dm-rq.c2
-rw-r--r--drivers/md/dm-table.c15
-rw-r--r--drivers/md/dm.c11
-rw-r--r--drivers/nvme/Kconfig4
-rw-r--r--drivers/nvme/host/Kconfig9
-rw-r--r--drivers/nvme/host/Makefile1
-rw-r--r--drivers/nvme/host/core.c1301
-rw-r--r--drivers/nvme/host/fabrics.c16
-rw-r--r--drivers/nvme/host/fabrics.h14
-rw-r--r--drivers/nvme/host/fc.c793
-rw-r--r--drivers/nvme/host/lightnvm.c86
-rw-r--r--drivers/nvme/host/multipath.c291
-rw-r--r--drivers/nvme/host/nvme.h169
-rw-r--r--drivers/nvme/host/pci.c243
-rw-r--r--drivers/nvme/host/rdma.c246
-rw-r--r--drivers/nvme/target/admin-cmd.c21
-rw-r--r--drivers/nvme/target/core.c23
-rw-r--r--drivers/nvme/target/fc.c48
-rw-r--r--drivers/nvme/target/io-cmd.c20
-rw-r--r--drivers/nvme/target/loop.c66
-rw-r--r--drivers/nvme/target/nvmet.h6
-rw-r--r--drivers/nvme/target/rdma.c16
-rw-r--r--drivers/scsi/Kconfig3
-rw-r--r--drivers/scsi/lpfc/lpfc_attr.c5
-rw-r--r--drivers/scsi/scsi_lib.c99
-rw-r--r--drivers/scsi/sg.c2
67 files changed, 3884 insertions, 2104 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 7b2df7a..923b417 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -68,9 +68,13 @@ config AMIGA_Z2RAM
To compile this driver as a module, choose M here: the
module will be called z2ram.
+config CDROM
+ tristate
+
config GDROM
tristate "SEGA Dreamcast GD-ROM drive"
depends on SH_DREAMCAST
+ select CDROM
select BLK_SCSI_REQUEST # only for the generic cdrom code
help
A standard SEGA Dreamcast comes with a modified CD ROM drive called a
@@ -348,6 +352,7 @@ config BLK_DEV_RAM_DAX
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media (DEPRECATED)"
depends on !UML
+ select CDROM
select BLK_SCSI_REQUEST
help
Note: This driver is deprecated and will be removed from the
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2d7178f..c1cf877 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -60,7 +60,6 @@ struct brd_device {
/*
* Look up and return a brd's page for a given sector.
*/
-static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
pgoff_t idx;
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
index 74e03aa..7033a4b 100644
--- a/drivers/block/cryptoloop.c
+++ b/drivers/block/cryptoloop.c
@@ -43,7 +43,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
int cipher_len;
int mode_len;
char cms[LO_NAME_SIZE]; /* cipher-mode string */
- char *cipher;
char *mode;
char *cmsp = cms; /* c-m string pointer */
struct crypto_skcipher *tfm;
@@ -56,7 +55,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE);
cms[LO_NAME_SIZE - 1] = 0;
- cipher = cmsp;
cipher_len = strcspn(cmsp, "-");
mode = cmsp + cipher_len;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 85de673..bc8e615 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -476,6 +476,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+ if (cmd->css)
+ css_put(cmd->css);
cmd->ret = ret;
lo_rw_aio_do_completion(cmd);
}
@@ -535,6 +537,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
cmd->iocb.ki_filp = file;
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
+ if (cmd->css)
+ kthread_associate_blkcg(cmd->css);
if (rw == WRITE)
ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -542,6 +546,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
ret = call_read_iter(file, &cmd->iocb, &iter);
lo_rw_aio_do_completion(cmd);
+ kthread_associate_blkcg(NULL);
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -1686,6 +1691,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
break;
}
+ /* always use the first bio's css */
+#ifdef CONFIG_BLK_CGROUP
+ if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
+ cmd->css = cmd->rq->bio->bi_css;
+ css_get(cmd->css);
+ } else
+#endif
+ cmd->css = NULL;
kthread_queue_work(&lo->worker, &cmd->work);
return BLK_STS_OK;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 1f39567..0f45416 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -72,6 +72,7 @@ struct loop_cmd {
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
+ struct cgroup_subsys_state *css;
};
/* Support for loadable transfer modules */
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 4a3cfc7..b8af735 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -887,12 +887,9 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
static bool mtip_pause_ncq(struct mtip_port *port,
struct host_to_dev_fis *fis)
{
- struct host_to_dev_fis *reply;
unsigned long task_file_data;
- reply = port->rxfis + RX_FIS_D2H_REG;
task_file_data = readl(port->mmio+PORT_TFDATA);
-
if ((task_file_data & 1))
return false;
@@ -1020,7 +1017,6 @@ static int mtip_exec_internal_command(struct mtip_port *port,
.opts = opts
};
int rv = 0;
- unsigned long start;
/* Make sure the buffer is 8 byte aligned. This is asic specific. */
if (buffer & 0x00000007) {
@@ -1057,7 +1053,6 @@ static int mtip_exec_internal_command(struct mtip_port *port,
/* Copy the command to the command table */
memcpy(int_cmd->command, fis, fis_len*4);
- start = jiffies;
rq->timeout = timeout;
/* insert request and run queue */
@@ -3015,7 +3010,6 @@ static int mtip_hw_init(struct driver_data *dd)
{
int i;
int rv;
- unsigned int num_command_slots;
unsigned long timeout, timetaken;
dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
@@ -3025,7 +3019,6 @@ static int mtip_hw_init(struct driver_data *dd)
rv = -EIO;
goto out1;
}
- num_command_slots = dd->slot_groups * 32;
hba_setup(dd);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9adfb54..5f2a424 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -288,15 +288,6 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
cmd->status = BLK_STS_TIMEOUT;
return BLK_EH_HANDLED;
}
-
- /* If we are waiting on our dead timer then we could get timeout
- * callbacks for our request. For this we just want to reset the timer
- * and let the queue side take care of everything.
- */
- if (!completion_done(&cmd->send_complete)) {
- nbd_config_put(nbd);
- return BLK_EH_RESET_TIMER;
- }
config = nbd->config;
if (config->num_connections > 1) {
@@ -723,9 +714,9 @@ static int wait_for_reconnect(struct nbd_device *nbd)
return 0;
if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
return 0;
- wait_event_interruptible_timeout(config->conn_wait,
- atomic_read(&config->live_connections),
- config->dead_conn_timeout);
+ wait_event_timeout(config->conn_wait,
+ atomic_read(&config->live_connections),
+ config->dead_conn_timeout);
return atomic_read(&config->live_connections);
}
@@ -740,6 +731,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
if (!refcount_inc_not_zero(&nbd->config_refs)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Socks array is empty\n");
+ blk_mq_start_request(req);
return -EINVAL;
}
config = nbd->config;
@@ -748,6 +740,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Attempted send on invalid socket\n");
nbd_config_put(nbd);
+ blk_mq_start_request(req);
return -EINVAL;
}
cmd->status = BLK_STS_OK;
@@ -771,6 +764,7 @@ again:
*/
sock_shutdown(nbd);
nbd_config_put(nbd);
+ blk_mq_start_request(req);
return -EIO;
}
goto again;
@@ -781,6 +775,7 @@ again:
* here so that it gets put _after_ the request that is already on the
* dispatch list.
*/
+ blk_mq_start_request(req);
if (unlikely(nsock->pending && nsock->pending != req)) {
blk_mq_requeue_request(req, true);
ret = 0;
@@ -793,10 +788,10 @@ again:
ret = nbd_send_cmd(nbd, cmd, index);
if (ret == -EAGAIN) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Request send failed trying another connection\n");
+ "Request send failed, requeueing\n");
nbd_mark_nsock_dead(nbd, nsock, 1);
- mutex_unlock(&nsock->tx_lock);
- goto again;
+ blk_mq_requeue_request(req, true);
+ ret = 0;
}
out:
mutex_unlock(&nsock->tx_lock);
@@ -820,7 +815,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
* done sending everything over the wire.
*/
init_completion(&cmd->send_complete);
- blk_mq_start_request(bd->rq);
/* We can be called directly from the user space process, which means we
* could possibly have signals pending so our sendmsg will fail. In
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cda69db..c61960d 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -154,6 +154,10 @@ enum {
NULL_Q_MQ = 2,
};
+static int g_no_sched;
+module_param_named(no_sched, g_no_sched, int, S_IRUGO);
+MODULE_PARM_DESC(no_sched, "No io scheduler");
+
static int g_submit_queues = 1;
module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");
@@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
set->cmd_size = sizeof(struct nullb_cmd);
set->flags = BLK_MQ_F_SHOULD_MERGE;
+ if (g_no_sched)
+ set->flags |= BLK_MQ_F_NO_SCHED;
set->driver_data = NULL;
if ((nullb && nullb->dev->blocking) || g_blocking)
@@ -1985,8 +1991,10 @@ static int __init null_init(void)
for (i = 0; i < nr_devices; i++) {
dev = null_alloc_dev();
- if (!dev)
+ if (!dev) {
+ ret = -ENOMEM;
goto err_dev;
+ }
ret = null_add_dev(dev);
if (ret) {
null_free_dev(dev);
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
index b226835a9..f8bd6ef 100644
--- a/drivers/block/paride/Kconfig
+++ b/drivers/block/paride/Kconfig
@@ -26,6 +26,7 @@ config PARIDE_PD
config PARIDE_PCD
tristate "Parallel port ATAPI CD-ROMs"
depends on PARIDE
+ select CDROM
select BLK_SCSI_REQUEST # only for the generic cdrom code
---help---
This option enables the high-level driver for ATAPI CD-ROM devices
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 64d0fc1..2819f23 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -1967,7 +1967,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev)
break;
case FIT_MTD_CMD_LOG_HOST_ID:
- skdev->connect_time_stamp = get_seconds();
+ /* hardware interface overflows in y2106 */
+ skdev->connect_time_stamp = (u32)ktime_get_real_seconds();
data = skdev->connect_time_stamp & 0xFFFF;
mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile
index a95566f..0f3664b 100644
--- a/drivers/cdrom/Makefile
+++ b/drivers/cdrom/Makefile
@@ -1,14 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
-# Makefile for the kernel cdrom device drivers.
-#
-# 30 Jan 1998, Michael Elizabeth Chastain, <mailto:mec@shout.net>
-# Rewritten to use lists instead of if-statements.
-
-# Each configuration option enables a list of files.
-
-obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o
-obj-$(CONFIG_BLK_DEV_SR) += cdrom.o
-obj-$(CONFIG_PARIDE_PCD) += cdrom.o
-obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o
-
-obj-$(CONFIG_GDROM) += gdrom.o cdrom.o
+obj-$(CONFIG_CDROM) += cdrom.o
+obj-$(CONFIG_GDROM) += gdrom.o
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index c99a25c..cf1fb3f 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -117,7 +117,9 @@ config BLK_DEV_DELKIN
config BLK_DEV_IDECD
tristate "Include IDE/ATAPI CDROM support"
+ depends on BLK_DEV
select IDE_ATAPI
+ select CDROM
---help---
If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is
a newer protocol used by IDE CD-ROM and TAPE drives, similar to the
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 14d1e7d..0e6bc63 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -282,7 +282,7 @@ int ide_cd_expiry(ide_drive_t *drive)
struct request *rq = drive->hwif->rq;
unsigned long wait = 0;
- debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]);
+ debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]);
/*
* Some commands are *slow* and normally take a long time to complete.
@@ -463,7 +463,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
return ide_do_reset(drive);
}
- debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
+ debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]);
/* Retry operation */
ide_retry_pc(drive);
@@ -531,7 +531,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
ide_pad_transfer(drive, write, bcount);
debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n",
- rq->cmd[0], done, bcount, scsi_req(rq)->resid_len);
+ scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len);
/* And set the interrupt handler again */
ide_set_handler(drive, ide_pc_intr, timeout);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index dccdca9..ad8a125 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -90,9 +90,9 @@ int generic_ide_resume(struct device *dev)
}
memset(&rqpm, 0, sizeof(rqpm));
- rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+ rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN,
+ BLK_MQ_REQ_PREEMPT);
ide_req(rq)->type = ATA_PRIV_PM_RESUME;
- rq->rq_flags |= RQF_PREEMPT;
rq->special = &rqpm;
rqpm.pm_step = IDE_PM_START_RESUME;
rqpm.pm_state = PM_EVENT_ON;
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index ead61a9..2a953ef 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -4,7 +4,8 @@
menuconfig NVM
bool "Open-Channel SSD target support"
- depends on BLOCK && HAS_DMA
+ depends on BLOCK && HAS_DMA && PCI
+ select BLK_DEV_NVME
help
Say Y here to get to enable Open-channel SSDs.
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index ddae430..83249b4 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/sem.h>
#include <linux/bitmap.h>
+#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/miscdevice.h>
#include <linux/lightnvm.h>
@@ -138,7 +139,6 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
int prev_nr_luns;
int i, j;
- nr_chnls = nr_luns / dev->geo.luns_per_chnl;
nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
@@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = {
.owner = THIS_MODULE,
};
+static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
+{
+ struct nvm_tgt_type *tmp, *tt = NULL;
+
+ if (lock)
+ down_write(&nvm_tgtt_lock);
+
+ list_for_each_entry(tmp, &nvm_tgt_types, list)
+ if (!strcmp(name, tmp->name)) {
+ tt = tmp;
+ break;
+ }
+
+ if (lock)
+ up_write(&nvm_tgtt_lock);
+ return tt;
+}
+
static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
{
struct nvm_ioctl_create_simple *s = &create->conf.s;
@@ -316,6 +334,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
list_add_tail(&t->list, &dev->targets);
mutex_unlock(&dev->mlock);
+ __module_get(tt->owner);
+
return 0;
err_sysfs:
if (tt->exit)
@@ -351,6 +371,7 @@ static void __nvm_remove_target(struct nvm_target *t)
nvm_remove_tgt_dev(t->dev, 1);
put_disk(tdisk);
+ module_put(t->type->owner);
list_del(&t->list);
kfree(t);
@@ -532,25 +553,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
}
EXPORT_SYMBOL(nvm_part_to_tgt);
-struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
-{
- struct nvm_tgt_type *tmp, *tt = NULL;
-
- if (lock)
- down_write(&nvm_tgtt_lock);
-
- list_for_each_entry(tmp, &nvm_tgt_types, list)
- if (!strcmp(name, tmp->name)) {
- tt = tmp;
- break;
- }
-
- if (lock)
- up_write(&nvm_tgtt_lock);
- return tt;
-}
-EXPORT_SYMBOL(nvm_find_target_type);
-
int nvm_register_tgt_type(struct nvm_tgt_type *tt)
{
int ret = 0;
@@ -571,9 +573,9 @@ void nvm_unregister_tgt_type(struct nvm_tgt_type *tt)
if (!tt)
return;
- down_write(&nvm_lock);
+ down_write(&nvm_tgtt_lock);
list_del(&tt->list);
- up_write(&nvm_lock);
+ up_write(&nvm_tgtt_lock);
}
EXPORT_SYMBOL(nvm_unregister_tgt_type);
@@ -602,6 +604,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
return NULL;
}
+static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
+ const struct ppa_addr *ppas, int nr_ppas)
+{
+ struct nvm_dev *dev = tgt_dev->parent;
+ struct nvm_geo *geo = &tgt_dev->geo;
+ int i, plane_cnt, pl_idx;
+ struct ppa_addr ppa;
+
+ if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
+ rqd->nr_ppas = nr_ppas;
+ rqd->ppa_addr = ppas[0];
+
+ return 0;
+ }
+
+ rqd->nr_ppas = nr_ppas;
+ rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
+ if (!rqd->ppa_list) {
+ pr_err("nvm: failed to allocate dma memory\n");
+ return -ENOMEM;
+ }
+
+ plane_cnt = geo->plane_mode;
+ rqd->nr_ppas *= plane_cnt;
+
+ for (i = 0; i < nr_ppas; i++) {
+ for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
+ ppa = ppas[i];
+ ppa.g.pl = pl_idx;
+ rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa;
+ }
+ }
+
+ return 0;
+}
+
+static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev,
+ struct nvm_rq *rqd)
+{
+ if (!rqd->ppa_list)
+ return;
+
+ nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+}
+
+
int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
int nr_ppas, int type)
{
@@ -616,7 +664,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
memset(&rqd, 0, sizeof(struct nvm_rq));
- nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
+ nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
nvm_rq_tgt_to_dev(tgt_dev, &rqd);
ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
@@ -658,12 +706,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
}
EXPORT_SYMBOL(nvm_submit_io);
-static void nvm_end_io_sync(struct nvm_rq *rqd)
+int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
{
- struct completion *waiting = rqd->private;
+ struct nvm_dev *dev = tgt_dev->parent;
+ int ret;
- complete(waiting);
+ if (!dev->ops->submit_io_sync)
+ return -ENODEV;
+
+ nvm_rq_tgt_to_dev(tgt_dev, rqd);
+
+ rqd->dev = tgt_dev;
+
+ /* In case of error, fail with right address format */
+ ret = dev->ops->submit_io_sync(dev, rqd);
+ nvm_rq_dev_to_tgt(tgt_dev, rqd);
+
+ return ret;
}
+EXPORT_SYMBOL(nvm_submit_io_sync);
int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
int nr_ppas)
@@ -671,25 +732,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
struct nvm_geo *geo = &tgt_dev->geo;
struct nvm_rq rqd;
int ret;
- DECLARE_COMPLETION_ONSTACK(wait);
memset(&rqd, 0, sizeof(struct nvm_rq));
rqd.opcode = NVM_OP_ERASE;
- rqd.end_io = nvm_end_io_sync;
- rqd.private = &wait;
rqd.flags = geo->plane_mode >> 1;
- ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
+ ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
if (ret)
return ret;
- ret = nvm_submit_io(tgt_dev, &rqd);
+ ret = nvm_submit_io_sync(tgt_dev, &rqd);
if (ret) {
pr_err("rrpr: erase I/O submission failed: %d\n", ret);
goto free_ppa_list;
}
- wait_for_completion_io(&wait);
free_ppa_list:
nvm_free_rqd_ppalist(tgt_dev, &rqd);
@@ -775,57 +832,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
}
EXPORT_SYMBOL(nvm_put_area);
-int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
- const struct ppa_addr *ppas, int nr_ppas, int vblk)
-{
- struct nvm_dev *dev = tgt_dev->parent;
- struct nvm_geo *geo = &tgt_dev->geo;
- int i, plane_cnt, pl_idx;
- struct ppa_addr ppa;
-
- if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
- rqd->nr_ppas = nr_ppas;
- rqd->ppa_addr = ppas[0];
-
- return 0;
- }
-
- rqd->nr_ppas = nr_ppas;
- rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
- if (!rqd->ppa_list) {
- pr_err("nvm: failed to allocate dma memory\n");
- return -ENOMEM;
- }
-
- if (!vblk) {
- for (i = 0; i < nr_ppas; i++)
- rqd->ppa_list[i] = ppas[i];
- } else {
- plane_cnt = geo->plane_mode;
- rqd->nr_ppas *= plane_cnt;
-
- for (i = 0; i < nr_ppas; i++) {
- for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
- ppa = ppas[i];
- ppa.g.pl = pl_idx;
- rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa;
- }
- }
- }
-
- return 0;
-}
-EXPORT_SYMBOL(nvm_set_rqd_ppalist);
-
-void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
-{
- if (!rqd->ppa_list)
- return;
-
- nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
-}
-EXPORT_SYMBOL(nvm_free_rqd_ppalist);
-
void nvm_end_io(struct nvm_rq *rqd)
{
struct nvm_tgt_dev *tgt_dev = rqd->dev;
@@ -1177,7 +1183,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
info->version[1] = NVM_VERSION_MINOR;
info->version[2] = NVM_VERSION_PATCH;
- down_write(&nvm_lock);
+ down_write(&nvm_tgtt_lock);
list_for_each_entry(tt, &nvm_tgt_types, list) {
struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter];
@@ -1190,7 +1196,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
}
info->tgtsize = tgt_iter;
- up_write(&nvm_lock);
+ up_write(&nvm_tgtt_lock);
if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) {
kfree(info);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 024a8fc..0d227ef 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -43,8 +43,10 @@ retry:
if (unlikely(!bio_has_data(bio)))
goto out;
- w_ctx.flags = flags;
pblk_ppa_set_empty(&w_ctx.ppa);
+ w_ctx.flags = flags;
+ if (bio->bi_opf & REQ_PREFLUSH)
+ w_ctx.flags |= PBLK_FLUSH_ENTRY;
for (i = 0; i < nr_entries; i++) {
void *data = bio_data(bio);
@@ -73,12 +75,11 @@ out:
* On GC the incoming lbas are not necessarily sequential. Also, some of the
* lbas might not be valid entries, which are marked as empty by the GC thread
*/
-int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
- unsigned int nr_entries, unsigned int nr_rec_entries,
- struct pblk_line *gc_line, unsigned long flags)
+int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
{
struct pblk_w_ctx w_ctx;
unsigned int bpos, pos;
+ void *data = gc_rq->data;
int i, valid_entries;
/* Update the write buffer head (mem) with the entries that we can
@@ -86,28 +87,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
* rollback from here on.
*/
retry:
- if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) {
+ if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) {
io_schedule();
goto retry;
}
- w_ctx.flags = flags;
+ w_ctx.flags = PBLK_IOTYPE_GC;
pblk_ppa_set_empty(&w_ctx.ppa);
- for (i = 0, valid_entries = 0; i < nr_entries; i++) {
- if (lba_list[i] == ADDR_EMPTY)
+ for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) {
+ if (gc_rq->lba_list[i] == ADDR_EMPTY)
continue;
- w_ctx.lba = lba_list[i];
+ w_ctx.lba = gc_rq->lba_list[i];
pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
- pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos);
+ pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line,
+ gc_rq->paddr_list[i], pos);
data += PBLK_EXPOSED_PAGE_SIZE;
valid_entries++;
}
- WARN_ONCE(nr_rec_entries != valid_entries,
+ WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
"pblk: inconsistent GC write\n");
#ifdef CONFIG_NVM_DEBUG
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 8150164..ce90213 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -18,6 +18,31 @@
#include "pblk.h"
+static void pblk_line_mark_bb(struct work_struct *work)
+{
+ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+ ws);
+ struct pblk *pblk = line_ws->pblk;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct ppa_addr *ppa = line_ws->priv;
+ int ret;
+
+ ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
+ if (ret) {
+ struct pblk_line *line;
+ int pos;
+
+ line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
+ pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
+
+ pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+ line->id, pos);
+ }
+
+ kfree(ppa);
+ mempool_free(line_ws, pblk->gen_ws_pool);
+}
+
static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
struct ppa_addr *ppa)
{
@@ -33,7 +58,8 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
line->id, pos);
- pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq);
+ pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb,
+ GFP_ATOMIC, pblk->bb_wq);
}
static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
@@ -63,7 +89,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
struct pblk *pblk = rqd->private;
__pblk_end_io_erase(pblk, rqd);
- mempool_free(rqd, pblk->g_rq_pool);
+ mempool_free(rqd, pblk->e_rq_pool);
}
void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
@@ -77,11 +103,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
* that newer updates are not overwritten.
*/
spin_lock(&line->lock);
- if (line->state == PBLK_LINESTATE_GC ||
- line->state == PBLK_LINESTATE_FREE) {
- spin_unlock(&line->lock);
- return;
- }
+ WARN_ON(line->state == PBLK_LINESTATE_FREE);
if (test_and_set_bit(paddr, line->invalid_bitmap)) {
WARN_ONCE(1, "pblk: double invalidate\n");
@@ -98,8 +120,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
spin_lock(&l_mg->gc_lock);
spin_lock(&line->lock);
/* Prevent moving a line that has just been chosen for GC */
- if (line->state == PBLK_LINESTATE_GC ||
- line->state == PBLK_LINESTATE_FREE) {
+ if (line->state == PBLK_LINESTATE_GC) {
spin_unlock(&line->lock);
spin_unlock(&l_mg->gc_lock);
return;
@@ -150,17 +171,25 @@ static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
spin_unlock(&pblk->trans_lock);
}
-struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+/* Caller must guarantee that the request is a valid type */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
{
mempool_t *pool;
struct nvm_rq *rqd;
int rq_size;
- if (rw == WRITE) {
+ switch (type) {
+ case PBLK_WRITE:
+ case PBLK_WRITE_INT:
pool = pblk->w_rq_pool;
rq_size = pblk_w_rq_size;
- } else {
- pool = pblk->g_rq_pool;
+ break;
+ case PBLK_READ:
+ pool = pblk->r_rq_pool;
+ rq_size = pblk_g_rq_size;
+ break;
+ default:
+ pool = pblk->e_rq_pool;
rq_size = pblk_g_rq_size;
}
@@ -170,15 +199,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
return rqd;
}
-void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+/* Typically used on completion path. Cannot guarantee request consistency */
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
{
+ struct nvm_tgt_dev *dev = pblk->dev;
mempool_t *pool;
- if (rw == WRITE)
+ switch (type) {
+ case PBLK_WRITE:
+ kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
+ case PBLK_WRITE_INT:
pool = pblk->w_rq_pool;
- else
- pool = pblk->g_rq_pool;
+ break;
+ case PBLK_READ:
+ pool = pblk->r_rq_pool;
+ break;
+ case PBLK_ERASE:
+ pool = pblk->e_rq_pool;
+ break;
+ default:
+ pr_err("pblk: trying to free unknown rqd type\n");
+ return;
+ }
+ nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
mempool_free(rqd, pool);
}
@@ -190,10 +234,9 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
WARN_ON(off + nr_pages != bio->bi_vcnt);
- bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
for (i = off; i < nr_pages + off; i++) {
bv = bio->bi_io_vec[i];
- mempool_free(bv.bv_page, pblk->page_pool);
+ mempool_free(bv.bv_page, pblk->page_bio_pool);
}
}
@@ -205,14 +248,12 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
int i, ret;
for (i = 0; i < nr_pages; i++) {
- page = mempool_alloc(pblk->page_pool, flags);
- if (!page)
- goto err;
+ page = mempool_alloc(pblk->page_bio_pool, flags);
ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
if (ret != PBLK_EXPOSED_PAGE_SIZE) {
pr_err("pblk: could not add page to bio\n");
- mempool_free(page, pblk->page_pool);
+ mempool_free(page, pblk->page_bio_pool);
goto err;
}
}
@@ -245,13 +286,6 @@ void pblk_write_should_kick(struct pblk *pblk)
pblk_write_kick(pblk);
}
-void pblk_end_bio_sync(struct bio *bio)
-{
- struct completion *waiting = bio->bi_private;
-
- complete(waiting);
-}
-
void pblk_end_io_sync(struct nvm_rq *rqd)
{
struct completion *waiting = rqd->private;
@@ -259,7 +293,7 @@ void pblk_end_io_sync(struct nvm_rq *rqd)
complete(waiting);
}
-void pblk_wait_for_meta(struct pblk *pblk)
+static void pblk_wait_for_meta(struct pblk *pblk)
{
do {
if (!atomic_read(&pblk->inflight_io))
@@ -336,17 +370,6 @@ void pblk_discard(struct pblk *pblk, struct bio *bio)
pblk_invalidate_range(pblk, slba, nr_secs);
}
-struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
-{
- struct ppa_addr ppa;
-
- spin_lock(&pblk->trans_lock);
- ppa = pblk_trans_map_get(pblk, lba);
- spin_unlock(&pblk->trans_lock);
-
- return ppa;
-}
-
void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
{
atomic_long_inc(&pblk->write_failed);
@@ -389,39 +412,38 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
struct nvm_tgt_dev *dev = pblk->dev;
#ifdef CONFIG_NVM_DEBUG
- struct ppa_addr *ppa_list;
+ int ret;
- ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
- if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
- WARN_ON(1);
- return -EINVAL;
- }
+ ret = pblk_check_io(pblk, rqd);
+ if (ret)
+ return ret;
+#endif
- if (rqd->opcode == NVM_OP_PWRITE) {
- struct pblk_line *line;
- struct ppa_addr ppa;
- int i;
+ atomic_inc(&pblk->inflight_io);
- for (i = 0; i < rqd->nr_ppas; i++) {
- ppa = ppa_list[i];
- line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+ return nvm_submit_io(dev, rqd);
+}
- spin_lock(&line->lock);
- if (line->state != PBLK_LINESTATE_OPEN) {
- pr_err("pblk: bad ppa: line:%d,state:%d\n",
- line->id, line->state);
- WARN_ON(1);
- spin_unlock(&line->lock);
- return -EINVAL;
- }
- spin_unlock(&line->lock);
- }
- }
+int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+#ifdef CONFIG_NVM_DEBUG
+ int ret;
+
+ ret = pblk_check_io(pblk, rqd);
+ if (ret)
+ return ret;
#endif
atomic_inc(&pblk->inflight_io);
- return nvm_submit_io(dev, rqd);
+ return nvm_submit_io_sync(dev, rqd);
+}
+
+static void pblk_bio_map_addr_endio(struct bio *bio)
+{
+ bio_put(bio);
}
struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
@@ -460,6 +482,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
kaddr += PAGE_SIZE;
}
+
+ bio->bi_end_io = pblk_bio_map_addr_endio;
out:
return bio;
}
@@ -486,12 +510,14 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
u64 addr;
int i;
+ spin_lock(&line->lock);
addr = find_next_zero_bit(line->map_bitmap,
pblk->lm.sec_per_line, line->cur_sec);
line->cur_sec = addr - nr_secs;
for (i = 0; i < nr_secs; i++, line->cur_sec--)
WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap));
+ spin_unlock(&line->lock);
}
u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
@@ -565,12 +591,11 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
int cmd_op, bio_op;
int i, j;
int ret;
- DECLARE_COMPLETION_ONSTACK(wait);
- if (dir == WRITE) {
+ if (dir == PBLK_WRITE) {
bio_op = REQ_OP_WRITE;
cmd_op = NVM_OP_PWRITE;
- } else if (dir == READ) {
+ } else if (dir == PBLK_READ) {
bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD;
} else
@@ -607,13 +632,11 @@ next_rq:
rqd.dma_ppa_list = dma_ppa_list;
rqd.opcode = cmd_op;
rqd.nr_ppas = rq_ppas;
- rqd.end_io = pblk_end_io_sync;
- rqd.private = &wait;
- if (dir == WRITE) {
+ if (dir == PBLK_WRITE) {
struct pblk_sec_meta *meta_list = rqd.meta_list;
- rqd.flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
for (i = 0; i < rqd.nr_ppas; ) {
spin_lock(&line->lock);
paddr = __pblk_alloc_page(pblk, line, min);
@@ -662,25 +685,17 @@ next_rq:
}
}
- ret = pblk_submit_io(pblk, &rqd);
+ ret = pblk_submit_io_sync(pblk, &rqd);
if (ret) {
pr_err("pblk: emeta I/O submission failed: %d\n", ret);
bio_put(bio);
goto free_rqd_dma;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: emeta I/O timed out\n");
- }
atomic_dec(&pblk->inflight_io);
- reinit_completion(&wait);
-
- if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META))
- bio_put(bio);
if (rqd.error) {
- if (dir == WRITE)
+ if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, &rqd);
else
pblk_log_read_err(pblk, &rqd);
@@ -721,14 +736,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
int i, ret;
int cmd_op, bio_op;
int flags;
- DECLARE_COMPLETION_ONSTACK(wait);
- if (dir == WRITE) {
+ if (dir == PBLK_WRITE) {
bio_op = REQ_OP_WRITE;
cmd_op = NVM_OP_PWRITE;
- flags = pblk_set_progr_mode(pblk, WRITE);
+ flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
- } else if (dir == READ) {
+ } else if (dir == PBLK_READ) {
bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD;
flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -758,15 +772,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
rqd.opcode = cmd_op;
rqd.flags = flags;
rqd.nr_ppas = lm->smeta_sec;
- rqd.end_io = pblk_end_io_sync;
- rqd.private = &wait;
for (i = 0; i < lm->smeta_sec; i++, paddr++) {
struct pblk_sec_meta *meta_list = rqd.meta_list;
rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
- if (dir == WRITE) {
+ if (dir == PBLK_WRITE) {
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
meta_list[i].lba = lba_list[paddr] = addr_empty;
@@ -778,21 +790,17 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
* the write thread is the only one sending write and erase commands,
* there is no need to take the LUN semaphore.
*/
- ret = pblk_submit_io(pblk, &rqd);
+ ret = pblk_submit_io_sync(pblk, &rqd);
if (ret) {
pr_err("pblk: smeta I/O submission failed: %d\n", ret);
bio_put(bio);
goto free_ppa_list;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: smeta I/O timed out\n");
- }
atomic_dec(&pblk->inflight_io);
if (rqd.error) {
- if (dir == WRITE)
+ if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, &rqd);
else
pblk_log_read_err(pblk, &rqd);
@@ -808,14 +816,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
{
u64 bpaddr = pblk_line_smeta_start(pblk, line);
- return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ);
+ return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ);
}
int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
void *emeta_buf)
{
return pblk_line_submit_emeta_io(pblk, line, emeta_buf,
- line->emeta_ssec, READ);
+ line->emeta_ssec, PBLK_READ);
}
static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
@@ -824,7 +832,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
rqd->opcode = NVM_OP_ERASE;
rqd->ppa_addr = ppa;
rqd->nr_ppas = 1;
- rqd->flags = pblk_set_progr_mode(pblk, ERASE);
+ rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE);
rqd->bio = NULL;
}
@@ -832,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
{
struct nvm_rq rqd;
int ret = 0;
- DECLARE_COMPLETION_ONSTACK(wait);
memset(&rqd, 0, sizeof(struct nvm_rq));
pblk_setup_e_rq(pblk, &rqd, ppa);
- rqd.end_io = pblk_end_io_sync;
- rqd.private = &wait;
-
/* The write thread schedules erases so that it minimizes disturbances
* with writes. Thus, there is no need to take the LUN semaphore.
*/
- ret = pblk_submit_io(pblk, &rqd);
+ ret = pblk_submit_io_sync(pblk, &rqd);
if (ret) {
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
@@ -857,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
goto out;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: sync erase timed out\n");
- }
-
out:
rqd.private = pblk;
__pblk_end_io_erase(pblk, &rqd);
@@ -976,7 +975,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
smeta_buf->header.id = cpu_to_le32(line->id);
smeta_buf->header.type = cpu_to_le16(line->type);
- smeta_buf->header.version = cpu_to_le16(1);
+ smeta_buf->header.version = SMETA_VERSION;
/* Start metadata */
smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
@@ -1046,7 +1045,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
line->smeta_ssec = off;
line->cur_sec = off + lm->smeta_sec;
- if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) {
+ if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
pr_debug("pblk: line smeta I/O failed. Retry\n");
return 1;
}
@@ -1056,7 +1055,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
/* Mark emeta metadata sectors as bad sectors. We need to consider bad
* blocks to make sure that there are enough sectors to store emeta
*/
- bit = lm->sec_per_line;
off = lm->sec_per_line - lm->emeta_sec[0];
bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]);
while (nr_bb) {
@@ -1093,25 +1091,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
struct pblk_line_meta *lm = &pblk->lm;
int blk_in_line = atomic_read(&line->blk_in_line);
- line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+ line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
if (!line->map_bitmap)
return -ENOMEM;
- memset(line->map_bitmap, 0, lm->sec_bitmap_len);
- /* invalid_bitmap is special since it is used when line is closed. No
- * need to zeroized; it will be initialized using bb info form
- * map_bitmap
- */
- line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+ /* will be initialized using bb info from map_bitmap */
+ line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC);
if (!line->invalid_bitmap) {
- mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ kfree(line->map_bitmap);
return -ENOMEM;
}
spin_lock(&line->lock);
if (line->state != PBLK_LINESTATE_FREE) {
- mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
- mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ kfree(line->map_bitmap);
+ kfree(line->invalid_bitmap);
spin_unlock(&line->lock);
WARN(1, "pblk: corrupted line %d, state %d\n",
line->id, line->state);
@@ -1163,7 +1157,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
{
- mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ kfree(line->map_bitmap);
line->map_bitmap = NULL;
line->smeta = NULL;
line->emeta = NULL;
@@ -1328,6 +1322,41 @@ static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line)
pblk->state = PBLK_STATE_STOPPING;
}
+static void pblk_line_close_meta_sync(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line *line, *tline;
+ LIST_HEAD(list);
+
+ spin_lock(&l_mg->close_lock);
+ if (list_empty(&l_mg->emeta_list)) {
+ spin_unlock(&l_mg->close_lock);
+ return;
+ }
+
+ list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev);
+ spin_unlock(&l_mg->close_lock);
+
+ list_for_each_entry_safe(line, tline, &list, list) {
+ struct pblk_emeta *emeta = line->emeta;
+
+ while (emeta->mem < lm->emeta_len[0]) {
+ int ret;
+
+ ret = pblk_submit_meta_io(pblk, line);
+ if (ret) {
+ pr_err("pblk: sync meta line %d failed (%d)\n",
+ line->id, ret);
+ return;
+ }
+ }
+ }
+
+ pblk_wait_for_meta(pblk);
+ flush_workqueue(pblk->close_wq);
+}
+
void pblk_pipeline_stop(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1361,17 +1390,17 @@ void pblk_pipeline_stop(struct pblk *pblk)
spin_unlock(&l_mg->free_lock);
}
-void pblk_line_replace_data(struct pblk *pblk)
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
- struct pblk_line *cur, *new;
+ struct pblk_line *cur, *new = NULL;
unsigned int left_seblks;
int is_next = 0;
cur = l_mg->data_line;
new = l_mg->data_next;
if (!new)
- return;
+ goto out;
l_mg->data_line = new;
spin_lock(&l_mg->free_lock);
@@ -1379,7 +1408,7 @@ void pblk_line_replace_data(struct pblk *pblk)
l_mg->data_line = NULL;
l_mg->data_next = NULL;
spin_unlock(&l_mg->free_lock);
- return;
+ goto out;
}
pblk_line_setup_metadata(new, l_mg, &pblk->lm);
@@ -1391,7 +1420,7 @@ retry_erase:
/* If line is not fully erased, erase it */
if (atomic_read(&new->left_eblks)) {
if (pblk_line_erase(pblk, new))
- return;
+ goto out;
} else {
io_schedule();
}
@@ -1402,7 +1431,7 @@ retry_setup:
if (!pblk_line_init_metadata(pblk, new, cur)) {
new = pblk_line_retry(pblk, new);
if (!new)
- return;
+ goto out;
goto retry_setup;
}
@@ -1410,7 +1439,7 @@ retry_setup:
if (!pblk_line_init_bb(pblk, new, 1)) {
new = pblk_line_retry(pblk, new);
if (!new)
- return;
+ goto out;
goto retry_setup;
}
@@ -1434,14 +1463,15 @@ retry_setup:
if (is_next)
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+out:
+ return new;
}
void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
{
- if (line->map_bitmap)
- mempool_free(line->map_bitmap, pblk->line_meta_pool);
- if (line->invalid_bitmap)
- mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+ kfree(line->map_bitmap);
+ kfree(line->invalid_bitmap);
*line->vsc = cpu_to_le32(EMPTY_ENTRY);
@@ -1451,11 +1481,10 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
line->emeta = NULL;
}
-void pblk_line_put(struct kref *ref)
+static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
{
- struct pblk_line *line = container_of(ref, struct pblk_line, ref);
- struct pblk *pblk = line->pblk;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_gc *gc = &pblk->gc;
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_GC);
@@ -1464,6 +1493,8 @@ void pblk_line_put(struct kref *ref)
pblk_line_free(pblk, line);
spin_unlock(&line->lock);
+ atomic_dec(&gc->pipeline_gc);
+
spin_lock(&l_mg->free_lock);
list_add_tail(&line->list, &l_mg->free_list);
l_mg->nr_free_lines++;
@@ -1472,13 +1503,49 @@ void pblk_line_put(struct kref *ref)
pblk_rl_free_lines_inc(&pblk->rl, line);
}
+static void pblk_line_put_ws(struct work_struct *work)
+{
+ struct pblk_line_ws *line_put_ws = container_of(work,
+ struct pblk_line_ws, ws);
+ struct pblk *pblk = line_put_ws->pblk;
+ struct pblk_line *line = line_put_ws->line;
+
+ __pblk_line_put(pblk, line);
+ mempool_free(line_put_ws, pblk->gen_ws_pool);
+}
+
+void pblk_line_put(struct kref *ref)
+{
+ struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+ struct pblk *pblk = line->pblk;
+
+ __pblk_line_put(pblk, line);
+}
+
+void pblk_line_put_wq(struct kref *ref)
+{
+ struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+ struct pblk *pblk = line->pblk;
+ struct pblk_line_ws *line_put_ws;
+
+ line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC);
+ if (!line_put_ws)
+ return;
+
+ line_put_ws->pblk = pblk;
+ line_put_ws->line = line;
+ line_put_ws->priv = NULL;
+
+ INIT_WORK(&line_put_ws->ws, pblk_line_put_ws);
+ queue_work(pblk->r_end_wq, &line_put_ws->ws);
+}
+
int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
{
struct nvm_rq *rqd;
int err;
- rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL);
- memset(rqd, 0, pblk_g_rq_size);
+ rqd = pblk_alloc_rqd(pblk, PBLK_ERASE);
pblk_setup_e_rq(pblk, rqd, ppa);
@@ -1517,41 +1584,6 @@ int pblk_line_is_full(struct pblk_line *line)
return (line->left_msecs == 0);
}
-void pblk_line_close_meta_sync(struct pblk *pblk)
-{
- struct pblk_line_mgmt *l_mg = &pblk->l_mg;
- struct pblk_line_meta *lm = &pblk->lm;
- struct pblk_line *line, *tline;
- LIST_HEAD(list);
-
- spin_lock(&l_mg->close_lock);
- if (list_empty(&l_mg->emeta_list)) {
- spin_unlock(&l_mg->close_lock);
- return;
- }
-
- list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev);
- spin_unlock(&l_mg->close_lock);
-
- list_for_each_entry_safe(line, tline, &list, list) {
- struct pblk_emeta *emeta = line->emeta;
-
- while (emeta->mem < lm->emeta_len[0]) {
- int ret;
-
- ret = pblk_submit_meta_io(pblk, line);
- if (ret) {
- pr_err("pblk: sync meta line %d failed (%d)\n",
- line->id, ret);
- return;
- }
- }
- }
-
- pblk_wait_for_meta(pblk);
- flush_workqueue(pblk->close_wq);
-}
-
static void pblk_line_should_sync_meta(struct pblk *pblk)
{
if (pblk_rl_is_limit(&pblk->rl))
@@ -1582,15 +1614,13 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
list_add_tail(&line->list, move_list);
- mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ kfree(line->map_bitmap);
line->map_bitmap = NULL;
line->smeta = NULL;
line->emeta = NULL;
spin_unlock(&line->lock);
spin_unlock(&l_mg->gc_lock);
-
- pblk_gc_should_kick(pblk);
}
void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
@@ -1624,43 +1654,16 @@ void pblk_line_close_ws(struct work_struct *work)
struct pblk_line *line = line_ws->line;
pblk_line_close(pblk, line);
- mempool_free(line_ws, pblk->line_ws_pool);
-}
-
-void pblk_line_mark_bb(struct work_struct *work)
-{
- struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
- ws);
- struct pblk *pblk = line_ws->pblk;
- struct nvm_tgt_dev *dev = pblk->dev;
- struct ppa_addr *ppa = line_ws->priv;
- int ret;
-
- ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
- if (ret) {
- struct pblk_line *line;
- int pos;
-
- line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
- pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
-
- pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
- line->id, pos);
- }
-
- kfree(ppa);
- mempool_free(line_ws, pblk->line_ws_pool);
+ mempool_free(line_ws, pblk->gen_ws_pool);
}
-void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
- void (*work)(struct work_struct *),
+void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+ void (*work)(struct work_struct *), gfp_t gfp_mask,
struct workqueue_struct *wq)
{
struct pblk_line_ws *line_ws;
- line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC);
- if (!line_ws)
- return;
+ line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask);
line_ws->pblk = pblk;
line_ws->line = line;
@@ -1689,16 +1692,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
#endif
ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
- if (ret) {
- switch (ret) {
- case -ETIME:
- pr_err("pblk: lun semaphore timed out\n");
- break;
- case -EINTR:
- pr_err("pblk: lun semaphore timed out\n");
- break;
- }
- }
+ if (ret == -ETIME || ret == -EINTR)
+ pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret);
}
void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
@@ -1758,13 +1753,11 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
rlun = &pblk->luns[bit];
up(&rlun->wr_sem);
}
-
- kfree(lun_bitmap);
}
void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
{
- struct ppa_addr l2p_ppa;
+ struct ppa_addr ppa_l2p;
/* logic error: lba out-of-bounds. Ignore update */
if (!(lba < pblk->rl.nr_secs)) {
@@ -1773,10 +1766,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
}
spin_lock(&pblk->trans_lock);
- l2p_ppa = pblk_trans_map_get(pblk, lba);
+ ppa_l2p = pblk_trans_map_get(pblk, lba);
- if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa))
- pblk_map_invalidate(pblk, l2p_ppa);
+ if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p))
+ pblk_map_invalidate(pblk, ppa_l2p);
pblk_trans_map_set(pblk, lba, ppa);
spin_unlock(&pblk->trans_lock);
@@ -1784,6 +1777,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
{
+
#ifdef CONFIG_NVM_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(!pblk_addr_in_cache(ppa));
@@ -1793,16 +1787,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
pblk_update_map(pblk, lba, ppa);
}
-int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
- struct pblk_line *gc_line)
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
+ struct pblk_line *gc_line, u64 paddr_gc)
{
- struct ppa_addr l2p_ppa;
+ struct ppa_addr ppa_l2p, ppa_gc;
int ret = 1;
#ifdef CONFIG_NVM_DEBUG
/* Callers must ensure that the ppa points to a cache address */
- BUG_ON(!pblk_addr_in_cache(ppa));
- BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+ BUG_ON(!pblk_addr_in_cache(ppa_new));
+ BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
#endif
/* logic error: lba out-of-bounds. Ignore update */
@@ -1812,36 +1806,41 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
}
spin_lock(&pblk->trans_lock);
- l2p_ppa = pblk_trans_map_get(pblk, lba);
+ ppa_l2p = pblk_trans_map_get(pblk, lba);
+ ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id);
+
+ if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) {
+ spin_lock(&gc_line->lock);
+ WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap),
+ "pblk: corrupted GC update");
+ spin_unlock(&gc_line->lock);
- /* Prevent updated entries to be overwritten by GC */
- if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) ||
- pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) {
ret = 0;
goto out;
}
- pblk_trans_map_set(pblk, lba, ppa);
+ pblk_trans_map_set(pblk, lba, ppa_new);
out:
spin_unlock(&pblk->trans_lock);
return ret;
}
-void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
- struct ppa_addr entry_line)
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache)
{
- struct ppa_addr l2p_line;
+ struct ppa_addr ppa_l2p;
#ifdef CONFIG_NVM_DEBUG
/* Callers must ensure that the ppa points to a device address */
- BUG_ON(pblk_addr_in_cache(ppa));
+ BUG_ON(pblk_addr_in_cache(ppa_mapped));
#endif
/* Invalidate and discard padded entries */
if (lba == ADDR_EMPTY) {
#ifdef CONFIG_NVM_DEBUG
atomic_long_inc(&pblk->padded_wb);
#endif
- pblk_map_invalidate(pblk, ppa);
+ if (!pblk_ppa_empty(ppa_mapped))
+ pblk_map_invalidate(pblk, ppa_mapped);
return;
}
@@ -1852,22 +1851,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
}
spin_lock(&pblk->trans_lock);
- l2p_line = pblk_trans_map_get(pblk, lba);
+ ppa_l2p = pblk_trans_map_get(pblk, lba);
/* Do not update L2P if the cacheline has been updated. In this case,
* the mapped ppa must be invalidated
*/
- if (l2p_line.ppa != entry_line.ppa) {
- if (!pblk_ppa_empty(ppa))
- pblk_map_invalidate(pblk, ppa);
+ if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) {
+ if (!pblk_ppa_empty(ppa_mapped))
+ pblk_map_invalidate(pblk, ppa_mapped);
goto out;
}
#ifdef CONFIG_NVM_DEBUG
- WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line));
+ WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
#endif
- pblk_trans_map_set(pblk, lba, ppa);
+ pblk_trans_map_set(pblk, lba, ppa_mapped);
out:
spin_unlock(&pblk->trans_lock);
}
@@ -1878,23 +1877,32 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
int i;
spin_lock(&pblk->trans_lock);
- for (i = 0; i < nr_secs; i++)
- ppas[i] = pblk_trans_map_get(pblk, blba + i);
+ for (i = 0; i < nr_secs; i++) {
+ struct ppa_addr ppa;
+
+ ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i);
+
+ /* If the L2P entry maps to a line, the reference is valid */
+ if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
+ int line_id = pblk_dev_ppa_to_line(ppa);
+ struct pblk_line *line = &pblk->lines[line_id];
+
+ kref_get(&line->ref);
+ }
+ }
spin_unlock(&pblk->trans_lock);
}
void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
u64 *lba_list, int nr_secs)
{
- sector_t lba;
+ u64 lba;
int i;
spin_lock(&pblk->trans_lock);
for (i = 0; i < nr_secs; i++) {
lba = lba_list[i];
- if (lba == ADDR_EMPTY) {
- ppas[i].ppa = ADDR_EMPTY;
- } else {
+ if (lba != ADDR_EMPTY) {
/* logic error: lba out-of-bounds. Ignore update */
if (!(lba < pblk->rl.nr_secs)) {
WARN(1, "pblk: corrupted L2P map request\n");
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 6090d28..00d5698 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -20,7 +20,8 @@
static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
{
- vfree(gc_rq->data);
+ if (gc_rq->data)
+ vfree(gc_rq->data);
kfree(gc_rq);
}
@@ -41,10 +42,7 @@ static int pblk_gc_write(struct pblk *pblk)
spin_unlock(&gc->w_lock);
list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
- pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list,
- gc_rq->nr_secs, gc_rq->secs_to_gc,
- gc_rq->line, PBLK_IOTYPE_GC);
-
+ pblk_write_gc_to_cache(pblk, gc_rq);
list_del(&gc_rq->list);
kref_put(&gc_rq->line->ref, pblk_line_put);
pblk_gc_free_gc_rq(gc_rq);
@@ -58,42 +56,59 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc)
wake_up_process(gc->gc_writer_ts);
}
-/*
- * Responsible for managing all memory related to a gc request. Also in case of
- * failure
- */
-static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
+static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list;
+
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_GC);
+ line->state = PBLK_LINESTATE_CLOSED;
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ if (move_list) {
+ spin_lock(&l_mg->gc_lock);
+ list_add_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+ }
+}
+
+static void pblk_gc_line_ws(struct work_struct *work)
{
+ struct pblk_line_ws *gc_rq_ws = container_of(work,
+ struct pblk_line_ws, ws);
+ struct pblk *pblk = gc_rq_ws->pblk;
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_gc *gc = &pblk->gc;
- struct pblk_line *line = gc_rq->line;
- void *data;
- unsigned int secs_to_gc;
- int ret = 0;
+ struct pblk_line *line = gc_rq_ws->line;
+ struct pblk_gc_rq *gc_rq = gc_rq_ws->priv;
+ int ret;
- data = vmalloc(gc_rq->nr_secs * geo->sec_size);
- if (!data) {
- ret = -ENOMEM;
+ up(&gc->gc_sem);
+
+ gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size);
+ if (!gc_rq->data) {
+ pr_err("pblk: could not GC line:%d (%d/%d)\n",
+ line->id, *line->vsc, gc_rq->nr_secs);
goto out;
}
/* Read from GC victim block */
- if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs,
- &secs_to_gc, line)) {
- ret = -EFAULT;
- goto free_data;
+ ret = pblk_submit_read_gc(pblk, gc_rq);
+ if (ret) {
+ pr_err("pblk: failed GC read in line:%d (err:%d)\n",
+ line->id, ret);
+ goto out;
}
- if (!secs_to_gc)
- goto free_rq;
-
- gc_rq->data = data;
- gc_rq->secs_to_gc = secs_to_gc;
+ if (!gc_rq->secs_to_gc)
+ goto out;
retry:
spin_lock(&gc->w_lock);
- if (gc->w_entries >= PBLK_GC_W_QD) {
+ if (gc->w_entries >= PBLK_GC_RQ_QD) {
spin_unlock(&gc->w_lock);
pblk_gc_writer_kick(&pblk->gc);
usleep_range(128, 256);
@@ -105,53 +120,13 @@ retry:
pblk_gc_writer_kick(&pblk->gc);
- return 0;
+ kfree(gc_rq_ws);
+ return;
-free_rq:
- kfree(gc_rq);
-free_data:
- vfree(data);
out:
+ pblk_gc_free_gc_rq(gc_rq);
kref_put(&line->ref, pblk_line_put);
- return ret;
-}
-
-static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
-{
- struct pblk_line_mgmt *l_mg = &pblk->l_mg;
- struct list_head *move_list;
-
- spin_lock(&line->lock);
- WARN_ON(line->state != PBLK_LINESTATE_GC);
- line->state = PBLK_LINESTATE_CLOSED;
- move_list = pblk_line_gc_list(pblk, line);
- spin_unlock(&line->lock);
-
- if (move_list) {
- spin_lock(&l_mg->gc_lock);
- list_add_tail(&line->list, move_list);
- spin_unlock(&l_mg->gc_lock);
- }
-}
-
-static void pblk_gc_line_ws(struct work_struct *work)
-{
- struct pblk_line_ws *line_rq_ws = container_of(work,
- struct pblk_line_ws, ws);
- struct pblk *pblk = line_rq_ws->pblk;
- struct pblk_gc *gc = &pblk->gc;
- struct pblk_line *line = line_rq_ws->line;
- struct pblk_gc_rq *gc_rq = line_rq_ws->priv;
-
- up(&gc->gc_sem);
-
- if (pblk_gc_move_valid_secs(pblk, gc_rq)) {
- pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n",
- line->id, *line->vsc,
- gc_rq->nr_secs);
- }
-
- mempool_free(line_rq_ws, pblk->line_ws_pool);
+ kfree(gc_rq_ws);
}
static void pblk_gc_line_prepare_ws(struct work_struct *work)
@@ -164,17 +139,24 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_gc *gc = &pblk->gc;
struct line_emeta *emeta_buf;
- struct pblk_line_ws *line_rq_ws;
+ struct pblk_line_ws *gc_rq_ws;
struct pblk_gc_rq *gc_rq;
__le64 *lba_list;
+ unsigned long *invalid_bitmap;
int sec_left, nr_secs, bit;
int ret;
+ invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
+ if (!invalid_bitmap) {
+ pr_err("pblk: could not allocate GC invalid bitmap\n");
+ goto fail_free_ws;
+ }
+
emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
GFP_KERNEL);
if (!emeta_buf) {
pr_err("pblk: cannot use GC emeta\n");
- return;
+ goto fail_free_bitmap;
}
ret = pblk_line_read_emeta(pblk, line, emeta_buf);
@@ -193,7 +175,11 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
goto fail_free_emeta;
}
+ spin_lock(&line->lock);
+ bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line);
sec_left = pblk_line_vsc(line);
+ spin_unlock(&line->lock);
+
if (sec_left < 0) {
pr_err("pblk: corrupted GC line (%d)\n", line->id);
goto fail_free_emeta;
@@ -207,11 +193,12 @@ next_rq:
nr_secs = 0;
do {
- bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
+ bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line,
bit + 1);
if (bit > line->emeta_ssec)
break;
+ gc_rq->paddr_list[nr_secs] = bit;
gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]);
} while (nr_secs < pblk->max_write_pgs);
@@ -223,19 +210,25 @@ next_rq:
gc_rq->nr_secs = nr_secs;
gc_rq->line = line;
- line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
- if (!line_rq_ws)
+ gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
+ if (!gc_rq_ws)
goto fail_free_gc_rq;
- line_rq_ws->pblk = pblk;
- line_rq_ws->line = line;
- line_rq_ws->priv = gc_rq;
+ gc_rq_ws->pblk = pblk;
+ gc_rq_ws->line = line;
+ gc_rq_ws->priv = gc_rq;
+
+ /* The write GC path can be much slower than the read GC one due to
+ * the budget imposed by the rate-limiter. Balance in case that we get
+ * back pressure from the write GC path.
+ */
+ while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000)))
+ io_schedule();
- down(&gc->gc_sem);
kref_get(&line->ref);
- INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws);
- queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws);
+ INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws);
+ queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws);
sec_left -= nr_secs;
if (sec_left > 0)
@@ -243,10 +236,11 @@ next_rq:
out:
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
- mempool_free(line_ws, pblk->line_ws_pool);
+ kfree(line_ws);
+ kfree(invalid_bitmap);
kref_put(&line->ref, pblk_line_put);
- atomic_dec(&gc->inflight_gc);
+ atomic_dec(&gc->read_inflight_gc);
return;
@@ -254,10 +248,14 @@ fail_free_gc_rq:
kfree(gc_rq);
fail_free_emeta:
pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+fail_free_bitmap:
+ kfree(invalid_bitmap);
+fail_free_ws:
+ kfree(line_ws);
+
pblk_put_line_back(pblk, line);
kref_put(&line->ref, pblk_line_put);
- mempool_free(line_ws, pblk->line_ws_pool);
- atomic_dec(&gc->inflight_gc);
+ atomic_dec(&gc->read_inflight_gc);
pr_err("pblk: Failed to GC line %d\n", line->id);
}
@@ -269,19 +267,40 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
- line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
+ line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
if (!line_ws)
return -ENOMEM;
line_ws->pblk = pblk;
line_ws->line = line;
+ atomic_inc(&gc->pipeline_gc);
INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws);
queue_work(gc->gc_reader_wq, &line_ws->ws);
return 0;
}
+static void pblk_gc_reader_kick(struct pblk_gc *gc)
+{
+ wake_up_process(gc->gc_reader_ts);
+}
+
+static void pblk_gc_kick(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ pblk_gc_writer_kick(gc);
+ pblk_gc_reader_kick(gc);
+
+ /* If we're shutting down GC, let's not start it up again */
+ if (gc->gc_enabled) {
+ wake_up_process(gc->gc_ts);
+ mod_timer(&gc->gc_timer,
+ jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+ }
+}
+
static int pblk_gc_read(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
@@ -305,11 +324,6 @@ static int pblk_gc_read(struct pblk *pblk)
return 0;
}
-static void pblk_gc_reader_kick(struct pblk_gc *gc)
-{
- wake_up_process(gc->gc_reader_ts);
-}
-
static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
struct list_head *group_list)
{
@@ -338,26 +352,17 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free));
}
-/*
- * Lines with no valid sectors will be returned to the free list immediately. If
- * GC is activated - either because the free block count is under the determined
- * threshold, or because it is being forced from user space - only lines with a
- * high count of invalid sectors will be recycled.
- */
-static void pblk_gc_run(struct pblk *pblk)
+void pblk_gc_free_full_lines(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line;
- struct list_head *group_list;
- bool run_gc;
- int inflight_gc, gc_group = 0, prev_group = 0;
do {
spin_lock(&l_mg->gc_lock);
if (list_empty(&l_mg->gc_full_list)) {
spin_unlock(&l_mg->gc_lock);
- break;
+ return;
}
line = list_first_entry(&l_mg->gc_full_list,
@@ -371,11 +376,30 @@ static void pblk_gc_run(struct pblk *pblk)
list_del(&line->list);
spin_unlock(&l_mg->gc_lock);
+ atomic_inc(&gc->pipeline_gc);
kref_put(&line->ref, pblk_line_put);
} while (1);
+}
+
+/*
+ * Lines with no valid sectors will be returned to the free list immediately. If
+ * GC is activated - either because the free block count is under the determined
+ * threshold, or because it is being forced from user space - only lines with a
+ * high count of invalid sectors will be recycled.
+ */
+static void pblk_gc_run(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_gc *gc = &pblk->gc;
+ struct pblk_line *line;
+ struct list_head *group_list;
+ bool run_gc;
+ int read_inflight_gc, gc_group = 0, prev_group = 0;
+
+ pblk_gc_free_full_lines(pblk);
run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl);
- if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD))
+ if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD))
return;
next_gc_group:
@@ -402,14 +426,14 @@ next_gc_group:
list_add_tail(&line->list, &gc->r_list);
spin_unlock(&gc->r_lock);
- inflight_gc = atomic_inc_return(&gc->inflight_gc);
+ read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc);
pblk_gc_reader_kick(gc);
prev_group = 1;
/* No need to queue up more GC lines than we can handle */
run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl);
- if (!run_gc || inflight_gc >= PBLK_GC_L_QD)
+ if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD)
break;
} while (1);
@@ -418,16 +442,6 @@ next_gc_group:
goto next_gc_group;
}
-void pblk_gc_kick(struct pblk *pblk)
-{
- struct pblk_gc *gc = &pblk->gc;
-
- wake_up_process(gc->gc_ts);
- pblk_gc_writer_kick(gc);
- pblk_gc_reader_kick(gc);
- mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
-}
-
static void pblk_gc_timer(unsigned long data)
{
struct pblk *pblk = (struct pblk *)data;
@@ -465,6 +479,7 @@ static int pblk_gc_writer_ts(void *data)
static int pblk_gc_reader_ts(void *data)
{
struct pblk *pblk = data;
+ struct pblk_gc *gc = &pblk->gc;
while (!kthread_should_stop()) {
if (!pblk_gc_read(pblk))
@@ -473,6 +488,18 @@ static int pblk_gc_reader_ts(void *data)
io_schedule();
}
+#ifdef CONFIG_NVM_DEBUG
+ pr_info("pblk: flushing gc pipeline, %d lines left\n",
+ atomic_read(&gc->pipeline_gc));
+#endif
+
+ do {
+ if (!atomic_read(&gc->pipeline_gc))
+ break;
+
+ schedule();
+ } while (1);
+
return 0;
}
@@ -486,10 +513,10 @@ void pblk_gc_should_start(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
- if (gc->gc_enabled && !gc->gc_active)
+ if (gc->gc_enabled && !gc->gc_active) {
pblk_gc_start(pblk);
-
- pblk_gc_kick(pblk);
+ pblk_gc_kick(pblk);
+ }
}
/*
@@ -510,6 +537,11 @@ void pblk_gc_should_stop(struct pblk *pblk)
pblk_gc_stop(pblk, 0);
}
+void pblk_gc_should_kick(struct pblk *pblk)
+{
+ pblk_rl_update_rates(&pblk->rl);
+}
+
void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
int *gc_active)
{
@@ -576,7 +608,8 @@ int pblk_gc_init(struct pblk *pblk)
gc->gc_forced = 0;
gc->gc_enabled = 1;
gc->w_entries = 0;
- atomic_set(&gc->inflight_gc, 0);
+ atomic_set(&gc->read_inflight_gc, 0);
+ atomic_set(&gc->pipeline_gc, 0);
/* Workqueue that reads valid sectors from a line and submit them to the
* GC writer to be recycled.
@@ -602,7 +635,7 @@ int pblk_gc_init(struct pblk *pblk)
spin_lock_init(&gc->w_lock);
spin_lock_init(&gc->r_lock);
- sema_init(&gc->gc_sem, 128);
+ sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);
INIT_LIST_HEAD(&gc->w_list);
INIT_LIST_HEAD(&gc->r_list);
@@ -625,24 +658,24 @@ void pblk_gc_exit(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
- flush_workqueue(gc->gc_reader_wq);
- flush_workqueue(gc->gc_line_reader_wq);
-
- del_timer(&gc->gc_timer);
+ gc->gc_enabled = 0;
+ del_timer_sync(&gc->gc_timer);
pblk_gc_stop(pblk, 1);
if (gc->gc_ts)
kthread_stop(gc->gc_ts);
+ if (gc->gc_reader_ts)
+ kthread_stop(gc->gc_reader_ts);
+
+ flush_workqueue(gc->gc_reader_wq);
if (gc->gc_reader_wq)
destroy_workqueue(gc->gc_reader_wq);
+ flush_workqueue(gc->gc_line_reader_wq);
if (gc->gc_line_reader_wq)
destroy_workqueue(gc->gc_line_reader_wq);
if (gc->gc_writer_ts)
kthread_stop(gc->gc_writer_ts);
-
- if (gc->gc_reader_ts)
- kthread_stop(gc->gc_reader_ts);
}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 1b0f612..f62112b 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -20,8 +20,8 @@
#include "pblk.h"
-static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
- *pblk_w_rq_cache, *pblk_line_meta_cache;
+static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
+ *pblk_w_rq_cache;
static DECLARE_RWSEM(pblk_lock);
struct bio_set *pblk_bio_set;
@@ -46,7 +46,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
* user I/Os. Unless stalled, the rate limiter leaves at least 256KB
* available for user I/O.
*/
- if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl)))
+ if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
blk_queue_split(q, &bio);
return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
@@ -76,6 +76,28 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}
+static size_t pblk_trans_map_size(struct pblk *pblk)
+{
+ int entry_size = 8;
+
+ if (pblk->ppaf_bitsize < 32)
+ entry_size = 4;
+
+ return entry_size * pblk->rl.nr_secs;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+static u32 pblk_l2p_crc(struct pblk *pblk)
+{
+ size_t map_size;
+ u32 crc = ~(u32)0;
+
+ map_size = pblk_trans_map_size(pblk);
+ crc = crc32_le(crc, pblk->trans_map, map_size);
+ return crc;
+}
+#endif
+
static void pblk_l2p_free(struct pblk *pblk)
{
vfree(pblk->trans_map);
@@ -85,12 +107,10 @@ static int pblk_l2p_init(struct pblk *pblk)
{
sector_t i;
struct ppa_addr ppa;
- int entry_size = 8;
+ size_t map_size;
- if (pblk->ppaf_bitsize < 32)
- entry_size = 4;
-
- pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
+ map_size = pblk_trans_map_size(pblk);
+ pblk->trans_map = vmalloc(map_size);
if (!pblk->trans_map)
return -ENOMEM;
@@ -132,7 +152,6 @@ static int pblk_rwb_init(struct pblk *pblk)
}
/* Minimum pages needed within a lun */
-#define PAGE_POOL_SIZE 16
#define ADDR_POOL_SIZE 64
static int pblk_set_ppaf(struct pblk *pblk)
@@ -182,12 +201,10 @@ static int pblk_set_ppaf(struct pblk *pblk)
static int pblk_init_global_caches(struct pblk *pblk)
{
- char cache_name[PBLK_CACHE_NAME_LEN];
-
down_write(&pblk_lock);
- pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
+ pblk_ws_cache = kmem_cache_create("pblk_blk_ws",
sizeof(struct pblk_line_ws), 0, 0, NULL);
- if (!pblk_blk_ws_cache) {
+ if (!pblk_ws_cache) {
up_write(&pblk_lock);
return -ENOMEM;
}
@@ -195,7 +212,7 @@ static int pblk_init_global_caches(struct pblk *pblk)
pblk_rec_cache = kmem_cache_create("pblk_rec",
sizeof(struct pblk_rec_ctx), 0, 0, NULL);
if (!pblk_rec_cache) {
- kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_ws_cache);
up_write(&pblk_lock);
return -ENOMEM;
}
@@ -203,7 +220,7 @@ static int pblk_init_global_caches(struct pblk *pblk)
pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
0, 0, NULL);
if (!pblk_g_rq_cache) {
- kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_ws_cache);
kmem_cache_destroy(pblk_rec_cache);
up_write(&pblk_lock);
return -ENOMEM;
@@ -212,30 +229,25 @@ static int pblk_init_global_caches(struct pblk *pblk)
pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
0, 0, NULL);
if (!pblk_w_rq_cache) {
- kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_ws_cache);
kmem_cache_destroy(pblk_rec_cache);
kmem_cache_destroy(pblk_g_rq_cache);
up_write(&pblk_lock);
return -ENOMEM;
}
-
- snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
- pblk->disk->disk_name);
- pblk_line_meta_cache = kmem_cache_create(cache_name,
- pblk->lm.sec_bitmap_len, 0, 0, NULL);
- if (!pblk_line_meta_cache) {
- kmem_cache_destroy(pblk_blk_ws_cache);
- kmem_cache_destroy(pblk_rec_cache);
- kmem_cache_destroy(pblk_g_rq_cache);
- kmem_cache_destroy(pblk_w_rq_cache);
- up_write(&pblk_lock);
- return -ENOMEM;
- }
up_write(&pblk_lock);
return 0;
}
+static void pblk_free_global_caches(struct pblk *pblk)
+{
+ kmem_cache_destroy(pblk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_g_rq_cache);
+ kmem_cache_destroy(pblk_w_rq_cache);
+}
+
static int pblk_core_init(struct pblk *pblk)
{
struct nvm_tgt_dev *dev = pblk->dev;
@@ -247,70 +259,80 @@ static int pblk_core_init(struct pblk *pblk)
if (pblk_init_global_caches(pblk))
return -ENOMEM;
- pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
- if (!pblk->page_pool)
- return -ENOMEM;
+ /* Internal bios can be at most the sectors signaled by the device. */
+ pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev),
+ 0);
+ if (!pblk->page_bio_pool)
+ goto free_global_caches;
- pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE,
- pblk_blk_ws_cache);
- if (!pblk->line_ws_pool)
- goto free_page_pool;
+ pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE,
+ pblk_ws_cache);
+ if (!pblk->gen_ws_pool)
+ goto free_page_bio_pool;
pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
if (!pblk->rec_pool)
- goto free_blk_ws_pool;
+ goto free_gen_ws_pool;
- pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE,
+ pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns,
pblk_g_rq_cache);
- if (!pblk->g_rq_pool)
+ if (!pblk->r_rq_pool)
goto free_rec_pool;
- pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2,
+ pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns,
+ pblk_g_rq_cache);
+ if (!pblk->e_rq_pool)
+ goto free_r_rq_pool;
+
+ pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns,
pblk_w_rq_cache);
if (!pblk->w_rq_pool)
- goto free_g_rq_pool;
-
- pblk->line_meta_pool =
- mempool_create_slab_pool(PBLK_META_POOL_SIZE,
- pblk_line_meta_cache);
- if (!pblk->line_meta_pool)
- goto free_w_rq_pool;
+ goto free_e_rq_pool;
pblk->close_wq = alloc_workqueue("pblk-close-wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
if (!pblk->close_wq)
- goto free_line_meta_pool;
+ goto free_w_rq_pool;
pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!pblk->bb_wq)
goto free_close_wq;
- if (pblk_set_ppaf(pblk))
+ pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (!pblk->r_end_wq)
goto free_bb_wq;
+ if (pblk_set_ppaf(pblk))
+ goto free_r_end_wq;
+
if (pblk_rwb_init(pblk))
- goto free_bb_wq;
+ goto free_r_end_wq;
INIT_LIST_HEAD(&pblk->compl_list);
return 0;
+free_r_end_wq:
+ destroy_workqueue(pblk->r_end_wq);
free_bb_wq:
destroy_workqueue(pblk->bb_wq);
free_close_wq:
destroy_workqueue(pblk->close_wq);
-free_line_meta_pool:
- mempool_destroy(pblk->line_meta_pool);
free_w_rq_pool:
mempool_destroy(pblk->w_rq_pool);
-free_g_rq_pool:
- mempool_destroy(pblk->g_rq_pool);
+free_e_rq_pool:
+ mempool_destroy(pblk->e_rq_pool);
+free_r_rq_pool:
+ mempool_destroy(pblk->r_rq_pool);
free_rec_pool:
mempool_destroy(pblk->rec_pool);
-free_blk_ws_pool:
- mempool_destroy(pblk->line_ws_pool);
-free_page_pool:
- mempool_destroy(pblk->page_pool);
+free_gen_ws_pool:
+ mempool_destroy(pblk->gen_ws_pool);
+free_page_bio_pool:
+ mempool_destroy(pblk->page_bio_pool);
+free_global_caches:
+ pblk_free_global_caches(pblk);
return -ENOMEM;
}
@@ -319,21 +341,20 @@ static void pblk_core_free(struct pblk *pblk)
if (pblk->close_wq)
destroy_workqueue(pblk->close_wq);
+ if (pblk->r_end_wq)
+ destroy_workqueue(pblk->r_end_wq);
+
if (pblk->bb_wq)
destroy_workqueue(pblk->bb_wq);
- mempool_destroy(pblk->page_pool);
- mempool_destroy(pblk->line_ws_pool);
+ mempool_destroy(pblk->page_bio_pool);
+ mempool_destroy(pblk->gen_ws_pool);
mempool_destroy(pblk->rec_pool);
- mempool_destroy(pblk->g_rq_pool);
+ mempool_destroy(pblk->r_rq_pool);
+ mempool_destroy(pblk->e_rq_pool);
mempool_destroy(pblk->w_rq_pool);
- mempool_destroy(pblk->line_meta_pool);
- kmem_cache_destroy(pblk_blk_ws_cache);
- kmem_cache_destroy(pblk_rec_cache);
- kmem_cache_destroy(pblk_g_rq_cache);
- kmem_cache_destroy(pblk_w_rq_cache);
- kmem_cache_destroy(pblk_line_meta_cache);
+ pblk_free_global_caches(pblk);
}
static void pblk_luns_free(struct pblk *pblk)
@@ -372,13 +393,11 @@ static void pblk_line_meta_free(struct pblk *pblk)
kfree(l_mg->bb_aux);
kfree(l_mg->vsc_list);
- spin_lock(&l_mg->free_lock);
for (i = 0; i < PBLK_DATA_LINES; i++) {
kfree(l_mg->sline_meta[i]);
pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
kfree(l_mg->eline_meta[i]);
}
- spin_unlock(&l_mg->free_lock);
kfree(pblk->lines);
}
@@ -507,6 +526,13 @@ static int pblk_lines_configure(struct pblk *pblk, int flags)
}
}
+#ifdef CONFIG_NVM_DEBUG
+ pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+ /* Free full lines directly as GC has not been started yet */
+ pblk_gc_free_full_lines(pblk);
+
if (!line) {
/* Configure next line for user data */
line = pblk_line_get_first_data(pblk);
@@ -630,7 +656,10 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk)
fail_free_emeta:
while (--i >= 0) {
- vfree(l_mg->eline_meta[i]->buf);
+ if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
+ vfree(l_mg->eline_meta[i]->buf);
+ else
+ kfree(l_mg->eline_meta[i]->buf);
kfree(l_mg->eline_meta[i]);
}
@@ -681,8 +710,8 @@ static int pblk_lines_init(struct pblk *pblk)
lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
- lm->high_thrs = lm->sec_per_line / 2;
- lm->mid_thrs = lm->sec_per_line / 4;
+ lm->mid_thrs = lm->sec_per_line / 2;
+ lm->high_thrs = lm->sec_per_line / 4;
lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs;
/* Calculate necessary pages for smeta. See comment over struct
@@ -713,9 +742,13 @@ add_emeta_page:
goto add_emeta_page;
}
- lm->emeta_bb = geo->nr_luns - i;
- lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0],
- geo->sec_per_blk);
+ lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0;
+
+ lm->min_blk_line = 1;
+ if (geo->nr_luns > 1)
+ lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
+ lm->emeta_sec[0], geo->sec_per_blk);
+
if (lm->min_blk_line > lm->blk_per_line) {
pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
lm->blk_per_line);
@@ -890,6 +923,11 @@ static void pblk_exit(void *private)
down_write(&pblk_lock);
pblk_gc_exit(pblk);
pblk_tear_down(pblk);
+
+#ifdef CONFIG_NVM_DEBUG
+ pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
pblk_free(pblk);
up_write(&pblk_lock);
}
@@ -911,7 +949,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
int ret;
if (dev->identity.dom & NVM_RSP_L2P) {
- pr_err("pblk: device-side L2P table not supported. (%x)\n",
+ pr_err("pblk: host-side L2P table not supported. (%x)\n",
dev->identity.dom);
return ERR_PTR(-EINVAL);
}
@@ -923,6 +961,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
pblk->dev = dev;
pblk->disk = tdisk;
pblk->state = PBLK_STATE_RUNNING;
+ pblk->gc.gc_enabled = 0;
spin_lock_init(&pblk->trans_lock);
spin_lock_init(&pblk->lock);
@@ -944,6 +983,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
atomic_long_set(&pblk->recov_writes, 0);
atomic_long_set(&pblk->recov_writes, 0);
atomic_long_set(&pblk->recov_gc_writes, 0);
+ atomic_long_set(&pblk->recov_gc_reads, 0);
#endif
atomic_long_set(&pblk->read_failed, 0);
@@ -1012,6 +1052,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
pblk->rwb.nr_entries);
wake_up_process(pblk->writer_ts);
+
+ /* Check if we need to start GC */
+ pblk_gc_should_kick(pblk);
+
return pblk;
fail_stop_writer:
@@ -1044,6 +1088,7 @@ static struct nvm_tgt_type tt_pblk = {
.sysfs_init = pblk_sysfs_init,
.sysfs_exit = pblk_sysfs_exit,
+ .owner = THIS_MODULE,
};
static int __init pblk_module_init(void)
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index fddb924..6f3ecde 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -25,16 +25,28 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
unsigned int valid_secs)
{
struct pblk_line *line = pblk_line_get_data(pblk);
- struct pblk_emeta *emeta = line->emeta;
+ struct pblk_emeta *emeta;
struct pblk_w_ctx *w_ctx;
- __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf);
+ __le64 *lba_list;
u64 paddr;
int nr_secs = pblk->min_write_pgs;
int i;
+ if (pblk_line_is_full(line)) {
+ struct pblk_line *prev_line = line;
+
+ line = pblk_line_replace_data(pblk);
+ pblk_line_close_meta(pblk, prev_line);
+ }
+
+ emeta = line->emeta;
+ lba_list = emeta_to_lbas(pblk, emeta->buf);
+
paddr = pblk_alloc_page(pblk, line, nr_secs);
for (i = 0; i < nr_secs; i++, paddr++) {
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
/* ppa to be sent to the device */
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
@@ -51,22 +63,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
w_ctx->ppa = ppa_list[i];
meta_list[i].lba = cpu_to_le64(w_ctx->lba);
lba_list[paddr] = cpu_to_le64(w_ctx->lba);
- line->nr_valid_lbas++;
+ if (lba_list[paddr] != addr_empty)
+ line->nr_valid_lbas++;
} else {
- __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
-
lba_list[paddr] = meta_list[i].lba = addr_empty;
__pblk_map_invalidate(pblk, line, paddr);
}
}
- if (pblk_line_is_full(line)) {
- struct pblk_line *prev_line = line;
-
- pblk_line_replace_data(pblk);
- pblk_line_close_meta(pblk, prev_line);
- }
-
pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 9bc3257..b8f78e4 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -201,8 +201,7 @@ unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
return subm;
}
-static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
- unsigned int to_update)
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_line *line;
@@ -213,7 +212,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
int flags;
for (i = 0; i < to_update; i++) {
- entry = &rb->entries[*l2p_upd];
+ entry = &rb->entries[rb->l2p_update];
w_ctx = &entry->w_ctx;
flags = READ_ONCE(entry->w_ctx.flags);
@@ -230,7 +229,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
kref_put(&line->ref, pblk_line_put);
clean_wctx(w_ctx);
- *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
+ rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
}
pblk_rl_out(&pblk->rl, user_io, gc_io);
@@ -258,7 +257,7 @@ static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
count = nr_entries - space;
/* l2p_update used exclusively under rb->w_lock */
- ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count);
+ ret = __pblk_rb_update_l2p(rb, count);
out:
return ret;
@@ -280,7 +279,7 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb)
sync = smp_load_acquire(&rb->sync);
to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
- __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
+ __pblk_rb_update_l2p(rb, to_update);
spin_unlock(&rb->w_lock);
}
@@ -325,8 +324,8 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
}
void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
- struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
- unsigned int ring_pos)
+ struct pblk_w_ctx w_ctx, struct pblk_line *line,
+ u64 paddr, unsigned int ring_pos)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_rb_entry *entry;
@@ -341,7 +340,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
__pblk_rb_write_entry(rb, data, w_ctx, entry);
- if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line))
+ if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr))
entry->w_ctx.lba = ADDR_EMPTY;
flags = w_ctx.flags | PBLK_WRITTEN_DATA;
@@ -355,7 +354,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
{
struct pblk_rb_entry *entry;
unsigned int subm, sync_point;
- int flags;
subm = READ_ONCE(rb->subm);
@@ -369,12 +367,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
entry = &rb->entries[sync_point];
- flags = READ_ONCE(entry->w_ctx.flags);
- flags |= PBLK_FLUSH_ENTRY;
-
- /* Release flags on context. Protect from writes */
- smp_store_release(&entry->w_ctx.flags, flags);
-
/* Protect syncs */
smp_store_release(&rb->sync_point, sync_point);
@@ -454,6 +446,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
/* Protect from read count */
smp_store_release(&rb->mem, mem);
+
return 1;
}
@@ -558,12 +551,13 @@ out:
* persist data on the write buffer to the media.
*/
unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
- struct bio *bio, unsigned int pos,
- unsigned int nr_entries, unsigned int count)
+ unsigned int pos, unsigned int nr_entries,
+ unsigned int count)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct request_queue *q = pblk->dev->q;
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ struct bio *bio = rqd->bio;
struct pblk_rb_entry *entry;
struct page *page;
unsigned int pad = 0, to_read = nr_entries;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index d682e89..ca79d8f 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -39,21 +39,15 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
}
static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
- unsigned long *read_bitmap)
+ sector_t blba, unsigned long *read_bitmap)
{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
struct bio *bio = rqd->bio;
struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
- sector_t blba = pblk_get_lba(bio);
int nr_secs = rqd->nr_ppas;
bool advanced_bio = false;
int i, j = 0;
- /* logic error: lba out-of-bounds. Ignore read request */
- if (blba + nr_secs >= pblk->rl.nr_secs) {
- WARN(1, "pblk: read lbas out of bounds\n");
- return;
- }
-
pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
for (i = 0; i < nr_secs; i++) {
@@ -63,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
retry:
if (pblk_ppa_empty(p)) {
WARN_ON(test_and_set_bit(i, read_bitmap));
+ meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
if (unlikely(!advanced_bio)) {
bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
@@ -82,6 +77,7 @@ retry:
goto retry;
}
WARN_ON(test_and_set_bit(i, read_bitmap));
+ meta_list[i].lba = cpu_to_le64(lba);
advanced_bio = true;
#ifdef CONFIG_NVM_DEBUG
atomic_long_inc(&pblk->cache_reads);
@@ -117,10 +113,51 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
return NVM_IO_OK;
}
-static void pblk_end_io_read(struct nvm_rq *rqd)
+static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd,
+ sector_t blba)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ int nr_lbas = rqd->nr_ppas;
+ int i;
+
+ for (i = 0; i < nr_lbas; i++) {
+ u64 lba = le64_to_cpu(meta_list[i].lba);
+
+ if (lba == ADDR_EMPTY)
+ continue;
+
+ WARN(lba != blba + i, "pblk: corrupted read LBA\n");
+ }
+}
+
+static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct ppa_addr *ppa_list;
+ int i;
+
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ struct ppa_addr ppa = ppa_list[i];
+ struct pblk_line *line;
+
+ line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+ kref_put(&line->ref, pblk_line_put_wq);
+ }
+}
+
+static void pblk_end_user_read(struct bio *bio)
+{
+#ifdef CONFIG_NVM_DEBUG
+ WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
+#endif
+ bio_endio(bio);
+ bio_put(bio);
+}
+
+static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
+ bool put_line)
{
- struct pblk *pblk = rqd->private;
- struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = rqd->bio;
@@ -131,47 +168,51 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
#endif
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
+ pblk_read_check(pblk, rqd, r_ctx->lba);
bio_put(bio);
- if (r_ctx->private) {
- struct bio *orig_bio = r_ctx->private;
+ if (r_ctx->private)
+ pblk_end_user_read((struct bio *)r_ctx->private);
-#ifdef CONFIG_NVM_DEBUG
- WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n");
-#endif
- bio_endio(orig_bio);
- bio_put(orig_bio);
- }
+ if (put_line)
+ pblk_read_put_rqd_kref(pblk, rqd);
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
#endif
- pblk_free_rqd(pblk, rqd, READ);
+ pblk_free_rqd(pblk, rqd, PBLK_READ);
atomic_dec(&pblk->inflight_io);
}
+static void pblk_end_io_read(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+
+ __pblk_end_io_read(pblk, rqd, true);
+}
+
static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int bio_init_idx,
unsigned long *read_bitmap)
{
struct bio *new_bio, *bio = rqd->bio;
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
struct bio_vec src_bv, dst_bv;
void *ppa_ptr = NULL;
void *src_p, *dst_p;
dma_addr_t dma_ppa_list = 0;
+ __le64 *lba_list_mem, *lba_list_media;
int nr_secs = rqd->nr_ppas;
int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
int i, ret, hole;
- DECLARE_COMPLETION_ONSTACK(wait);
+
+ /* Re-use allocated memory for intermediate lbas */
+ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+ lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
new_bio = bio_alloc(GFP_KERNEL, nr_holes);
- if (!new_bio) {
- pr_err("pblk: could not alloc read bio\n");
- return NVM_IO_ERR;
- }
if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
goto err;
@@ -181,34 +222,29 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
goto err;
}
+ for (i = 0; i < nr_secs; i++)
+ lba_list_mem[i] = meta_list[i].lba;
+
new_bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
- new_bio->bi_private = &wait;
- new_bio->bi_end_io = pblk_end_bio_sync;
rqd->bio = new_bio;
rqd->nr_ppas = nr_holes;
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
- rqd->end_io = NULL;
- if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+ if (unlikely(nr_holes == 1)) {
ppa_ptr = rqd->ppa_list;
dma_ppa_list = rqd->dma_ppa_list;
rqd->ppa_addr = rqd->ppa_list[0];
}
- ret = pblk_submit_read_io(pblk, rqd);
+ ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
bio_put(rqd->bio);
- pr_err("pblk: read IO submission failed\n");
+ pr_err("pblk: sync read IO submission failed\n");
goto err;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: partial read I/O timed out\n");
- }
-
if (rqd->error) {
atomic_long_inc(&pblk->read_failed);
#ifdef CONFIG_NVM_DEBUG
@@ -216,15 +252,31 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
#endif
}
- if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+ if (unlikely(nr_holes == 1)) {
+ struct ppa_addr ppa;
+
+ ppa = rqd->ppa_addr;
rqd->ppa_list = ppa_ptr;
rqd->dma_ppa_list = dma_ppa_list;
+ rqd->ppa_list[0] = ppa;
+ }
+
+ for (i = 0; i < nr_secs; i++) {
+ lba_list_media[i] = meta_list[i].lba;
+ meta_list[i].lba = lba_list_mem[i];
}
/* Fill the holes in the original bio */
i = 0;
hole = find_first_zero_bit(read_bitmap, nr_secs);
do {
+ int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]);
+ struct pblk_line *line = &pblk->lines[line_id];
+
+ kref_put(&line->ref, pblk_line_put);
+
+ meta_list[hole].lba = lba_list_media[i];
+
src_bv = new_bio->bi_io_vec[i++];
dst_bv = bio->bi_io_vec[bio_init_idx + hole];
@@ -238,7 +290,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
kunmap_atomic(src_p);
kunmap_atomic(dst_p);
- mempool_free(src_bv.bv_page, pblk->page_pool);
+ mempool_free(src_bv.bv_page, pblk->page_bio_pool);
hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
} while (hole < nr_secs);
@@ -246,34 +298,26 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
bio_put(new_bio);
/* Complete the original bio and associated request */
+ bio_endio(bio);
rqd->bio = bio;
rqd->nr_ppas = nr_secs;
- rqd->private = pblk;
- bio_endio(bio);
- pblk_end_io_read(rqd);
+ __pblk_end_io_read(pblk, rqd, false);
return NVM_IO_OK;
err:
/* Free allocated pages in new bio */
pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
- rqd->private = pblk;
- pblk_end_io_read(rqd);
+ __pblk_end_io_read(pblk, rqd, false);
return NVM_IO_ERR;
}
static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
- unsigned long *read_bitmap)
+ sector_t lba, unsigned long *read_bitmap)
{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
struct bio *bio = rqd->bio;
struct ppa_addr ppa;
- sector_t lba = pblk_get_lba(bio);
-
- /* logic error: lba out-of-bounds. Ignore read request */
- if (lba >= pblk->rl.nr_secs) {
- WARN(1, "pblk: read lba out of bounds\n");
- return;
- }
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -284,6 +328,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
retry:
if (pblk_ppa_empty(ppa)) {
WARN_ON(test_and_set_bit(0, read_bitmap));
+ meta_list[0].lba = cpu_to_le64(ADDR_EMPTY);
return;
}
@@ -295,9 +340,12 @@ retry:
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
goto retry;
}
+
WARN_ON(test_and_set_bit(0, read_bitmap));
+ meta_list[0].lba = cpu_to_le64(lba);
+
#ifdef CONFIG_NVM_DEBUG
- atomic_long_inc(&pblk->cache_reads);
+ atomic_long_inc(&pblk->cache_reads);
#endif
} else {
rqd->ppa_addr = ppa;
@@ -309,22 +357,24 @@ retry:
int pblk_submit_read(struct pblk *pblk, struct bio *bio)
{
struct nvm_tgt_dev *dev = pblk->dev;
+ sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio);
+ struct pblk_g_ctx *r_ctx;
struct nvm_rq *rqd;
- unsigned long read_bitmap; /* Max 64 ppas per request */
unsigned int bio_init_idx;
+ unsigned long read_bitmap; /* Max 64 ppas per request */
int ret = NVM_IO_ERR;
- if (nr_secs > PBLK_MAX_REQ_ADDRS)
+ /* logic error: lba out-of-bounds. Ignore read request */
+ if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) {
+ WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n",
+ (unsigned long long)blba, nr_secs);
return NVM_IO_ERR;
+ }
bitmap_zero(&read_bitmap, nr_secs);
- rqd = pblk_alloc_rqd(pblk, READ);
- if (IS_ERR(rqd)) {
- pr_err_ratelimited("pblk: not able to alloc rqd");
- return NVM_IO_ERR;
- }
+ rqd = pblk_alloc_rqd(pblk, PBLK_READ);
rqd->opcode = NVM_OP_PREAD;
rqd->bio = bio;
@@ -332,6 +382,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->private = pblk;
rqd->end_io = pblk_end_io_read;
+ r_ctx = nvm_rq_to_pdu(rqd);
+ r_ctx->lba = blba;
+
/* Save the index for this bio's start. This is needed in case
* we need to fill a partial read.
*/
@@ -348,23 +401,22 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
- pblk_read_ppalist_rq(pblk, rqd, &read_bitmap);
+ pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap);
} else {
- pblk_read_rq(pblk, rqd, &read_bitmap);
+ pblk_read_rq(pblk, rqd, blba, &read_bitmap);
}
bio_get(bio);
if (bitmap_full(&read_bitmap, nr_secs)) {
bio_endio(bio);
atomic_inc(&pblk->inflight_io);
- pblk_end_io_read(rqd);
+ __pblk_end_io_read(pblk, rqd, false);
return NVM_IO_OK;
}
/* All sectors are to be read from the device */
if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
struct bio *int_bio = NULL;
- struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
/* Clone read bio to deal with read errors internally */
int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
@@ -399,40 +451,46 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
return NVM_IO_OK;
fail_rqd_free:
- pblk_free_rqd(pblk, rqd, READ);
+ pblk_free_rqd(pblk, rqd, PBLK_READ);
return ret;
}
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_line *line, u64 *lba_list,
- unsigned int nr_secs)
+ u64 *paddr_list_gc, unsigned int nr_secs)
{
- struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+ struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS];
+ struct ppa_addr ppa_gc;
int valid_secs = 0;
int i;
- pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs);
+ pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs);
for (i = 0; i < nr_secs; i++) {
- if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id ||
- pblk_ppa_empty(ppas[i])) {
- lba_list[i] = ADDR_EMPTY;
+ if (lba_list[i] == ADDR_EMPTY)
+ continue;
+
+ ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id);
+ if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) {
+ paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY;
continue;
}
- rqd->ppa_list[valid_secs++] = ppas[i];
+ rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
}
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(valid_secs, &pblk->inflight_reads);
#endif
+
return valid_secs;
}
static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
- struct pblk_line *line, sector_t lba)
+ struct pblk_line *line, sector_t lba,
+ u64 paddr_gc)
{
- struct ppa_addr ppa;
+ struct ppa_addr ppa_l2p, ppa_gc;
int valid_secs = 0;
if (lba == ADDR_EMPTY)
@@ -445,15 +503,14 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
}
spin_lock(&pblk->trans_lock);
- ppa = pblk_trans_map_get(pblk, lba);
+ ppa_l2p = pblk_trans_map_get(pblk, lba);
spin_unlock(&pblk->trans_lock);
- /* Ignore updated values until the moment */
- if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id ||
- pblk_ppa_empty(ppa))
+ ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id);
+ if (!pblk_ppa_comp(ppa_l2p, ppa_gc))
goto out;
- rqd->ppa_addr = ppa;
+ rqd->ppa_addr = ppa_l2p;
valid_secs = 1;
#ifdef CONFIG_NVM_DEBUG
@@ -464,42 +521,44 @@ out:
return valid_secs;
}
-int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
- unsigned int nr_secs, unsigned int *secs_to_gc,
- struct pblk_line *line)
+int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct bio *bio;
struct nvm_rq rqd;
- int ret, data_len;
- DECLARE_COMPLETION_ONSTACK(wait);
+ int data_len;
+ int ret = NVM_IO_OK;
memset(&rqd, 0, sizeof(struct nvm_rq));
rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
&rqd.dma_meta_list);
if (!rqd.meta_list)
- return NVM_IO_ERR;
+ return -ENOMEM;
- if (nr_secs > 1) {
+ if (gc_rq->nr_secs > 1) {
rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
- *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list,
- nr_secs);
- if (*secs_to_gc == 1)
+ gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line,
+ gc_rq->lba_list,
+ gc_rq->paddr_list,
+ gc_rq->nr_secs);
+ if (gc_rq->secs_to_gc == 1)
rqd.ppa_addr = rqd.ppa_list[0];
} else {
- *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]);
+ gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line,
+ gc_rq->lba_list[0],
+ gc_rq->paddr_list[0]);
}
- if (!(*secs_to_gc))
+ if (!(gc_rq->secs_to_gc))
goto out;
- data_len = (*secs_to_gc) * geo->sec_size;
- bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len,
- PBLK_KMALLOC_META, GFP_KERNEL);
+ data_len = (gc_rq->secs_to_gc) * geo->sec_size;
+ bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
+ PBLK_VMALLOC_META, GFP_KERNEL);
if (IS_ERR(bio)) {
pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
goto err_free_dma;
@@ -509,23 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
bio_set_op_attrs(bio, REQ_OP_READ, 0);
rqd.opcode = NVM_OP_PREAD;
- rqd.end_io = pblk_end_io_sync;
- rqd.private = &wait;
- rqd.nr_ppas = *secs_to_gc;
+ rqd.nr_ppas = gc_rq->secs_to_gc;
rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
rqd.bio = bio;
- ret = pblk_submit_read_io(pblk, &rqd);
- if (ret) {
- bio_endio(bio);
+ if (pblk_submit_io_sync(pblk, &rqd)) {
+ ret = -EIO;
pr_err("pblk: GC read request failed\n");
- goto err_free_dma;
+ goto err_free_bio;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: GC read I/O timed out\n");
- }
atomic_dec(&pblk->inflight_io);
if (rqd.error) {
@@ -536,16 +588,18 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
}
#ifdef CONFIG_NVM_DEBUG
- atomic_long_add(*secs_to_gc, &pblk->sync_reads);
- atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads);
- atomic_long_sub(*secs_to_gc, &pblk->inflight_reads);
+ atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
+ atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
+ atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
#endif
out:
nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
- return NVM_IO_OK;
+ return ret;
+err_free_bio:
+ bio_put(bio);
err_free_dma:
nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
- return NVM_IO_ERR;
+ return ret;
}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index cb556e0..eadb3eb 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -34,10 +34,6 @@ void pblk_submit_rec(struct work_struct *work)
max_secs);
bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
- if (!bio) {
- pr_err("pblk: not able to create recovery bio\n");
- return;
- }
bio->bi_iter.bi_sector = 0;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -71,7 +67,7 @@ void pblk_submit_rec(struct work_struct *work)
err:
bio_put(bio);
- pblk_free_rqd(pblk, rqd, WRITE);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
}
int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
@@ -84,12 +80,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
struct pblk_c_ctx *rec_ctx;
int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
- rec_rqd = pblk_alloc_rqd(pblk, WRITE);
- if (IS_ERR(rec_rqd)) {
- pr_err("pblk: could not create recovery req.\n");
- return -ENOMEM;
- }
-
+ rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
rec_ctx = nvm_rq_to_pdu(rec_rqd);
/* Copy completion bitmap, but exclude the first X completed entries */
@@ -142,19 +133,19 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
struct pblk_emeta *emeta = line->emeta;
struct line_emeta *emeta_buf = emeta->buf;
__le64 *lba_list;
- int data_start;
- int nr_data_lbas, nr_valid_lbas, nr_lbas = 0;
- int i;
+ u64 data_start, data_end;
+ u64 nr_valid_lbas, nr_lbas = 0;
+ u64 i;
lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
if (!lba_list)
return 1;
data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
- nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0];
+ data_end = line->emeta_ssec;
nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas);
- for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) {
+ for (i = data_start; i < data_end; i++) {
struct ppa_addr ppa;
int pos;
@@ -181,8 +172,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
}
if (nr_valid_lbas != nr_lbas)
- pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n",
- line->id, emeta_buf->nr_valid_lbas, nr_lbas);
+ pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n",
+ line->id, nr_valid_lbas, nr_lbas);
line->left_msecs = 0;
@@ -225,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
int rq_ppas, rq_len;
int i, j;
int ret = 0;
- DECLARE_COMPLETION_ONSTACK(wait);
ppa_list = p.ppa_list;
meta_list = p.meta_list;
@@ -262,8 +252,6 @@ next_read_rq:
rqd->ppa_list = ppa_list;
rqd->dma_ppa_list = dma_ppa_list;
rqd->dma_meta_list = dma_meta_list;
- rqd->end_io = pblk_end_io_sync;
- rqd->private = &wait;
if (pblk_io_aligned(pblk, rq_ppas))
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -289,19 +277,13 @@ next_read_rq:
}
/* If read fails, more padding is needed */
- ret = pblk_submit_io(pblk, rqd);
+ ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
pr_err("pblk: I/O submission failed: %d\n", ret);
return ret;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: L2P recovery read timed out\n");
- return -EINTR;
- }
atomic_dec(&pblk->inflight_io);
- reinit_completion(&wait);
/* At this point, the read should not fail. If it does, it is a problem
* we cannot recover from here. Need FTL log.
@@ -338,13 +320,10 @@ static void pblk_end_io_recov(struct nvm_rq *rqd)
{
struct pblk_pad_rq *pad_rq = rqd->private;
struct pblk *pblk = pad_rq->pblk;
- struct nvm_tgt_dev *dev = pblk->dev;
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
- bio_put(rqd->bio);
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
- pblk_free_rqd(pblk, rqd, WRITE);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
atomic_dec(&pblk->inflight_io);
kref_put(&pad_rq->ref, pblk_recov_complete);
@@ -404,25 +383,21 @@ next_pad_rq:
ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
- rqd = pblk_alloc_rqd(pblk, WRITE);
- if (IS_ERR(rqd)) {
- ret = PTR_ERR(rqd);
- goto fail_free_meta;
- }
-
bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
PBLK_VMALLOC_META, GFP_KERNEL);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- goto fail_free_rqd;
+ goto fail_free_meta;
}
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
+
rqd->bio = bio;
rqd->opcode = NVM_OP_PWRITE;
- rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
rqd->meta_list = meta_list;
rqd->nr_ppas = rq_ppas;
rqd->ppa_list = ppa_list;
@@ -490,8 +465,6 @@ free_rq:
fail_free_bio:
bio_put(bio);
-fail_free_rqd:
- pblk_free_rqd(pblk, rqd, WRITE);
fail_free_meta:
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
fail_free_pad:
@@ -522,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
int ret = 0;
int rec_round;
int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
- DECLARE_COMPLETION_ONSTACK(wait);
ppa_list = p.ppa_list;
meta_list = p.meta_list;
@@ -557,8 +529,6 @@ next_rq:
rqd->ppa_list = ppa_list;
rqd->dma_ppa_list = dma_ppa_list;
rqd->dma_meta_list = dma_meta_list;
- rqd->end_io = pblk_end_io_sync;
- rqd->private = &wait;
if (pblk_io_aligned(pblk, rq_ppas))
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -584,18 +554,13 @@ next_rq:
addr_to_gen_ppa(pblk, w_ptr, line->id);
}
- ret = pblk_submit_io(pblk, rqd);
+ ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
pr_err("pblk: I/O submission failed: %d\n", ret);
return ret;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: L2P recovery read timed out\n");
- }
atomic_dec(&pblk->inflight_io);
- reinit_completion(&wait);
/* This should not happen since the read failed during normal recovery,
* but the media works funny sometimes...
@@ -663,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
int i, j;
int ret = 0;
int left_ppas = pblk_calc_sec_in_line(pblk, line);
- DECLARE_COMPLETION_ONSTACK(wait);
ppa_list = p.ppa_list;
meta_list = p.meta_list;
@@ -696,8 +660,6 @@ next_rq:
rqd->ppa_list = ppa_list;
rqd->dma_ppa_list = dma_ppa_list;
rqd->dma_meta_list = dma_meta_list;
- rqd->end_io = pblk_end_io_sync;
- rqd->private = &wait;
if (pblk_io_aligned(pblk, rq_ppas))
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -723,19 +685,14 @@ next_rq:
addr_to_gen_ppa(pblk, paddr, line->id);
}
- ret = pblk_submit_io(pblk, rqd);
+ ret = pblk_submit_io_sync(pblk, rqd);
if (ret) {
pr_err("pblk: I/O submission failed: %d\n", ret);
bio_put(bio);
return ret;
}
- if (!wait_for_completion_io_timeout(&wait,
- msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
- pr_err("pblk: L2P recovery read timed out\n");
- }
atomic_dec(&pblk->inflight_io);
- reinit_completion(&wait);
/* Reached the end of the written line */
if (rqd->error) {
@@ -785,15 +742,9 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
dma_addr_t dma_ppa_list, dma_meta_list;
int done, ret = 0;
- rqd = pblk_alloc_rqd(pblk, READ);
- if (IS_ERR(rqd))
- return PTR_ERR(rqd);
-
meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
- if (!meta_list) {
- ret = -ENOMEM;
- goto free_rqd;
- }
+ if (!meta_list)
+ return -ENOMEM;
ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
@@ -804,6 +755,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
goto free_meta_list;
}
+ rqd = pblk_alloc_rqd(pblk, PBLK_READ);
+
p.ppa_list = ppa_list;
p.meta_list = meta_list;
p.rqd = rqd;
@@ -832,8 +785,6 @@ out:
kfree(data);
free_meta_list:
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
-free_rqd:
- pblk_free_rqd(pblk, rqd, READ);
return ret;
}
@@ -851,11 +802,33 @@ static void pblk_recov_line_add_ordered(struct list_head *head,
__list_add(&line->list, t->list.prev, &t->list);
}
-struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
+ unsigned int emeta_secs;
+ u64 emeta_start;
+ struct ppa_addr ppa;
+ int pos;
+
+ emeta_secs = lm->emeta_sec[0];
+ emeta_start = lm->sec_per_line;
+
+ while (emeta_secs) {
+ emeta_start--;
+ ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+ if (!test_bit(pos, line->blk_bitmap))
+ emeta_secs--;
+ }
+
+ return emeta_start;
+}
+
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line, *tline, *data_line = NULL;
struct pblk_smeta *smeta;
@@ -900,9 +873,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC)
continue;
- if (le16_to_cpu(smeta_buf->header.version) != 1) {
+ if (smeta_buf->header.version != SMETA_VERSION) {
pr_err("pblk: found incompatible line version %u\n",
- smeta_buf->header.version);
+ le16_to_cpu(smeta_buf->header.version));
return ERR_PTR(-EINVAL);
}
@@ -954,15 +927,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
/* Verify closed blocks and recover this portion of L2P table*/
list_for_each_entry_safe(line, tline, &recov_list, list) {
- int off, nr_bb;
-
recovered_lines++;
- /* Calculate where emeta starts based on the line bb */
- off = lm->sec_per_line - lm->emeta_sec[0];
- nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
- off -= nr_bb * geo->sec_per_pl;
- line->emeta_ssec = off;
+ line->emeta_ssec = pblk_line_emeta_start(pblk, line);
line->emeta = emeta;
memset(line->emeta->buf, 0, lm->emeta_len[0]);
@@ -987,7 +954,7 @@ next:
list_move_tail(&line->list, move_list);
spin_unlock(&l_mg->gc_lock);
- mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ kfree(line->map_bitmap);
line->map_bitmap = NULL;
line->smeta = NULL;
line->emeta = NULL;
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index 2e6a536..abae31f 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -96,9 +96,11 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
*
* Only the total number of free blocks is used to configure the rate limiter.
*/
-static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
+void pblk_rl_update_rates(struct pblk_rl *rl)
{
+ struct pblk *pblk = container_of(rl, struct pblk, rl);
unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
+ int max = rl->rb_budget;
if (free_blocks >= rl->high) {
rl->rb_user_max = max;
@@ -124,23 +126,18 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
rl->rb_state = PBLK_RL_LOW;
}
- return rl->rb_state;
+ if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW))
+ pblk_gc_should_start(pblk);
+ else
+ pblk_gc_should_stop(pblk);
}
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
- struct pblk *pblk = container_of(rl, struct pblk, rl);
int blk_in_line = atomic_read(&line->blk_in_line);
- int ret;
atomic_add(blk_in_line, &rl->free_blocks);
- /* Rates will not change that often - no need to lock update */
- ret = pblk_rl_update_rates(rl, rl->rb_budget);
-
- if (ret == (PBLK_RL_MID | PBLK_RL_LOW))
- pblk_gc_should_start(pblk);
- else
- pblk_gc_should_stop(pblk);
+ pblk_rl_update_rates(rl);
}
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
@@ -148,19 +145,7 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
int blk_in_line = atomic_read(&line->blk_in_line);
atomic_sub(blk_in_line, &rl->free_blocks);
-}
-
-void pblk_gc_should_kick(struct pblk *pblk)
-{
- struct pblk_rl *rl = &pblk->rl;
- int ret;
-
- /* Rates will not change that often - no need to lock update */
- ret = pblk_rl_update_rates(rl, rl->rb_budget);
- if (ret == (PBLK_RL_MID | PBLK_RL_LOW))
- pblk_gc_should_start(pblk);
- else
- pblk_gc_should_stop(pblk);
+ pblk_rl_update_rates(rl);
}
int pblk_rl_high_thrs(struct pblk_rl *rl)
@@ -168,14 +153,9 @@ int pblk_rl_high_thrs(struct pblk_rl *rl)
return rl->high;
}
-int pblk_rl_low_thrs(struct pblk_rl *rl)
-{
- return rl->low;
-}
-
-int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
+int pblk_rl_max_io(struct pblk_rl *rl)
{
- return rl->rb_user_max;
+ return rl->rb_max_io;
}
static void pblk_rl_u_timer(unsigned long data)
@@ -214,6 +194,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
/* To start with, all buffer is available to user I/O writers */
rl->rb_budget = budget;
rl->rb_user_max = budget;
+ rl->rb_max_io = budget >> 1;
rl->rb_gc_max = 0;
rl->rb_state = PBLK_RL_HIGH;
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 95fb434..cd49e88 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -253,7 +253,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
sz += snprintf(page + sz, PAGE_SIZE - sz,
"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
gc_full, gc_high, gc_mid, gc_low, gc_empty,
- atomic_read(&pblk->gc.inflight_gc));
+ atomic_read(&pblk->gc.read_inflight_gc));
sz += snprintf(page + sz, PAGE_SIZE - sz,
"data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 3ad9e56..6c1cafa 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -20,7 +20,6 @@
static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
- struct nvm_tgt_dev *dev = pblk->dev;
struct bio *original_bio;
unsigned long ret;
int i;
@@ -33,16 +32,18 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
bio_endio(original_bio);
}
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
+ c_ctx->nr_padded);
+
#ifdef CONFIG_NVM_DEBUG
- atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes);
+ atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
#endif
ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
-
bio_put(rqd->bio);
- pblk_free_rqd(pblk, rqd, WRITE);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
return ret;
}
@@ -107,10 +108,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
ppa_list = &rqd->ppa_addr;
recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
- if (!recovery) {
- pr_err("pblk: could not allocate recovery context\n");
- return;
- }
+
INIT_LIST_HEAD(&recovery->failed);
bit = -1;
@@ -175,7 +173,6 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
static void pblk_end_io_write_meta(struct nvm_rq *rqd)
{
struct pblk *pblk = rqd->private;
- struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd);
struct pblk_line *line = m_ctx->private;
struct pblk_emeta *emeta = line->emeta;
@@ -187,19 +184,13 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
pblk_log_write_err(pblk, rqd);
pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
}
-#ifdef CONFIG_NVM_DEBUG
- else
- WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
-#endif
sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
if (sync == emeta->nr_entries)
- pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws,
- pblk->close_wq);
+ pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws,
+ GFP_ATOMIC, pblk->close_wq);
- bio_put(rqd->bio);
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
- pblk_free_rqd(pblk, rqd, READ);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
atomic_dec(&pblk->inflight_io);
}
@@ -213,7 +204,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
/* Setup write request */
rqd->opcode = NVM_OP_PWRITE;
rqd->nr_ppas = nr_secs;
- rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
rqd->private = pblk;
rqd->end_io = end_io;
@@ -229,15 +220,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
}
static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
- struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa)
+ struct ppa_addr *erase_ppa)
{
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line *e_line = pblk_line_get_erase(pblk);
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
unsigned int valid = c_ctx->nr_valid;
unsigned int padded = c_ctx->nr_padded;
unsigned int nr_secs = valid + padded;
unsigned long *lun_bitmap;
- int ret = 0;
+ int ret;
lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
if (!lun_bitmap)
@@ -279,7 +271,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
rqd->ppa_status = (u64)0;
- rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
return ret;
}
@@ -303,55 +295,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
return secs_to_sync;
}
-static inline int pblk_valid_meta_ppa(struct pblk *pblk,
- struct pblk_line *meta_line,
- struct ppa_addr *ppa_list, int nr_ppas)
-{
- struct nvm_tgt_dev *dev = pblk->dev;
- struct nvm_geo *geo = &dev->geo;
- struct pblk_line *data_line;
- struct ppa_addr ppa, ppa_opt;
- u64 paddr;
- int i;
-
- data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])];
- paddr = pblk_lookup_page(pblk, meta_line);
- ppa = addr_to_gen_ppa(pblk, paddr, 0);
-
- if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap))
- return 1;
-
- /* Schedule a metadata I/O that is half the distance from the data I/O
- * with regards to the number of LUNs forming the pblk instance. This
- * balances LUN conflicts across every I/O.
- *
- * When the LUN configuration changes (e.g., due to GC), this distance
- * can align, which would result on a LUN deadlock. In this case, modify
- * the distance to not be optimal, but allow metadata I/Os to succeed.
- */
- ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
- if (unlikely(ppa_opt.ppa == ppa.ppa)) {
- data_line->meta_distance--;
- return 0;
- }
-
- for (i = 0; i < nr_ppas; i += pblk->min_write_pgs)
- if (ppa_list[i].g.ch == ppa_opt.g.ch &&
- ppa_list[i].g.lun == ppa_opt.g.lun)
- return 1;
-
- if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) {
- for (i = 0; i < nr_ppas; i += pblk->min_write_pgs)
- if (ppa_list[i].g.ch == ppa.g.ch &&
- ppa_list[i].g.lun == ppa.g.lun)
- return 0;
-
- return 1;
- }
-
- return 0;
-}
-
int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
{
struct nvm_tgt_dev *dev = pblk->dev;
@@ -370,11 +313,8 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
int i, j;
int ret;
- rqd = pblk_alloc_rqd(pblk, READ);
- if (IS_ERR(rqd)) {
- pr_err("pblk: cannot allocate write req.\n");
- return PTR_ERR(rqd);
- }
+ rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
+
m_ctx = nvm_rq_to_pdu(rqd);
m_ctx->private = meta_line;
@@ -407,8 +347,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
if (emeta->mem >= lm->emeta_len[0]) {
spin_lock(&l_mg->close_lock);
list_del(&meta_line->list);
- WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line),
- "pblk: corrupt meta line %d\n", meta_line->id);
spin_unlock(&l_mg->close_lock);
}
@@ -428,18 +366,51 @@ fail_rollback:
pblk_dealloc_page(pblk, meta_line, rq_ppas);
list_add(&meta_line->list, &meta_line->list);
spin_unlock(&l_mg->close_lock);
-
- nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
fail_free_bio:
- if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META))
- bio_put(bio);
+ bio_put(bio);
fail_free_rqd:
- pblk_free_rqd(pblk, rqd, READ);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
return ret;
}
-static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list,
- int prev_n)
+static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
+ struct pblk_line *meta_line,
+ struct nvm_rq *data_rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
+ struct pblk_line *data_line = pblk_line_get_data(pblk);
+ struct ppa_addr ppa, ppa_opt;
+ u64 paddr;
+ int pos_opt;
+
+ /* Schedule a metadata I/O that is half the distance from the data I/O
+ * with regards to the number of LUNs forming the pblk instance. This
+ * balances LUN conflicts across every I/O.
+ *
+ * When the LUN configuration changes (e.g., due to GC), this distance
+ * can align, which would result on metadata and data I/Os colliding. In
+ * this case, modify the distance to not be optimal, but move the
+ * optimal in the right direction.
+ */
+ paddr = pblk_lookup_page(pblk, meta_line);
+ ppa = addr_to_gen_ppa(pblk, paddr, 0);
+ ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
+ pos_opt = pblk_ppa_to_pos(geo, ppa_opt);
+
+ if (test_bit(pos_opt, data_c_ctx->lun_bitmap) ||
+ test_bit(pos_opt, data_line->blk_bitmap))
+ return true;
+
+ if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
+ data_line->meta_distance--;
+
+ return false;
+}
+
+static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
+ struct nvm_rq *data_rqd)
{
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -449,57 +420,45 @@ static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list,
retry:
if (list_empty(&l_mg->emeta_list)) {
spin_unlock(&l_mg->close_lock);
- return 0;
+ return NULL;
}
meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list);
- if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line))
+ if (meta_line->emeta->mem >= lm->emeta_len[0])
goto retry;
spin_unlock(&l_mg->close_lock);
- if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n))
- return 0;
+ if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd))
+ return NULL;
- return pblk_submit_meta_io(pblk, meta_line);
+ return meta_line;
}
static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
{
- struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
struct ppa_addr erase_ppa;
+ struct pblk_line *meta_line;
int err;
ppa_set_empty(&erase_ppa);
/* Assign lbas to ppas and populate request structure */
- err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa);
+ err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
if (err) {
pr_err("pblk: could not setup write request: %d\n", err);
return NVM_IO_ERR;
}
- if (likely(ppa_empty(erase_ppa))) {
- /* Submit metadata write for previous data line */
- err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas);
- if (err) {
- pr_err("pblk: metadata I/O submission failed: %d", err);
- return NVM_IO_ERR;
- }
+ meta_line = pblk_should_submit_meta_io(pblk, rqd);
- /* Submit data write for current data line */
- err = pblk_submit_io(pblk, rqd);
- if (err) {
- pr_err("pblk: data I/O submission failed: %d\n", err);
- return NVM_IO_ERR;
- }
- } else {
- /* Submit data write for current data line */
- err = pblk_submit_io(pblk, rqd);
- if (err) {
- pr_err("pblk: data I/O submission failed: %d\n", err);
- return NVM_IO_ERR;
- }
+ /* Submit data write for current data line */
+ err = pblk_submit_io(pblk, rqd);
+ if (err) {
+ pr_err("pblk: data I/O submission failed: %d\n", err);
+ return NVM_IO_ERR;
+ }
- /* Submit available erase for next data line */
+ if (!ppa_empty(erase_ppa)) {
+ /* Submit erase for next data line */
if (pblk_blk_erase_async(pblk, erase_ppa)) {
struct pblk_line *e_line = pblk_line_get_erase(pblk);
struct nvm_tgt_dev *dev = pblk->dev;
@@ -512,6 +471,15 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
}
}
+ if (meta_line) {
+ /* Submit metadata write for previous data line */
+ err = pblk_submit_meta_io(pblk, meta_line);
+ if (err) {
+ pr_err("pblk: metadata I/O submission failed: %d", err);
+ return NVM_IO_ERR;
+ }
+ }
+
return NVM_IO_OK;
}
@@ -521,7 +489,8 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
struct bio *bio = rqd->bio;
if (c_ctx->nr_padded)
- pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded);
+ pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid,
+ c_ctx->nr_padded);
}
static int pblk_submit_write(struct pblk *pblk)
@@ -543,31 +512,24 @@ static int pblk_submit_write(struct pblk *pblk)
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
return 1;
- rqd = pblk_alloc_rqd(pblk, WRITE);
- if (IS_ERR(rqd)) {
- pr_err("pblk: cannot allocate write req.\n");
- return 1;
- }
-
- bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
- if (!bio) {
- pr_err("pblk: cannot allocate write bio\n");
- goto fail_free_rqd;
- }
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- rqd->bio = bio;
-
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
pr_err("pblk: bad buffer sync calculation\n");
- goto fail_put_bio;
+ return 1;
}
secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
- if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync,
+ bio = bio_alloc(GFP_KERNEL, secs_to_sync);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
+ rqd->bio = bio;
+
+ if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
secs_avail)) {
pr_err("pblk: corrupted write bio\n");
goto fail_put_bio;
@@ -586,8 +548,7 @@ fail_free_bio:
pblk_free_write_rqd(pblk, rqd);
fail_put_bio:
bio_put(bio);
-fail_free_rqd:
- pblk_free_rqd(pblk, rqd, WRITE);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
return 1;
}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 67e623b..9096103 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -40,10 +40,6 @@
#define PBLK_MAX_REQ_ADDRS (64)
#define PBLK_MAX_REQ_ADDRS_PW (6)
-#define PBLK_WS_POOL_SIZE (128)
-#define PBLK_META_POOL_SIZE (128)
-#define PBLK_READ_REQ_POOL_SIZE (1024)
-
#define PBLK_NR_CLOSE_JOBS (4)
#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
@@ -59,7 +55,15 @@
for ((i) = 0, rlun = &(pblk)->luns[0]; \
(i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
-#define ERASE 2 /* READ = 0, WRITE = 1 */
+/* Static pool sizes */
+#define PBLK_GEN_WS_POOL_SIZE (2)
+
+enum {
+ PBLK_READ = READ,
+ PBLK_WRITE = WRITE,/* Write from write buffer */
+ PBLK_WRITE_INT, /* Internal write - no write buffer */
+ PBLK_ERASE,
+};
enum {
/* IO Types */
@@ -95,6 +99,7 @@ enum {
};
#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
+#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS)
/* write buffer completion context */
struct pblk_c_ctx {
@@ -106,9 +111,10 @@ struct pblk_c_ctx {
unsigned int nr_padded;
};
-/* generic context */
+/* read context */
struct pblk_g_ctx {
void *private;
+ u64 lba;
};
/* Pad context */
@@ -207,6 +213,7 @@ struct pblk_lun {
struct pblk_gc_rq {
struct pblk_line *line;
void *data;
+ u64 paddr_list[PBLK_MAX_REQ_ADDRS];
u64 lba_list[PBLK_MAX_REQ_ADDRS];
int nr_secs;
int secs_to_gc;
@@ -231,7 +238,10 @@ struct pblk_gc {
struct timer_list gc_timer;
struct semaphore gc_sem;
- atomic_t inflight_gc;
+ atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */
+ atomic_t pipeline_gc; /* Number of lines in the GC pipeline -
+ * started reads to finished writes
+ */
int w_entries;
struct list_head w_list;
@@ -267,6 +277,7 @@ struct pblk_rl {
int rb_gc_max; /* Max buffer entries available for GC I/O */
int rb_gc_rsv; /* Reserved buffer entries for GC I/O */
int rb_state; /* Rate-limiter current state */
+ int rb_max_io; /* Maximum size for an I/O giving the config */
atomic_t rb_user_cnt; /* User I/O buffer counter */
atomic_t rb_gc_cnt; /* GC I/O buffer counter */
@@ -310,6 +321,7 @@ enum {
};
#define PBLK_MAGIC 0x70626c6b /*pblk*/
+#define SMETA_VERSION cpu_to_le16(1)
struct line_header {
__le32 crc;
@@ -618,15 +630,16 @@ struct pblk {
struct list_head compl_list;
- mempool_t *page_pool;
- mempool_t *line_ws_pool;
+ mempool_t *page_bio_pool;
+ mempool_t *gen_ws_pool;
mempool_t *rec_pool;
- mempool_t *g_rq_pool;
+ mempool_t *r_rq_pool;
mempool_t *w_rq_pool;
- mempool_t *line_meta_pool;
+ mempool_t *e_rq_pool;
struct workqueue_struct *close_wq;
struct workqueue_struct *bb_wq;
+ struct workqueue_struct *r_end_wq;
struct timer_list wtimer;
@@ -657,15 +670,15 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
struct pblk_w_ctx w_ctx, unsigned int pos);
void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
- struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
- unsigned int pos);
+ struct pblk_w_ctx w_ctx, struct pblk_line *line,
+ u64 paddr, unsigned int pos);
struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
void pblk_rb_flush(struct pblk_rb *rb);
void pblk_rb_sync_l2p(struct pblk_rb *rb);
unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
- struct bio *bio, unsigned int pos,
- unsigned int nr_entries, unsigned int count);
+ unsigned int pos, unsigned int nr_entries,
+ unsigned int count);
unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
struct list_head *list,
unsigned int max);
@@ -692,24 +705,23 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
/*
* pblk core
*/
-struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw);
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type);
void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write);
int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx);
-void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
-void pblk_wait_for_meta(struct pblk *pblk);
-struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
void pblk_discard(struct pblk *pblk, struct bio *bio);
void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
unsigned int nr_secs, unsigned int len,
int alloc_type, gfp_t gfp_mask);
struct pblk_line *pblk_line_get(struct pblk *pblk);
struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
-void pblk_line_replace_data(struct pblk *pblk);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
struct pblk_line *pblk_line_get_data(struct pblk *pblk);
@@ -719,19 +731,18 @@ int pblk_line_is_full(struct pblk_line *line);
void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line);
void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
-void pblk_line_close_meta_sync(struct pblk *pblk);
void pblk_line_close_ws(struct work_struct *work);
void pblk_pipeline_stop(struct pblk *pblk);
-void pblk_line_mark_bb(struct work_struct *work);
-void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
- void (*work)(struct work_struct *),
- struct workqueue_struct *wq);
+void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+ void (*work)(struct work_struct *), gfp_t gfp_mask,
+ struct workqueue_struct *wq);
u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
void *emeta_buf);
int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
void pblk_line_put(struct kref *ref);
+void pblk_line_put_wq(struct kref *ref);
struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line);
void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
@@ -745,7 +756,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
unsigned long *lun_bitmap);
-void pblk_end_bio_sync(struct bio *bio);
void pblk_end_io_sync(struct nvm_rq *rqd);
int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
int nr_pages);
@@ -760,7 +770,7 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
struct ppa_addr ppa, struct ppa_addr entry_line);
int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
- struct pblk_line *gc_line);
+ struct pblk_line *gc_line, u64 paddr);
void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
u64 *lba_list, int nr_secs);
void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
@@ -771,9 +781,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
*/
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
unsigned long flags);
-int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
- unsigned int nr_entries, unsigned int nr_rec_entries,
- struct pblk_line *gc_line, unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
/*
* pblk map
@@ -797,9 +805,7 @@ void pblk_write_should_kick(struct pblk *pblk);
*/
extern struct bio_set *pblk_bio_set;
int pblk_submit_read(struct pblk *pblk, struct bio *bio);
-int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
- unsigned int nr_secs, unsigned int *secs_to_gc,
- struct pblk_line *line);
+int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
/*
* pblk recovery
*/
@@ -815,7 +821,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
* pblk gc
*/
#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */
-#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */
+#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */
#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
@@ -824,7 +830,7 @@ void pblk_gc_exit(struct pblk *pblk);
void pblk_gc_should_start(struct pblk *pblk);
void pblk_gc_should_stop(struct pblk *pblk);
void pblk_gc_should_kick(struct pblk *pblk);
-void pblk_gc_kick(struct pblk *pblk);
+void pblk_gc_free_full_lines(struct pblk *pblk);
void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
int *gc_active);
int pblk_gc_sysfs_force(struct pblk *pblk, int force);
@@ -834,8 +840,8 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force);
*/
void pblk_rl_init(struct pblk_rl *rl, int budget);
void pblk_rl_free(struct pblk_rl *rl);
+void pblk_rl_update_rates(struct pblk_rl *rl);
int pblk_rl_high_thrs(struct pblk_rl *rl);
-int pblk_rl_low_thrs(struct pblk_rl *rl);
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
@@ -843,10 +849,9 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
-int pblk_rl_sysfs_rate_show(struct pblk_rl *rl);
+int pblk_rl_max_io(struct pblk_rl *rl);
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
-void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left);
int pblk_rl_is_limit(struct pblk_rl *rl);
/*
@@ -892,13 +897,7 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta)
static inline int pblk_line_vsc(struct pblk_line *line)
{
- int vsc;
-
- spin_lock(&line->lock);
- vsc = le32_to_cpu(*line->vsc);
- spin_unlock(&line->lock);
-
- return vsc;
+ return le32_to_cpu(*line->vsc);
}
#define NVM_MEM_PAGE_WRITE (8)
@@ -1140,7 +1139,7 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
flags = geo->plane_mode >> 1;
- if (type == WRITE)
+ if (type == PBLK_WRITE)
flags |= NVM_IO_SCRAMBLE_ENABLE;
return flags;
@@ -1200,7 +1199,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
}
-#endif
static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
struct ppa_addr *ppas, int nr_ppas)
@@ -1221,14 +1219,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
ppa->g.sec < geo->sec_per_pg)
continue;
-#ifdef CONFIG_NVM_DEBUG
print_ppa(ppa, "boundary", i);
-#endif
+
return 1;
}
return 0;
}
+static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct ppa_addr *ppa_list;
+
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+
+ if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (rqd->opcode == NVM_OP_PWRITE) {
+ struct pblk_line *line;
+ struct ppa_addr ppa;
+ int i;
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ ppa = ppa_list[i];
+ line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+
+ spin_lock(&line->lock);
+ if (line->state != PBLK_LINESTATE_OPEN) {
+ pr_err("pblk: bad ppa: line:%d,state:%d\n",
+ line->id, line->state);
+ WARN_ON(1);
+ spin_unlock(&line->lock);
+ return -EINVAL;
+ }
+ spin_unlock(&line->lock);
+ }
+ }
+
+ return 0;
+}
+#endif
+
static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
{
struct pblk_line_meta *lm = &pblk->lm;
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 0803563..a27d852 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -407,7 +407,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
finish_wait(&ca->set->bucket_wait, &w);
out:
- wake_up_process(ca->alloc_thread);
+ if (ca->alloc_thread)
+ wake_up_process(ca->alloc_thread);
trace_bcache_alloc(ca, reserve);
@@ -442,6 +443,11 @@ out:
b->prio = INITIAL_PRIO;
}
+ if (ca->set->avail_nbuckets > 0) {
+ ca->set->avail_nbuckets--;
+ bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+ }
+
return r;
}
@@ -449,6 +455,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
{
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
+
+ if (ca->set->avail_nbuckets < ca->set->nbuckets) {
+ ca->set->avail_nbuckets++;
+ bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+ }
}
void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -601,7 +612,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
/*
* If we had to allocate, we might race and not need to allocate the
- * second time we call find_data_bucket(). If we allocated a bucket but
+ * second time we call pick_data_bucket(). If we allocated a bucket but
* didn't use it, drop the refcount bch_bucket_alloc_set() took:
*/
if (KEY_PTRS(&alloc.key))
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index abd31e8..843877e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -185,6 +185,7 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -266,9 +267,6 @@ struct bcache_device {
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
- unsigned long sectors_dirty_last;
- long sectors_dirty_derivative;
-
struct bio_set *bio_split;
unsigned data_csum:1;
@@ -300,7 +298,7 @@ struct cached_dev {
struct semaphore sb_write_mutex;
/* Refcount on the cache set. Always nonzero when we're caching. */
- atomic_t count;
+ refcount_t count;
struct work_struct detach;
/*
@@ -363,12 +361,14 @@ struct cached_dev {
uint64_t writeback_rate_target;
int64_t writeback_rate_proportional;
- int64_t writeback_rate_derivative;
- int64_t writeback_rate_change;
+ int64_t writeback_rate_integral;
+ int64_t writeback_rate_integral_scaled;
+ int32_t writeback_rate_change;
unsigned writeback_rate_update_seconds;
- unsigned writeback_rate_d_term;
+ unsigned writeback_rate_i_term_inverse;
unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_minimum;
};
enum alloc_reserve {
@@ -582,6 +582,7 @@ struct cache_set {
uint8_t need_gc;
struct gc_stat gc_stats;
size_t nbuckets;
+ size_t avail_nbuckets;
struct task_struct *gc_thread;
/* Where in the btree gc currently is */
@@ -807,13 +808,13 @@ do { \
static inline void cached_dev_put(struct cached_dev *dc)
{
- if (atomic_dec_and_test(&dc->count))
+ if (refcount_dec_and_test(&dc->count))
schedule_work(&dc->detach);
}
static inline bool cached_dev_get(struct cached_dev *dc)
{
- if (!atomic_inc_not_zero(&dc->count))
+ if (!refcount_inc_not_zero(&dc->count))
return false;
/* Paired with the mb in cached_dev_attach */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 658c54b..11c5503 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1241,6 +1241,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
__bch_btree_mark_key(c, level, k);
}
+void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
+{
+ stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
+}
+
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
uint8_t stale = 0;
@@ -1652,9 +1657,8 @@ static void btree_gc_start(struct cache_set *c)
mutex_unlock(&c->bucket_lock);
}
-static size_t bch_btree_gc_finish(struct cache_set *c)
+static void bch_btree_gc_finish(struct cache_set *c)
{
- size_t available = 0;
struct bucket *b;
struct cache *ca;
unsigned i;
@@ -1691,6 +1695,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
}
rcu_read_unlock();
+ c->avail_nbuckets = 0;
for_each_cache(ca, c, i) {
uint64_t *i;
@@ -1712,18 +1717,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
- available++;
+ c->avail_nbuckets++;
}
}
mutex_unlock(&c->bucket_lock);
- return available;
}
static void bch_btree_gc(struct cache_set *c)
{
int ret;
- unsigned long available;
struct gc_stat stats;
struct closure writes;
struct btree_op op;
@@ -1746,14 +1749,14 @@ static void bch_btree_gc(struct cache_set *c)
pr_warn("gc failed!");
} while (ret);
- available = bch_btree_gc_finish(c);
+ bch_btree_gc_finish(c);
wake_up_allocators(c);
bch_time_stats_update(&c->btree_gc_time, start_time);
stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
- stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
+ bch_update_bucket_in_use(c, &stats);
memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
trace_bcache_gc_end(c);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 42204d6..d211e2c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -306,5 +306,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
struct bkey *, keybuf_pred_fn *);
-
+void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
#endif
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 965907c..ccfbea6 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -252,6 +252,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
static inline void closure_queue(struct closure *cl)
{
struct workqueue_struct *wq = cl->wq;
+ /**
+ * Changes made to closure, work_struct, or a couple of other structs
+ * may cause work.func not pointing to the right location.
+ */
+ BUILD_BUG_ON(offsetof(struct closure, fn)
+ != offsetof(struct work_struct, func));
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3475d66..3a7aed7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -27,12 +27,12 @@ struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *);
-static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+static unsigned cache_mode(struct cached_dev *dc)
{
return BDEV_CACHE_MODE(&dc->sb);
}
-static bool verify(struct cached_dev *dc, struct bio *bio)
+static bool verify(struct cached_dev *dc)
{
return dc->verify;
}
@@ -370,7 +370,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
struct cache_set *c = dc->disk.c;
- unsigned mode = cache_mode(dc, bio);
+ unsigned mode = cache_mode(dc);
unsigned sectors, congested = bch_get_congested(c);
struct task_struct *task = current;
struct io *i;
@@ -385,6 +385,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
op_is_write(bio_op(bio))))
goto skip;
+ /*
+ * Flag for bypass if the IO is for read-ahead or background,
+ * unless the read-ahead request is for metadata (eg, for gfs2).
+ */
+ if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+ !(bio->bi_opf & REQ_META))
+ goto skip;
+
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
pr_debug("skipping unaligned io");
@@ -463,6 +471,7 @@ struct search {
unsigned recoverable:1;
unsigned write:1;
unsigned read_dirty_data:1;
+ unsigned cache_missed:1;
unsigned long start_time;
@@ -649,6 +658,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->orig_bio = bio;
s->cache_miss = NULL;
+ s->cache_missed = 0;
s->d = d;
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
@@ -698,8 +708,16 @@ static void cached_dev_read_error(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
struct bio *bio = &s->bio.bio;
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- if (s->recoverable) {
+ /*
+ * If cache device is dirty (dc->has_dirty is non-zero), then
+ * recovery a failed read request from cached device may get a
+ * stale data back. So read failure recovery is only permitted
+ * when cache device is clean.
+ */
+ if (s->recoverable &&
+ (dc && !atomic_read(&dc->has_dirty))) {
/* Retry from the backing device: */
trace_bcache_read_retry(s->orig_bio);
@@ -740,7 +758,7 @@ static void cached_dev_read_done(struct closure *cl)
s->cache_miss = NULL;
}
- if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
+ if (verify(dc) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
bio_complete(s);
@@ -760,12 +778,12 @@ static void cached_dev_read_done_bh(struct closure *cl)
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s->iop.c, s->d,
- !s->cache_miss, s->iop.bypass);
+ !s->cache_missed, s->iop.bypass);
trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
if (s->iop.status)
continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
- else if (s->iop.bio || verify(dc, &s->bio.bio))
+ else if (s->iop.bio || verify(dc))
continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
else
continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@ -779,6 +797,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
struct bio *miss, *cache_bio;
+ s->cache_missed = 1;
+
if (s->cache_miss || s->iop.bypass) {
miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
@@ -892,7 +912,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
s->iop.bypass = true;
if (should_writeback(dc, s->orig_bio,
- cache_mode(dc, bio),
+ cache_mode(dc),
s->iop.bypass)) {
s->iop.bypass = false;
s->iop.writeback = true;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fc0a31b..b4d2892 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
static int bcache_major;
-static DEFINE_IDA(bcache_minor);
+static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
-#define BCACHE_MINORS 16 /* partition support */
+/* limitation of partitions number on single bcache device */
+#define BCACHE_MINORS 128
+/* limitation of bcache devices number on single system */
+#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
/* Superblock */
@@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
closure_get(&c->caching);
}
+static inline int first_minor_to_idx(int first_minor)
+{
+ return (first_minor/BCACHE_MINORS);
+}
+
+static inline int idx_to_first_minor(int idx)
+{
+ return (idx * BCACHE_MINORS);
+}
+
static void bcache_device_free(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d)
if (d->disk && d->disk->queue)
blk_cleanup_queue(d->disk->queue);
if (d->disk) {
- ida_simple_remove(&bcache_minor, d->disk->first_minor);
+ ida_simple_remove(&bcache_device_idx,
+ first_minor_to_idx(d->disk->first_minor));
put_disk(d->disk);
}
@@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
{
struct request_queue *q;
size_t n;
- int minor;
+ int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
@@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
if (!d->full_dirty_stripes)
return -ENOMEM;
- minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
- if (minor < 0)
- return minor;
-
- minor *= BCACHE_MINORS;
+ idx = ida_simple_get(&bcache_device_idx, 0,
+ BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
+ if (idx < 0)
+ return idx;
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS |
BIOSET_NEED_RESCUER)) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
- ida_simple_remove(&bcache_minor, minor);
+ ida_simple_remove(&bcache_device_idx, idx);
return -ENOMEM;
}
set_capacity(d->disk, sectors);
- snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
d->disk->major = bcache_major;
- d->disk->first_minor = minor;
+ d->disk->first_minor = idx_to_first_minor(idx);
d->disk->fops = &bcache_ops;
d->disk->private_data = d;
@@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
closure_init_stack(&cl);
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
- BUG_ON(atomic_read(&dc->count));
+ BUG_ON(refcount_read(&dc->count));
mutex_lock(&bch_register_lock);
@@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
* dc->c must be set before dc->count != 0 - paired with the mb in
* cached_dev_get()
*/
- atomic_set(&dc->count, 1);
+ refcount_set(&dc->count, 1);
/* Block writeback thread, but spawn it */
down_write(&dc->writeback_lock);
@@ -1028,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
- atomic_inc(&dc->count);
+ refcount_inc(&dc->count);
bch_writeback_queue(dc);
}
@@ -1129,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
if (ret)
return ret;
- set_capacity(dc->disk.disk,
- dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
-
dc->disk.disk->queue->backing_dev_info->ra_pages =
max(dc->disk.disk->queue->backing_dev_info->ra_pages,
q->backing_dev_info->ra_pages);
@@ -2085,6 +2095,7 @@ static void bcache_exit(void)
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
unregister_reboot_notifier(&reboot);
+ mutex_destroy(&bch_register_lock);
}
static int __init bcache_init(void)
@@ -2103,14 +2114,15 @@ static int __init bcache_init(void)
bcache_major = register_blkdev(0, "bcache");
if (bcache_major < 0) {
unregister_reboot_notifier(&reboot);
+ mutex_destroy(&bch_register_lock);
return bcache_major;
}
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
- sysfs_create_files(bcache_kobj, files) ||
bch_request_init() ||
- bch_debug_init(bcache_kobj))
+ bch_debug_init(bcache_kobj) ||
+ sysfs_create_files(bcache_kobj, files))
goto err;
return 0;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 234b2f5..b418409 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -82,8 +82,9 @@ rw_attribute(writeback_delay);
rw_attribute(writeback_rate);
rw_attribute(writeback_rate_update_seconds);
-rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_i_term_inverse);
rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_minimum);
read_attribute(writeback_rate_debug);
read_attribute(stripe_size);
@@ -131,15 +132,16 @@ SHOW(__bch_cached_dev)
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
var_print(writeback_rate_update_seconds);
- var_print(writeback_rate_d_term);
+ var_print(writeback_rate_i_term_inverse);
var_print(writeback_rate_p_term_inverse);
+ var_print(writeback_rate_minimum);
if (attr == &sysfs_writeback_rate_debug) {
char rate[20];
char dirty[20];
char target[20];
char proportional[20];
- char derivative[20];
+ char integral[20];
char change[20];
s64 next_io;
@@ -147,7 +149,7 @@ SHOW(__bch_cached_dev)
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
bch_hprint(proportional,dc->writeback_rate_proportional << 9);
- bch_hprint(derivative, dc->writeback_rate_derivative << 9);
+ bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
bch_hprint(change, dc->writeback_rate_change << 9);
next_io = div64_s64(dc->writeback_rate.next - local_clock(),
@@ -158,11 +160,11 @@ SHOW(__bch_cached_dev)
"dirty:\t\t%s\n"
"target:\t\t%s\n"
"proportional:\t%s\n"
- "derivative:\t%s\n"
+ "integral:\t%s\n"
"change:\t\t%s/sec\n"
"next io:\t%llims\n",
rate, dirty, target, proportional,
- derivative, change, next_io);
+ integral, change, next_io);
}
sysfs_hprint(dirty_data,
@@ -214,7 +216,7 @@ STORE(__cached_dev)
dc->writeback_rate.rate, 1, INT_MAX);
d_strtoul_nonzero(writeback_rate_update_seconds);
- d_strtoul(writeback_rate_d_term);
+ d_strtoul(writeback_rate_i_term_inverse);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
d_strtoi_h(sequential_cutoff);
@@ -320,7 +322,7 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_percent,
&sysfs_writeback_rate,
&sysfs_writeback_rate_update_seconds,
- &sysfs_writeback_rate_d_term,
+ &sysfs_writeback_rate_i_term_inverse,
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_debug,
&sysfs_dirty_data,
@@ -746,6 +748,11 @@ static struct attribute *bch_cache_set_internal_files[] = {
};
KTYPE(bch_cache_set_internal);
+static int __bch_cache_cmp(const void *l, const void *r)
+{
+ return *((uint16_t *)r) - *((uint16_t *)l);
+}
+
SHOW(__bch_cache)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -770,9 +777,6 @@ SHOW(__bch_cache)
CACHE_REPLACEMENT(&ca->sb));
if (attr == &sysfs_priority_stats) {
- int cmp(const void *l, const void *r)
- { return *((uint16_t *) r) - *((uint16_t *) l); }
-
struct bucket *b;
size_t n = ca->sb.nbuckets, i;
size_t unused = 0, available = 0, dirty = 0, meta = 0;
@@ -801,7 +805,7 @@ SHOW(__bch_cache)
p[i] = ca->buckets[i].prio;
mutex_unlock(&ca->set->bucket_lock);
- sort(p, n, sizeof(uint16_t), cmp, NULL);
+ sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL);
while (n &&
!cached[n - 1])
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 176d3c2..e548b8b 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
d->next += div_u64(done * NSEC_PER_SEC, d->rate);
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
+ /* Bound the time. Don't let us fall further than 2 seconds behind
+ * (this prevents unnecessary backlog that would make it impossible
+ * to catch up). If we're ahead of the desired writeback rate,
+ * don't let us sleep more than 2.5 seconds (so we can notice/respond
+ * if the control system tells us to speed up!).
+ */
+ if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next))
+ d->next = now + NSEC_PER_SEC * 5LLU / 2LLU;
if (time_after64(now - NSEC_PER_SEC * 2, d->next))
d->next = now - NSEC_PER_SEC * 2;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f54b582..ed5e8a4 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,10 +442,10 @@ struct bch_ratelimit {
uint64_t next;
/*
- * Rate at which we want to do work, in units per nanosecond
+ * Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- unsigned rate;
+ uint32_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 70454f2..56a3788 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -26,48 +26,63 @@ static void __update_writeback_rate(struct cached_dev *dc)
bcache_flash_devs_sectors_dirty(c);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
-
int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
c->cached_dev_sectors);
- /* PD controller */
-
+ /*
+ * PI controller:
+ * Figures out the amount that should be written per second.
+ *
+ * First, the error (number of sectors that are dirty beyond our
+ * target) is calculated. The error is accumulated (numerically
+ * integrated).
+ *
+ * Then, the proportional value and integral value are scaled
+ * based on configured values. These are stored as inverses to
+ * avoid fixed point math and to make configuration easy-- e.g.
+ * the default value of 40 for writeback_rate_p_term_inverse
+ * attempts to write at a rate that would retire all the dirty
+ * blocks in 40 seconds.
+ *
+ * The writeback_rate_i_inverse value of 10000 means that 1/10000th
+ * of the error is accumulated in the integral term per second.
+ * This acts as a slow, long-term average that is not subject to
+ * variations in usage like the p term.
+ */
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
- int64_t derivative = dirty - dc->disk.sectors_dirty_last;
- int64_t proportional = dirty - target;
- int64_t change;
-
- dc->disk.sectors_dirty_last = dirty;
-
- /* Scale to sectors per second */
-
- proportional *= dc->writeback_rate_update_seconds;
- proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
-
- derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
-
- derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
- (dc->writeback_rate_d_term /
- dc->writeback_rate_update_seconds) ?: 1, 0);
-
- derivative *= dc->writeback_rate_d_term;
- derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
-
- change = proportional + derivative;
+ int64_t error = dirty - target;
+ int64_t proportional_scaled =
+ div_s64(error, dc->writeback_rate_p_term_inverse);
+ int64_t integral_scaled;
+ uint32_t new_rate;
+
+ if ((error < 0 && dc->writeback_rate_integral > 0) ||
+ (error > 0 && time_before64(local_clock(),
+ dc->writeback_rate.next + NSEC_PER_MSEC))) {
+ /*
+ * Only decrease the integral term if it's more than
+ * zero. Only increase the integral term if the device
+ * is keeping up. (Don't wind up the integral
+ * ineffectively in either case).
+ *
+ * It's necessary to scale this by
+ * writeback_rate_update_seconds to keep the integral
+ * term dimensioned properly.
+ */
+ dc->writeback_rate_integral += error *
+ dc->writeback_rate_update_seconds;
+ }
- /* Don't increase writeback rate if the device isn't keeping up */
- if (change > 0 &&
- time_after64(local_clock(),
- dc->writeback_rate.next + NSEC_PER_MSEC))
- change = 0;
+ integral_scaled = div_s64(dc->writeback_rate_integral,
+ dc->writeback_rate_i_term_inverse);
- dc->writeback_rate.rate =
- clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
- 1, NSEC_PER_MSEC);
+ new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
+ dc->writeback_rate_minimum, NSEC_PER_SEC);
- dc->writeback_rate_proportional = proportional;
- dc->writeback_rate_derivative = derivative;
- dc->writeback_rate_change = change;
+ dc->writeback_rate_proportional = proportional_scaled;
+ dc->writeback_rate_integral_scaled = integral_scaled;
+ dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
+ dc->writeback_rate.rate = new_rate;
dc->writeback_rate_target = target;
}
@@ -180,13 +195,21 @@ static void write_dirty(struct closure *cl)
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
- io->bio.bi_iter.bi_sector = KEY_START(&w->key);
- bio_set_dev(&io->bio, io->dc->bdev);
- io->bio.bi_end_io = dirty_endio;
+ /*
+ * IO errors are signalled using the dirty bit on the key.
+ * If we failed to read, we should not attempt to write to the
+ * backing device. Instead, immediately go to write_dirty_finish
+ * to clean up.
+ */
+ if (KEY_DIRTY(&w->key)) {
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+ bio_set_dev(&io->bio, io->dc->bdev);
+ io->bio.bi_end_io = dirty_endio;
- closure_bio_submit(&io->bio, cl);
+ closure_bio_submit(&io->bio, cl);
+ }
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
@@ -418,6 +441,8 @@ static int bch_writeback_thread(void *arg)
struct cached_dev *dc = arg;
bool searched_full_index;
+ bch_ratelimit_reset(&dc->writeback_rate);
+
while (!kthread_should_stop()) {
down_write(&dc->writeback_lock);
if (!atomic_read(&dc->has_dirty) ||
@@ -445,7 +470,6 @@ static int bch_writeback_thread(void *arg)
up_write(&dc->writeback_lock);
- bch_ratelimit_reset(&dc->writeback_rate);
read_dirty(dc);
if (searched_full_index) {
@@ -455,6 +479,8 @@ static int bch_writeback_thread(void *arg)
!kthread_should_stop() &&
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
delay = schedule_timeout_interruptible(delay);
+
+ bch_ratelimit_reset(&dc->writeback_rate);
}
}
@@ -492,8 +518,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
sectors_dirty_init_fn, 0);
-
- d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -507,10 +531,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_percent = 10;
dc->writeback_delay = 30;
dc->writeback_rate.rate = 1024;
+ dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = 5;
- dc->writeback_rate_d_term = 30;
- dc->writeback_rate_p_term_inverse = 6000;
+ dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_i_term_inverse = 10000;
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 1515447..a9e3ffb 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -77,7 +77,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
if (would_skip)
return false;
- return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+ return (op_is_sync(bio->bi_opf) ||
+ bio->bi_opf & (REQ_META|REQ_PRIO) ||
+ in_use <= CUTOFF_WRITEBACK);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
@@ -90,7 +92,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
{
if (!atomic_read(&dc->has_dirty) &&
!atomic_xchg(&dc->has_dirty, 1)) {
- atomic_inc(&dc->count);
+ refcount_inc(&dc->count);
if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index d212163..4d8ed74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index,
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
- bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
+ bh = alloc_page_buffers(page, 1<<inode->i_blkbits, false);
if (!bh) {
ret = -ENOMEM;
goto out;
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index eadfcfd..9d32f25 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
int dm_request_based(struct mapped_device *md)
{
- return blk_queue_stackable(md->queue);
+ return queue_is_rq_based(md->queue);
}
static void dm_old_start_queue(struct request_queue *q)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ef7b8f2..7528182 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1000,7 +1000,7 @@ verify_rq_based:
list_for_each_entry(dd, devices, list) {
struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
- if (!blk_queue_stackable(q)) {
+ if (!queue_is_rq_based(q)) {
DMERR("table load rejected: including"
" non-request-stackable devices");
return -EINVAL;
@@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
*/
if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
-
- /*
- * QUEUE_FLAG_STACKABLE must be set after all queue settings are
- * visible to other CPUs because, once the flag is set, incoming bios
- * are processed by request-based dm, which refers to the queue
- * settings.
- * Until the flag set, bios are passed to bio-based dm and queued to
- * md->deferred where queue settings are not needed yet.
- * Those bios are passed to request-based dm at the resume time.
- */
- smp_mb();
- if (dm_table_request_based(t))
- queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8aaffa1..a3f8cbb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1619,17 +1619,6 @@ static void dm_wq_work(struct work_struct *work);
void dm_init_md_queue(struct mapped_device *md)
{
/*
- * Request-based dm devices cannot be stacked on top of bio-based dm
- * devices. The type of this dm device may not have been decided yet.
- * The type is decided at the first table loading time.
- * To prevent problematic device stacking, clear the queue flag
- * for request stacking support until then.
- *
- * This queue is new, so no concurrency on the queue_flags.
- */
- queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
-
- /*
* Initialize data that will only be used by a non-blk-mq DM queue
* - must do so here (in alloc_dev callchain) before queue is used
*/
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index b7c78a5..04008e0 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,2 +1,6 @@
+menu "NVME Support"
+
source "drivers/nvme/host/Kconfig"
source "drivers/nvme/target/Kconfig"
+
+endmenu
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 46d6cb1..b979cf3 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -13,6 +13,15 @@ config BLK_DEV_NVME
To compile this driver as a module, choose M here: the
module will be called nvme.
+config NVME_MULTIPATH
+ bool "NVMe multipath support"
+ depends on NVME_CORE
+ ---help---
+ This option enables support for multipath access to NVMe
+ subsystems. If this option is enabled only a single
+ /dev/nvmeXnY device will show up for each NVMe namespaces,
+ even if it is accessible through multiple controllers.
+
config NVME_FABRICS
tristate
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 7b96e45..a25fd43 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
obj-$(CONFIG_NVME_FC) += nvme-fc.o
nvme-core-y := core.o
+nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-y += pci.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 37f9039..25da74d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -34,13 +34,13 @@
#define NVME_MINORS (1U << MINORBITS)
-unsigned char admin_timeout = 60;
-module_param(admin_timeout, byte, 0644);
+unsigned int admin_timeout = 60;
+module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);
-unsigned char nvme_io_timeout = 30;
-module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
+unsigned int nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);
@@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-static int nvme_char_major;
-module_param(nvme_char_major, int, 0);
-
static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
@@ -71,10 +68,17 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
-static LIST_HEAD(nvme_ctrl_list);
-static DEFINE_SPINLOCK(dev_list_lock);
+static DEFINE_IDA(nvme_subsystems_ida);
+static LIST_HEAD(nvme_subsystems);
+static DEFINE_MUTEX(nvme_subsystems_lock);
+static DEFINE_IDA(nvme_instance_ida);
+static dev_t nvme_chr_devt;
static struct class *nvme_class;
+static struct class *nvme_subsys_class;
+
+static void nvme_ns_remove(struct nvme_ns *ns);
+static int nvme_revalidate_disk(struct gendisk *disk);
static __le32 nvme_get_log_dw10(u8 lid, size_t size)
{
@@ -101,6 +105,51 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
return ret;
}
+static void nvme_delete_ctrl_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(work, struct nvme_ctrl, delete_work);
+
+ flush_work(&ctrl->reset_work);
+ nvme_stop_ctrl(ctrl);
+ nvme_remove_namespaces(ctrl);
+ ctrl->ops->delete_ctrl(ctrl);
+ nvme_uninit_ctrl(ctrl);
+ nvme_put_ctrl(ctrl);
+}
+
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+ return -EBUSY;
+ if (!queue_work(nvme_wq, &ctrl->delete_work))
+ return -EBUSY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
+
+int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+{
+ int ret = 0;
+
+ /*
+ * Keep a reference until the work is flushed since ->delete_ctrl
+ * can free the controller.
+ */
+ nvme_get_ctrl(ctrl);
+ ret = nvme_delete_ctrl(ctrl);
+ if (!ret)
+ flush_work(&ctrl->delete_work);
+ nvme_put_ctrl(ctrl);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
+
+static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+{
+ return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
+}
+
static blk_status_t nvme_error_status(struct request *req)
{
switch (nvme_req(req)->status & 0x7ff) {
@@ -142,9 +191,16 @@ static inline bool nvme_req_needs_retry(struct request *req)
void nvme_complete_rq(struct request *req)
{
if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
- nvme_req(req)->retries++;
- blk_mq_requeue_request(req, true);
- return;
+ if (nvme_req_needs_failover(req)) {
+ nvme_failover_req(req);
+ return;
+ }
+
+ if (!blk_queue_dying(req->q)) {
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req, true);
+ return;
+ }
}
blk_mq_end_request(req, nvme_error_status(req));
@@ -153,18 +209,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
- int status;
-
if (!blk_mq_request_started(req))
return;
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
- status = NVME_SC_ABORT_REQ;
- if (blk_queue_dying(req->q))
- status |= NVME_SC_DNR;
- nvme_req(req)->status = status;
+ nvme_req(req)->status = NVME_SC_ABORT_REQ;
blk_mq_complete_request(req);
}
@@ -205,6 +256,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RECONNECTING:
switch (old_state) {
case NVME_CTRL_LIVE:
+ case NVME_CTRL_RESETTING:
changed = true;
/* FALLTHRU */
default:
@@ -239,11 +291,29 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
ctrl->state = new_state;
spin_unlock_irqrestore(&ctrl->lock, flags);
-
+ if (changed && ctrl->state == NVME_CTRL_LIVE)
+ nvme_kick_requeue_lists(ctrl);
return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
+static void nvme_free_ns_head(struct kref *ref)
+{
+ struct nvme_ns_head *head =
+ container_of(ref, struct nvme_ns_head, ref);
+
+ nvme_mpath_remove_disk(head);
+ ida_simple_remove(&head->subsys->ns_ida, head->instance);
+ list_del_init(&head->entry);
+ cleanup_srcu_struct(&head->srcu);
+ kfree(head);
+}
+
+static void nvme_put_ns_head(struct nvme_ns_head *head)
+{
+ kref_put(&head->ref, nvme_free_ns_head);
+}
+
static void nvme_free_ns(struct kref *kref)
{
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
@@ -251,14 +321,8 @@ static void nvme_free_ns(struct kref *kref)
if (ns->ndev)
nvme_nvm_unregister(ns);
- if (ns->disk) {
- spin_lock(&dev_list_lock);
- ns->disk->private_data = NULL;
- spin_unlock(&dev_list_lock);
- }
-
put_disk(ns->disk);
- ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+ nvme_put_ns_head(ns->head);
nvme_put_ctrl(ns->ctrl);
kfree(ns);
}
@@ -268,31 +332,8 @@ static void nvme_put_ns(struct nvme_ns *ns)
kref_put(&ns->kref, nvme_free_ns);
}
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
-{
- struct nvme_ns *ns;
-
- spin_lock(&dev_list_lock);
- ns = disk->private_data;
- if (ns) {
- if (!kref_get_unless_zero(&ns->kref))
- goto fail;
- if (!try_module_get(ns->ctrl->ops->module))
- goto fail_put_ns;
- }
- spin_unlock(&dev_list_lock);
-
- return ns;
-
-fail_put_ns:
- kref_put(&ns->kref, nvme_free_ns);
-fail:
- spin_unlock(&dev_list_lock);
- return NULL;
-}
-
struct request *nvme_alloc_request(struct request_queue *q,
- struct nvme_command *cmd, unsigned int flags, int qid)
+ struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
{
unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
struct request *req;
@@ -417,7 +458,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
{
memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
- cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
@@ -448,7 +489,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->dsm.nr = cpu_to_le32(segments - 1);
cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
@@ -467,16 +508,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
u16 control = 0;
u32 dsmgmt = 0;
- /*
- * If formated with metadata, require the block layer provide a buffer
- * unless this namespace is formated such that the metadata can be
- * stripped/generated by the controller with PRACT=1.
- */
- if (ns && ns->ms &&
- (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
- !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
- return BLK_STS_NOTSUPP;
-
if (req->cmd_flags & REQ_FUA)
control |= NVME_RW_FUA;
if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
@@ -487,7 +518,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
memset(cmnd, 0, sizeof(*cmnd));
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
- cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -495,6 +526,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
if (ns->ms) {
+ /*
+ * If formated with metadata, the block layer always provides a
+ * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
+ * we enable the PRACT bit for protection information or set the
+ * namespace capacity to zero to prevent any I/O.
+ */
+ if (!blk_integrity_rq(req)) {
+ if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+ return BLK_STS_NOTSUPP;
+ control |= NVME_RW_PRINFO_PRACT;
+ }
+
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
@@ -507,8 +550,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
nvme_block_nr(ns, blk_rq_pos(req)));
break;
}
- if (!blk_integrity_rq(req))
- control |= NVME_RW_PRINFO_PRACT;
}
cmnd->rw.control = cpu_to_le16(control);
@@ -560,7 +601,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- unsigned timeout, int qid, int at_head, int flags)
+ unsigned timeout, int qid, int at_head,
+ blk_mq_req_flags_t flags)
{
struct request *req;
int ret;
@@ -778,7 +820,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
}
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
- u8 *eui64, u8 *nguid, uuid_t *uuid)
+ struct nvme_ns_ids *ids)
{
struct nvme_command c = { };
int status;
@@ -814,7 +856,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
goto free_data;
}
len = NVME_NIDT_EUI64_LEN;
- memcpy(eui64, data + pos + sizeof(*cur), len);
+ memcpy(ids->eui64, data + pos + sizeof(*cur), len);
break;
case NVME_NIDT_NGUID:
if (cur->nidl != NVME_NIDT_NGUID_LEN) {
@@ -824,7 +866,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
goto free_data;
}
len = NVME_NIDT_NGUID_LEN;
- memcpy(nguid, data + pos + sizeof(*cur), len);
+ memcpy(ids->nguid, data + pos + sizeof(*cur), len);
break;
case NVME_NIDT_UUID:
if (cur->nidl != NVME_NIDT_UUID_LEN) {
@@ -834,7 +876,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
goto free_data;
}
len = NVME_NIDT_UUID_LEN;
- uuid_copy(uuid, data + pos + sizeof(*cur));
+ uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
break;
default:
/* Skip unnkown types */
@@ -968,7 +1010,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
memset(&c, 0, sizeof(c));
c.rw.opcode = io.opcode;
c.rw.flags = io.flags;
- c.rw.nsid = cpu_to_le32(ns->ns_id);
+ c.rw.nsid = cpu_to_le32(ns->head->ns_id);
c.rw.slba = cpu_to_le64(io.slba);
c.rw.length = cpu_to_le16(io.nblocks);
c.rw.control = cpu_to_le16(io.control);
@@ -982,12 +1024,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
metadata, meta_len, io.slba, NULL, 0);
}
+static u32 nvme_known_admin_effects(u8 opcode)
+{
+ switch (opcode) {
+ case nvme_admin_format_nvm:
+ return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+ NVME_CMD_EFFECTS_CSE_MASK;
+ case nvme_admin_sanitize_nvm:
+ return NVME_CMD_EFFECTS_CSE_MASK;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ u8 opcode)
+{
+ u32 effects = 0;
+
+ if (ns) {
+ if (ctrl->effects)
+ effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+ if (effects & ~NVME_CMD_EFFECTS_CSUPP)
+ dev_warn(ctrl->device,
+ "IO command:%02x has unhandled effects:%08x\n",
+ opcode, effects);
+ return 0;
+ }
+
+ if (ctrl->effects)
+ effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+ else
+ effects = nvme_known_admin_effects(opcode);
+
+ /*
+ * For simplicity, IO to all namespaces is quiesced even if the command
+ * effects say only one namespace is affected.
+ */
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ nvme_start_freeze(ctrl);
+ nvme_wait_freeze(ctrl);
+ }
+ return effects;
+}
+
+static void nvme_update_formats(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (ns->disk && nvme_revalidate_disk(ns->disk))
+ nvme_ns_remove(ns);
+ }
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+{
+ /*
+ * Revalidate LBA changes prior to unfreezing. This is necessary to
+ * prevent memory corruption if a logical block size was changed by
+ * this command.
+ */
+ if (effects & NVME_CMD_EFFECTS_LBCC)
+ nvme_update_formats(ctrl);
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
+ nvme_unfreeze(ctrl);
+ if (effects & NVME_CMD_EFFECTS_CCC)
+ nvme_init_identify(ctrl);
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
+ nvme_queue_scan(ctrl);
+}
+
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
+ u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@@ -1013,10 +1130,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
+ effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
(void __user *)(uintptr_t)cmd.metadata, cmd.metadata,
0, &cmd.result, timeout);
+ nvme_passthru_end(ctrl, effects);
+
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
return -EFAULT;
@@ -1025,15 +1145,37 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return status;
}
-static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+/*
+ * Issue ioctl requests on the first available path. Note that unlike normal
+ * block layer requests we will not retry failed request on another controller.
+ */
+static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+ struct nvme_ns_head **head, int *srcu_idx)
{
- struct nvme_ns *ns = bdev->bd_disk->private_data;
+#ifdef CONFIG_NVME_MULTIPATH
+ if (disk->fops == &nvme_ns_head_ops) {
+ *head = disk->private_data;
+ *srcu_idx = srcu_read_lock(&(*head)->srcu);
+ return nvme_find_path(*head);
+ }
+#endif
+ *head = NULL;
+ *srcu_idx = -1;
+ return disk->private_data;
+}
+static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+{
+ if (head)
+ srcu_read_unlock(&head->srcu, idx);
+}
+
+static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
+{
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
- return ns->ns_id;
+ return ns->head->ns_id;
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
case NVME_IOCTL_IO_CMD:
@@ -1052,27 +1194,39 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
}
}
-#ifdef CONFIG_COMPAT
-static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
{
- return nvme_ioctl(bdev, mode, cmd, arg);
+ struct nvme_ns_head *head = NULL;
+ struct nvme_ns *ns;
+ int srcu_idx, ret;
+
+ ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ ret = -EWOULDBLOCK;
+ else
+ ret = nvme_ns_ioctl(ns, cmd, arg);
+ nvme_put_ns_from_disk(head, srcu_idx);
+ return ret;
}
-#else
-#define nvme_compat_ioctl NULL
-#endif
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
- return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+#ifdef CONFIG_NVME_MULTIPATH
+ /* should never be called due to GENHD_FL_HIDDEN */
+ if (WARN_ON_ONCE(ns->head->disk))
+ return -ENXIO;
+#endif
+ if (!kref_get_unless_zero(&ns->kref))
+ return -ENXIO;
+ return 0;
}
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
- struct nvme_ns *ns = disk->private_data;
-
- module_put(ns->ctrl->ops->module);
- nvme_put_ns(ns);
+ nvme_put_ns(disk->private_data);
}
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -1085,35 +1239,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
- u16 bs)
-{
- struct nvme_ns *ns = disk->private_data;
- u16 old_ms = ns->ms;
- u8 pi_type = 0;
-
- ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
-
- /* PI implementation requires metadata equal t10 pi tuple size */
- if (ns->ms == sizeof(struct t10_pi_tuple))
- pi_type = id->dps & NVME_NS_DPS_PI_MASK;
-
- if (blk_get_integrity(disk) &&
- (ns->pi_type != pi_type || ns->ms != old_ms ||
- bs != queue_logical_block_size(disk->queue) ||
- (ns->ms && ns->ext)))
- blk_integrity_unregister(disk);
-
- ns->pi_type = pi_type;
-}
-
-static void nvme_init_integrity(struct nvme_ns *ns)
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
{
struct blk_integrity integrity;
memset(&integrity, 0, sizeof(integrity));
- switch (ns->pi_type) {
+ switch (pi_type) {
case NVME_NS_DPS_PI_TYPE3:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1129,16 +1260,12 @@ static void nvme_init_integrity(struct nvme_ns *ns)
integrity.profile = NULL;
break;
}
- integrity.tuple_size = ns->ms;
- blk_integrity_register(ns->disk, &integrity);
- blk_queue_max_integrity_segments(ns->queue, 1);
+ integrity.tuple_size = ms;
+ blk_integrity_register(disk, &integrity);
+ blk_queue_max_integrity_segments(disk->queue, 1);
}
#else
-static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
- u16 bs)
-{
-}
-static void nvme_init_integrity(struct nvme_ns *ns)
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -1149,53 +1276,89 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}
-static void nvme_config_discard(struct nvme_ns *ns)
+static void nvme_config_discard(struct nvme_ctrl *ctrl,
+ unsigned stream_alignment, struct request_queue *queue)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
- u32 logical_block_size = queue_logical_block_size(ns->queue);
+ u32 size = queue_logical_block_size(queue);
+
+ if (stream_alignment)
+ size *= stream_alignment;
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- if (ctrl->nr_streams && ns->sws && ns->sgs) {
- unsigned int sz = logical_block_size * ns->sws * ns->sgs;
+ queue->limits.discard_alignment = size;
+ queue->limits.discard_granularity = size;
- ns->queue->limits.discard_alignment = sz;
- ns->queue->limits.discard_granularity = sz;
- } else {
- ns->queue->limits.discard_alignment = logical_block_size;
- ns->queue->limits.discard_granularity = logical_block_size;
- }
- blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
- blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+ blk_queue_max_discard_sectors(queue, UINT_MAX);
+ blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
- blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
+ blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
- struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid)
+ struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
+ memset(ids, 0, sizeof(*ids));
+
if (ctrl->vs >= NVME_VS(1, 1, 0))
- memcpy(eui64, id->eui64, sizeof(id->eui64));
+ memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
- memcpy(nguid, id->nguid, sizeof(id->nguid));
+ memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
if (ctrl->vs >= NVME_VS(1, 3, 0)) {
/* Don't treat error as fatal we potentially
* already have a NGUID or EUI-64
*/
- if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid))
+ if (nvme_identify_ns_descs(ctrl, nsid, ids))
dev_warn(ctrl->device,
"%s: Identify Descriptors failed\n", __func__);
}
}
+static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
+{
+ return !uuid_is_null(&ids->uuid) ||
+ memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
+ memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
+}
+
+static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
+{
+ return uuid_equal(&a->uuid, &b->uuid) &&
+ memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+}
+
+static void nvme_update_disk_info(struct gendisk *disk,
+ struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+ unsigned stream_alignment = 0;
+
+ if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
+ stream_alignment = ns->sws * ns->sgs;
+
+ blk_mq_freeze_queue(disk->queue);
+ blk_integrity_unregister(disk);
+
+ blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
+ if (ns->ms && !ns->ext &&
+ (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ nvme_init_integrity(disk, ns->ms, ns->pi_type);
+ if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
+ capacity = 0;
+ set_capacity(disk, capacity);
+
+ if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+}
+
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
- struct nvme_ctrl *ctrl = ns->ctrl;
- u16 bs;
/*
* If identify namespace failed, use default 512 byte block size so
@@ -1204,26 +1367,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
- bs = 1 << ns->lba_shift;
ns->noiob = le16_to_cpu(id->noiob);
+ ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+ ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ /* the PI implementation requires metadata equal t10 pi tuple size */
+ if (ns->ms == sizeof(struct t10_pi_tuple))
+ ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ else
+ ns->pi_type = 0;
- blk_mq_freeze_queue(disk->queue);
-
- if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
- nvme_prep_integrity(disk, id, bs);
- blk_queue_logical_block_size(ns->queue, bs);
if (ns->noiob)
nvme_set_chunk_size(ns);
- if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
- nvme_init_integrity(ns);
- if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
- set_capacity(disk, 0);
- else
- set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
- nvme_config_discard(ns);
- blk_mq_unfreeze_queue(disk->queue);
+ nvme_update_disk_info(disk, ns, id);
+#ifdef CONFIG_NVME_MULTIPATH
+ if (ns->head->disk)
+ nvme_update_disk_info(ns->head->disk, ns, id);
+#endif
}
static int nvme_revalidate_disk(struct gendisk *disk)
@@ -1231,8 +1390,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_id_ns *id;
- u8 eui64[8] = { 0 }, nguid[16] = { 0 };
- uuid_t uuid = uuid_null;
+ struct nvme_ns_ids ids;
int ret = 0;
if (test_bit(NVME_NS_DEAD, &ns->flags)) {
@@ -1240,7 +1398,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
return -ENODEV;
}
- id = nvme_identify_ns(ctrl, ns->ns_id);
+ id = nvme_identify_ns(ctrl, ns->head->ns_id);
if (!id)
return -ENODEV;
@@ -1250,12 +1408,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
}
__nvme_revalidate_disk(disk, id);
- nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
- if (!uuid_equal(&ns->uuid, &uuid) ||
- memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
- memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
+ nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
dev_err(ctrl->device,
- "identifiers changed for nsid %d\n", ns->ns_id);
+ "identifiers changed for nsid %d\n", ns->head->ns_id);
ret = -ENODEV;
}
@@ -1287,8 +1443,10 @@ static char nvme_pr_type(enum pr_type type)
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
u64 key, u64 sa_key, u8 op)
{
- struct nvme_ns *ns = bdev->bd_disk->private_data;
+ struct nvme_ns_head *head = NULL;
+ struct nvme_ns *ns;
struct nvme_command c;
+ int srcu_idx, ret;
u8 data[16] = { 0, };
put_unaligned_le64(key, &data[0]);
@@ -1296,10 +1454,16 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
memset(&c, 0, sizeof(c));
c.common.opcode = op;
- c.common.nsid = cpu_to_le32(ns->ns_id);
+ c.common.nsid = cpu_to_le32(head->ns_id);
c.common.cdw10[0] = cpu_to_le32(cdw10);
- return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+ ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ ret = -EWOULDBLOCK;
+ else
+ ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+ nvme_put_ns_from_disk(head, srcu_idx);
+ return ret;
}
static int nvme_pr_register(struct block_device *bdev, u64 old,
@@ -1381,7 +1545,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit);
static const struct block_device_operations nvme_fops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
- .compat_ioctl = nvme_compat_ioctl,
+ .compat_ioctl = nvme_ioctl,
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
@@ -1389,6 +1553,32 @@ static const struct block_device_operations nvme_fops = {
.pr_ops = &nvme_pr_ops,
};
+#ifdef CONFIG_NVME_MULTIPATH
+static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
+{
+ struct nvme_ns_head *head = bdev->bd_disk->private_data;
+
+ if (!kref_get_unless_zero(&head->ref))
+ return -ENXIO;
+ return 0;
+}
+
+static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
+{
+ nvme_put_ns_head(disk->private_data);
+}
+
+const struct block_device_operations nvme_ns_head_ops = {
+ .owner = THIS_MODULE,
+ .open = nvme_ns_head_open,
+ .release = nvme_ns_head_release,
+ .ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_ioctl,
+ .getgeo = nvme_getgeo,
+ .pr_ops = &nvme_pr_ops,
+};
+#endif /* CONFIG_NVME_MULTIPATH */
+
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
unsigned long timeout =
@@ -1737,14 +1927,15 @@ static bool quirk_matches(const struct nvme_id_ctrl *id,
string_matches(id->fr, q->fr, sizeof(id->fr));
}
-static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
+ struct nvme_id_ctrl *id)
{
size_t nqnlen;
int off;
nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
- strcpy(ctrl->subnqn, id->subnqn);
+ strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
return;
}
@@ -1752,14 +1943,222 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
- off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE,
+ off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
"nqn.2014.08.org.nvmexpress:%4x%4x",
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
- memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn));
+ memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
off += sizeof(id->sn);
- memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn));
+ memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
off += sizeof(id->mn);
- memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off);
+ memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
+}
+
+static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
+{
+ ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
+ kfree(subsys);
+}
+
+static void nvme_release_subsystem(struct device *dev)
+{
+ __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
+}
+
+static void nvme_destroy_subsystem(struct kref *ref)
+{
+ struct nvme_subsystem *subsys =
+ container_of(ref, struct nvme_subsystem, ref);
+
+ mutex_lock(&nvme_subsystems_lock);
+ list_del(&subsys->entry);
+ mutex_unlock(&nvme_subsystems_lock);
+
+ ida_destroy(&subsys->ns_ida);
+ device_del(&subsys->dev);
+ put_device(&subsys->dev);
+}
+
+static void nvme_put_subsystem(struct nvme_subsystem *subsys)
+{
+ kref_put(&subsys->ref, nvme_destroy_subsystem);
+}
+
+static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
+{
+ struct nvme_subsystem *subsys;
+
+ lockdep_assert_held(&nvme_subsystems_lock);
+
+ list_for_each_entry(subsys, &nvme_subsystems, entry) {
+ if (strcmp(subsys->subnqn, subsysnqn))
+ continue;
+ if (!kref_get_unless_zero(&subsys->ref))
+ continue;
+ return subsys;
+ }
+
+ return NULL;
+}
+
+#define SUBSYS_ATTR_RO(_name, _mode, _show) \
+ struct device_attribute subsys_attr_##_name = \
+ __ATTR(_name, _mode, _show, NULL)
+
+static ssize_t nvme_subsys_show_nqn(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_subsystem *subsys =
+ container_of(dev, struct nvme_subsystem, dev);
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
+}
+static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
+
+#define nvme_subsys_show_str_function(field) \
+static ssize_t subsys_##field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct nvme_subsystem *subsys = \
+ container_of(dev, struct nvme_subsystem, dev); \
+ return sprintf(buf, "%.*s\n", \
+ (int)sizeof(subsys->field), subsys->field); \
+} \
+static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
+
+nvme_subsys_show_str_function(model);
+nvme_subsys_show_str_function(serial);
+nvme_subsys_show_str_function(firmware_rev);
+
+static struct attribute *nvme_subsys_attrs[] = {
+ &subsys_attr_model.attr,
+ &subsys_attr_serial.attr,
+ &subsys_attr_firmware_rev.attr,
+ &subsys_attr_subsysnqn.attr,
+ NULL,
+};
+
+static struct attribute_group nvme_subsys_attrs_group = {
+ .attrs = nvme_subsys_attrs,
+};
+
+static const struct attribute_group *nvme_subsys_attrs_groups[] = {
+ &nvme_subsys_attrs_group,
+ NULL,
+};
+
+static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ struct nvme_subsystem *subsys, *found;
+ int ret;
+
+ subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
+ if (!subsys)
+ return -ENOMEM;
+ ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
+ if (ret < 0) {
+ kfree(subsys);
+ return ret;
+ }
+ subsys->instance = ret;
+ mutex_init(&subsys->lock);
+ kref_init(&subsys->ref);
+ INIT_LIST_HEAD(&subsys->ctrls);
+ INIT_LIST_HEAD(&subsys->nsheads);
+ nvme_init_subnqn(subsys, ctrl, id);
+ memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
+ memcpy(subsys->model, id->mn, sizeof(subsys->model));
+ memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
+ subsys->vendor_id = le16_to_cpu(id->vid);
+ subsys->cmic = id->cmic;
+
+ subsys->dev.class = nvme_subsys_class;
+ subsys->dev.release = nvme_release_subsystem;
+ subsys->dev.groups = nvme_subsys_attrs_groups;
+ dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
+ device_initialize(&subsys->dev);
+
+ mutex_lock(&nvme_subsystems_lock);
+ found = __nvme_find_get_subsystem(subsys->subnqn);
+ if (found) {
+ /*
+ * Verify that the subsystem actually supports multiple
+ * controllers, else bail out.
+ */
+ if (!(id->cmic & (1 << 1))) {
+ dev_err(ctrl->device,
+ "ignoring ctrl due to duplicate subnqn (%s).\n",
+ found->subnqn);
+ nvme_put_subsystem(found);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ __nvme_release_subsystem(subsys);
+ subsys = found;
+ } else {
+ ret = device_add(&subsys->dev);
+ if (ret) {
+ dev_err(ctrl->device,
+ "failed to register subsystem device.\n");
+ goto out_unlock;
+ }
+ ida_init(&subsys->ns_ida);
+ list_add_tail(&subsys->entry, &nvme_subsystems);
+ }
+
+ ctrl->subsys = subsys;
+ mutex_unlock(&nvme_subsystems_lock);
+
+ if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
+ dev_name(ctrl->device))) {
+ dev_err(ctrl->device,
+ "failed to create sysfs link from subsystem.\n");
+ /* the transport driver will eventually put the subsystem */
+ return -EINVAL;
+ }
+
+ mutex_lock(&subsys->lock);
+ list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
+ mutex_unlock(&subsys->lock);
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&nvme_subsystems_lock);
+ put_device(&subsys->dev);
+ return ret;
+}
+
+static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
+ size_t size)
+{
+ struct nvme_command c = { };
+
+ c.common.opcode = nvme_admin_get_log_page;
+ c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
+ c.common.cdw10[0] = nvme_get_log_dw10(log_page, size);
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
+}
+
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
+ if (!ctrl->effects)
+ ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+
+ if (!ctrl->effects)
+ return 0;
+
+ ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
+ sizeof(*ctrl->effects));
+ if (ret) {
+ kfree(ctrl->effects);
+ ctrl->effects = NULL;
+ }
+ return ret;
}
/*
@@ -1797,9 +2196,19 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return -EIO;
}
- nvme_init_subnqn(ctrl, id);
+ if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
+ ret = nvme_get_effects_log(ctrl);
+ if (ret < 0)
+ return ret;
+ }
if (!ctrl->identified) {
+ int i;
+
+ ret = nvme_init_subsystem(ctrl, id);
+ if (ret)
+ goto out_free;
+
/*
* Check for quirks. Quirk can depend on firmware version,
* so, in principle, the set of quirks present can change
@@ -1808,9 +2217,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
* the device, but we'd have to make sure that the driver
* behaves intelligently if the quirks change.
*/
-
- int i;
-
for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
if (quirk_matches(id, &core_quirks[i]))
ctrl->quirks |= core_quirks[i].quirks;
@@ -1823,14 +2229,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
ctrl->oacs = le16_to_cpu(id->oacs);
- ctrl->vid = le16_to_cpu(id->vid);
ctrl->oncs = le16_to_cpup(&id->oncs);
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
ctrl->cntlid = le16_to_cpup(&id->cntlid);
- memcpy(ctrl->serial, id->sn, sizeof(id->sn));
- memcpy(ctrl->model, id->mn, sizeof(id->mn));
- memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
if (id->mdts)
max_hw_sectors = 1 << (id->mdts + page_shift - 9);
else
@@ -1931,33 +2333,12 @@ EXPORT_SYMBOL_GPL(nvme_init_identify);
static int nvme_dev_open(struct inode *inode, struct file *file)
{
- struct nvme_ctrl *ctrl;
- int instance = iminor(inode);
- int ret = -ENODEV;
-
- spin_lock(&dev_list_lock);
- list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
- if (ctrl->instance != instance)
- continue;
-
- if (!ctrl->admin_q) {
- ret = -EWOULDBLOCK;
- break;
- }
- if (!kref_get_unless_zero(&ctrl->kref))
- break;
- file->private_data = ctrl;
- ret = 0;
- break;
- }
- spin_unlock(&dev_list_lock);
-
- return ret;
-}
+ struct nvme_ctrl *ctrl =
+ container_of(inode->i_cdev, struct nvme_ctrl, cdev);
-static int nvme_dev_release(struct inode *inode, struct file *file)
-{
- nvme_put_ctrl(file->private_data);
+ if (ctrl->state != NVME_CTRL_LIVE)
+ return -EWOULDBLOCK;
+ file->private_data = ctrl;
return 0;
}
@@ -2021,7 +2402,6 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
static const struct file_operations nvme_dev_fops = {
.owner = THIS_MODULE,
.open = nvme_dev_open,
- .release = nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl = nvme_dev_ioctl,
};
@@ -2051,77 +2431,86 @@ static ssize_t nvme_sysfs_rescan(struct device *dev,
}
static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
+static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (disk->fops == &nvme_fops)
+ return nvme_get_ns_from_dev(dev)->head;
+ else
+ return disk->private_data;
+}
+
static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+ char *buf)
{
- struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
- struct nvme_ctrl *ctrl = ns->ctrl;
- int serial_len = sizeof(ctrl->serial);
- int model_len = sizeof(ctrl->model);
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+ struct nvme_ns_ids *ids = &head->ids;
+ struct nvme_subsystem *subsys = head->subsys;
+ int serial_len = sizeof(subsys->serial);
+ int model_len = sizeof(subsys->model);
- if (!uuid_is_null(&ns->uuid))
- return sprintf(buf, "uuid.%pU\n", &ns->uuid);
+ if (!uuid_is_null(&ids->uuid))
+ return sprintf(buf, "uuid.%pU\n", &ids->uuid);
- if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
- return sprintf(buf, "eui.%16phN\n", ns->nguid);
+ if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+ return sprintf(buf, "eui.%16phN\n", ids->nguid);
- if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
- return sprintf(buf, "eui.%8phN\n", ns->eui);
+ if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+ return sprintf(buf, "eui.%8phN\n", ids->eui64);
- while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
- ctrl->serial[serial_len - 1] == '\0'))
+ while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
+ subsys->serial[serial_len - 1] == '\0'))
serial_len--;
- while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
- ctrl->model[model_len - 1] == '\0'))
+ while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
+ subsys->model[model_len - 1] == '\0'))
model_len--;
- return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
- serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
+ return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
+ serial_len, subsys->serial, model_len, subsys->model,
+ head->ns_id);
}
static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+ char *buf)
{
- struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
- return sprintf(buf, "%pU\n", ns->nguid);
+ return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
}
static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL);
static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+ char *buf)
{
- struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+ struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
/* For backward compatibility expose the NGUID to userspace if
* we have no UUID set
*/
- if (uuid_is_null(&ns->uuid)) {
+ if (uuid_is_null(&ids->uuid)) {
printk_ratelimited(KERN_WARNING
"No UUID available providing old NGUID\n");
- return sprintf(buf, "%pU\n", ns->nguid);
+ return sprintf(buf, "%pU\n", ids->nguid);
}
- return sprintf(buf, "%pU\n", &ns->uuid);
+ return sprintf(buf, "%pU\n", &ids->uuid);
}
static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+ char *buf)
{
- struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
- return sprintf(buf, "%8phd\n", ns->eui);
+ return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
}
static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+ char *buf)
{
- struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
- return sprintf(buf, "%d\n", ns->ns_id);
+ return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
}
static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
-static struct attribute *nvme_ns_attrs[] = {
+static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
@@ -2130,31 +2519,31 @@ static struct attribute *nvme_ns_attrs[] = {
NULL,
};
-static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
+static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
- struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+ struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
if (a == &dev_attr_uuid.attr) {
- if (uuid_is_null(&ns->uuid) &&
- !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
+ if (uuid_is_null(&ids->uuid) &&
+ !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
return 0;
}
if (a == &dev_attr_nguid.attr) {
- if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
+ if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
return 0;
}
if (a == &dev_attr_eui.attr) {
- if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
+ if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
return 0;
}
return a->mode;
}
-static const struct attribute_group nvme_ns_attr_group = {
- .attrs = nvme_ns_attrs,
- .is_visible = nvme_ns_attrs_are_visible,
+const struct attribute_group nvme_ns_id_attr_group = {
+ .attrs = nvme_ns_id_attrs,
+ .is_visible = nvme_ns_id_attrs_are_visible,
};
#define nvme_show_str_function(field) \
@@ -2162,10 +2551,15 @@ static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
{ \
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
- return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \
+ return sprintf(buf, "%.*s\n", \
+ (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
} \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
+nvme_show_str_function(model);
+nvme_show_str_function(serial);
+nvme_show_str_function(firmware_rev);
+
#define nvme_show_int_function(field) \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -2175,9 +2569,6 @@ static ssize_t field##_show(struct device *dev, \
} \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
-nvme_show_str_function(model);
-nvme_show_str_function(serial);
-nvme_show_str_function(firmware_rev);
nvme_show_int_function(cntlid);
static ssize_t nvme_sysfs_delete(struct device *dev,
@@ -2187,7 +2578,7 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (device_remove_file_self(dev, attr))
- ctrl->ops->delete_ctrl(ctrl);
+ nvme_delete_ctrl_sync(ctrl);
return count;
}
static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
@@ -2231,7 +2622,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
- return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn);
+ return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
}
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
@@ -2284,12 +2675,128 @@ static const struct attribute_group *nvme_dev_attr_groups[] = {
NULL,
};
+static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
+ unsigned nsid)
+{
+ struct nvme_ns_head *h;
+
+ lockdep_assert_held(&subsys->lock);
+
+ list_for_each_entry(h, &subsys->nsheads, entry) {
+ if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
+ return h;
+ }
+
+ return NULL;
+}
+
+static int __nvme_check_ids(struct nvme_subsystem *subsys,
+ struct nvme_ns_head *new)
+{
+ struct nvme_ns_head *h;
+
+ lockdep_assert_held(&subsys->lock);
+
+ list_for_each_entry(h, &subsys->nsheads, entry) {
+ if (nvme_ns_ids_valid(&new->ids) &&
+ nvme_ns_ids_equal(&new->ids, &h->ids))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
+ unsigned nsid, struct nvme_id_ns *id)
+{
+ struct nvme_ns_head *head;
+ int ret = -ENOMEM;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ goto out;
+ ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto out_free_head;
+ head->instance = ret;
+ INIT_LIST_HEAD(&head->list);
+ init_srcu_struct(&head->srcu);
+ head->subsys = ctrl->subsys;
+ head->ns_id = nsid;
+ kref_init(&head->ref);
+
+ nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+
+ ret = __nvme_check_ids(ctrl->subsys, head);
+ if (ret) {
+ dev_err(ctrl->device,
+ "duplicate IDs for nsid %d\n", nsid);
+ goto out_cleanup_srcu;
+ }
+
+ ret = nvme_mpath_alloc_disk(ctrl, head);
+ if (ret)
+ goto out_cleanup_srcu;
+
+ list_add_tail(&head->entry, &ctrl->subsys->nsheads);
+ return head;
+out_cleanup_srcu:
+ cleanup_srcu_struct(&head->srcu);
+ ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
+out_free_head:
+ kfree(head);
+out:
+ return ERR_PTR(ret);
+}
+
+static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
+ struct nvme_id_ns *id, bool *new)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ bool is_shared = id->nmic & (1 << 0);
+ struct nvme_ns_head *head = NULL;
+ int ret = 0;
+
+ mutex_lock(&ctrl->subsys->lock);
+ if (is_shared)
+ head = __nvme_find_ns_head(ctrl->subsys, nsid);
+ if (!head) {
+ head = nvme_alloc_ns_head(ctrl, nsid, id);
+ if (IS_ERR(head)) {
+ ret = PTR_ERR(head);
+ goto out_unlock;
+ }
+
+ *new = true;
+ } else {
+ struct nvme_ns_ids ids;
+
+ nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ if (!nvme_ns_ids_equal(&head->ids, &ids)) {
+ dev_err(ctrl->device,
+ "IDs don't match for shared namespace %d\n",
+ nsid);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ *new = false;
+ }
+
+ list_add_tail(&ns->siblings, &head->list);
+ ns->head = head;
+
+out_unlock:
+ mutex_unlock(&ctrl->subsys->lock);
+ return ret;
+}
+
static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
- return nsa->ns_id - nsb->ns_id;
+ return nsa->head->ns_id - nsb->head->ns_id;
}
static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
@@ -2298,12 +2805,13 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list) {
- if (ns->ns_id == nsid) {
- kref_get(&ns->kref);
+ if (ns->head->ns_id == nsid) {
+ if (!kref_get_unless_zero(&ns->kref))
+ continue;
ret = ns;
break;
}
- if (ns->ns_id > nsid)
+ if (ns->head->ns_id > nsid)
break;
}
mutex_unlock(&ctrl->namespaces_mutex);
@@ -2318,7 +2826,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
if (!ctrl->nr_streams)
return 0;
- ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
+ ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
if (ret)
return ret;
@@ -2342,33 +2850,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
- int node = dev_to_node(ctrl->dev);
+ int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+ bool new = true;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
return;
- ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
- if (ns->instance < 0)
- goto out_free_ns;
-
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
- goto out_release_instance;
+ goto out_free_ns;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
kref_init(&ns->kref);
- ns->ns_id = nsid;
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
nvme_setup_streams_ns(ctrl, ns);
- sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
-
id = nvme_identify_ns(ctrl, nsid);
if (!id)
goto out_free_queue;
@@ -2376,23 +2878,49 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (id->ncap == 0)
goto out_free_id;
- nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);
+ if (nvme_init_ns_head(ns, nsid, id, &new))
+ goto out_free_id;
+
+#ifdef CONFIG_NVME_MULTIPATH
+ /*
+ * If multipathing is enabled we need to always use the subsystem
+ * instance number for numbering our devices to avoid conflicts
+ * between subsystems that have multiple controllers and thus use
+ * the multipath-aware subsystem node and those that have a single
+ * controller and use the controller node directly.
+ */
+ if (ns->head->disk) {
+ sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
+ ctrl->cntlid, ns->head->instance);
+ flags = GENHD_FL_HIDDEN;
+ } else {
+ sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
+ ns->head->instance);
+ }
+#else
+ /*
+ * But without the multipath code enabled, multiple controller per
+ * subsystems are visible as devices and thus we cannot use the
+ * subsystem instance.
+ */
+ sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
+#endif
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
if (nvme_nvm_register(ns, disk_name, node)) {
dev_warn(ctrl->device, "LightNVM init failure\n");
- goto out_free_id;
+ goto out_unlink_ns;
}
}
disk = alloc_disk_node(0, node);
if (!disk)
- goto out_free_id;
+ goto out_unlink_ns;
disk->fops = &nvme_fops;
disk->private_data = ns;
disk->queue = ns->queue;
- disk->flags = GENHD_FL_EXT_DEVT;
+ disk->flags = flags;
memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
ns->disk = disk;
@@ -2402,49 +2930,65 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
list_add_tail(&ns->list, &ctrl->namespaces);
mutex_unlock(&ctrl->namespaces_mutex);
- kref_get(&ctrl->kref);
+ nvme_get_ctrl(ctrl);
kfree(id);
device_add_disk(ctrl->device, ns->disk);
if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
- &nvme_ns_attr_group))
+ &nvme_ns_id_attr_group))
pr_warn("%s: failed to create sysfs group for identification\n",
ns->disk->disk_name);
if (ns->ndev && nvme_nvm_register_sysfs(ns))
pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
ns->disk->disk_name);
+
+ if (new)
+ nvme_mpath_add_disk(ns->head);
+ nvme_mpath_add_disk_links(ns);
return;
+ out_unlink_ns:
+ mutex_lock(&ctrl->subsys->lock);
+ list_del_rcu(&ns->siblings);
+ mutex_unlock(&ctrl->subsys->lock);
out_free_id:
kfree(id);
out_free_queue:
blk_cleanup_queue(ns->queue);
- out_release_instance:
- ida_simple_remove(&ctrl->ns_ida, ns->instance);
out_free_ns:
kfree(ns);
}
static void nvme_ns_remove(struct nvme_ns *ns)
{
+ struct nvme_ns_head *head = ns->head;
+
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
+ nvme_mpath_remove_disk_links(ns);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
- &nvme_ns_attr_group);
+ &nvme_ns_id_attr_group);
if (ns->ndev)
nvme_nvm_unregister_sysfs(ns);
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
}
+ mutex_lock(&ns->ctrl->subsys->lock);
+ nvme_mpath_clear_current_path(ns);
+ if (head)
+ list_del_rcu(&ns->siblings);
+ mutex_unlock(&ns->ctrl->subsys->lock);
+
mutex_lock(&ns->ctrl->namespaces_mutex);
list_del_init(&ns->list);
mutex_unlock(&ns->ctrl->namespaces_mutex);
+ synchronize_srcu(&head->srcu);
nvme_put_ns(ns);
}
@@ -2467,7 +3011,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
struct nvme_ns *ns, *next;
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
- if (ns->ns_id > nsid)
+ if (ns->head->ns_id > nsid)
nvme_ns_remove(ns);
}
}
@@ -2583,20 +3127,29 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
+static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
+{
+ char *envp[2] = { NULL, NULL };
+ u32 aen_result = ctrl->aen_result;
+
+ ctrl->aen_result = 0;
+ if (!aen_result)
+ return;
+
+ envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
+ if (!envp[0])
+ return;
+ kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
+ kfree(envp[0]);
+}
+
static void nvme_async_event_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, async_event_work);
- spin_lock_irq(&ctrl->lock);
- while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
- int aer_idx = --ctrl->event_limit;
-
- spin_unlock_irq(&ctrl->lock);
- ctrl->ops->submit_async_event(ctrl, aer_idx);
- spin_lock_irq(&ctrl->lock);
- }
- spin_unlock_irq(&ctrl->lock);
+ nvme_aen_uevent(ctrl);
+ ctrl->ops->submit_async_event(ctrl);
}
static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
@@ -2615,18 +3168,13 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
- struct nvme_command c = { };
struct nvme_fw_slot_info_log *log;
log = kmalloc(sizeof(*log), GFP_KERNEL);
if (!log)
return;
- c.common.opcode = nvme_admin_get_log_page;
- c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
- c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
-
- if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
+ if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
dev_warn(ctrl->device,
"Get FW SLOT INFO log error\n");
kfree(log);
@@ -2660,7 +3208,7 @@ static void nvme_fw_act_work(struct work_struct *work)
return;
nvme_start_queues(ctrl);
- /* read FW slot informationi to clear the AER*/
+ /* read FW slot information to clear the AER */
nvme_get_fw_slot_info(ctrl);
}
@@ -2668,24 +3216,21 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
union nvme_result *res)
{
u32 result = le32_to_cpu(res->u32);
- bool done = true;
- switch (le16_to_cpu(status) >> 1) {
- case NVME_SC_SUCCESS:
- done = false;
- /*FALLTHRU*/
- case NVME_SC_ABORT_REQ:
- ++ctrl->event_limit;
- if (ctrl->state == NVME_CTRL_LIVE)
- queue_work(nvme_wq, &ctrl->async_event_work);
+ if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
+ return;
+
+ switch (result & 0x7) {
+ case NVME_AER_ERROR:
+ case NVME_AER_SMART:
+ case NVME_AER_CSS:
+ case NVME_AER_VS:
+ ctrl->aen_result = result;
break;
default:
break;
}
- if (done)
- return;
-
switch (result & 0xff07) {
case NVME_AER_NOTICE_NS_CHANGED:
dev_info(ctrl->device, "rescanning\n");
@@ -2697,44 +3242,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
-}
-EXPORT_SYMBOL_GPL(nvme_complete_async_event);
-
-void nvme_queue_async_events(struct nvme_ctrl *ctrl)
-{
- ctrl->event_limit = NVME_NR_AERS;
queue_work(nvme_wq, &ctrl->async_event_work);
}
-EXPORT_SYMBOL_GPL(nvme_queue_async_events);
-
-static DEFINE_IDA(nvme_instance_ida);
-
-static int nvme_set_instance(struct nvme_ctrl *ctrl)
-{
- int instance, error;
-
- do {
- if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
- return -ENODEV;
-
- spin_lock(&dev_list_lock);
- error = ida_get_new(&nvme_instance_ida, &instance);
- spin_unlock(&dev_list_lock);
- } while (error == -EAGAIN);
-
- if (error)
- return -ENODEV;
-
- ctrl->instance = instance;
- return 0;
-}
-
-static void nvme_release_instance(struct nvme_ctrl *ctrl)
-{
- spin_lock(&dev_list_lock);
- ida_remove(&nvme_instance_ida, ctrl->instance);
- spin_unlock(&dev_list_lock);
-}
+EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
@@ -2752,7 +3262,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
- nvme_queue_async_events(ctrl);
+ queue_work(nvme_wq, &ctrl->async_event_work);
nvme_start_queues(ctrl);
}
}
@@ -2760,30 +3270,31 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
- device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
-
- spin_lock(&dev_list_lock);
- list_del(&ctrl->node);
- spin_unlock(&dev_list_lock);
+ cdev_device_del(&ctrl->cdev, ctrl->device);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
-static void nvme_free_ctrl(struct kref *kref)
+static void nvme_free_ctrl(struct device *dev)
{
- struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
+ struct nvme_ctrl *ctrl =
+ container_of(dev, struct nvme_ctrl, ctrl_device);
+ struct nvme_subsystem *subsys = ctrl->subsys;
- put_device(ctrl->device);
- nvme_release_instance(ctrl);
- ida_destroy(&ctrl->ns_ida);
+ ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+ kfree(ctrl->effects);
+
+ if (subsys) {
+ mutex_lock(&subsys->lock);
+ list_del(&ctrl->subsys_entry);
+ mutex_unlock(&subsys->lock);
+ sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+ }
ctrl->ops->free_ctrl(ctrl);
-}
-void nvme_put_ctrl(struct nvme_ctrl *ctrl)
-{
- kref_put(&ctrl->kref, nvme_free_ctrl);
+ if (subsys)
+ nvme_put_subsystem(subsys);
}
-EXPORT_SYMBOL_GPL(nvme_put_ctrl);
/*
* Initialize a NVMe controller structures. This needs to be called during
@@ -2799,32 +3310,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
spin_lock_init(&ctrl->lock);
INIT_LIST_HEAD(&ctrl->namespaces);
mutex_init(&ctrl->namespaces_mutex);
- kref_init(&ctrl->kref);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
INIT_WORK(&ctrl->scan_work, nvme_scan_work);
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
+ INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
- ret = nvme_set_instance(ctrl);
- if (ret)
+ ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
+ if (ret < 0)
goto out;
-
- ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
- MKDEV(nvme_char_major, ctrl->instance),
- ctrl, nvme_dev_attr_groups,
- "nvme%d", ctrl->instance);
- if (IS_ERR(ctrl->device)) {
- ret = PTR_ERR(ctrl->device);
+ ctrl->instance = ret;
+
+ device_initialize(&ctrl->ctrl_device);
+ ctrl->device = &ctrl->ctrl_device;
+ ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
+ ctrl->device->class = nvme_class;
+ ctrl->device->parent = ctrl->dev;
+ ctrl->device->groups = nvme_dev_attr_groups;
+ ctrl->device->release = nvme_free_ctrl;
+ dev_set_drvdata(ctrl->device, ctrl);
+ ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
+ if (ret)
goto out_release_instance;
- }
- get_device(ctrl->device);
- ida_init(&ctrl->ns_ida);
- spin_lock(&dev_list_lock);
- list_add_tail(&ctrl->node, &nvme_ctrl_list);
- spin_unlock(&dev_list_lock);
+ cdev_init(&ctrl->cdev, &nvme_dev_fops);
+ ctrl->cdev.owner = ops->module;
+ ret = cdev_device_add(&ctrl->cdev, ctrl->device);
+ if (ret)
+ goto out_free_name;
/*
* Initialize latency tolerance controls. The sysfs files won't
@@ -2835,8 +3350,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
return 0;
+out_free_name:
+ kfree_const(dev->kobj.name);
out_release_instance:
- nvme_release_instance(ctrl);
+ ida_simple_remove(&nvme_instance_ida, ctrl->instance);
out:
return ret;
}
@@ -2945,6 +3462,16 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
+int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
+{
+ if (!ctrl->ops->reinit_request)
+ return 0;
+
+ return blk_mq_tagset_iter(set, set->driver_data,
+ ctrl->ops->reinit_request);
+}
+EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
+
int __init nvme_core_init(void)
{
int result;
@@ -2954,12 +3481,9 @@ int __init nvme_core_init(void)
if (!nvme_wq)
return -ENOMEM;
- result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
- &nvme_dev_fops);
+ result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
if (result < 0)
goto destroy_wq;
- else if (result > 0)
- nvme_char_major = result;
nvme_class = class_create(THIS_MODULE, "nvme");
if (IS_ERR(nvme_class)) {
@@ -2967,10 +3491,17 @@ int __init nvme_core_init(void)
goto unregister_chrdev;
}
+ nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
+ if (IS_ERR(nvme_subsys_class)) {
+ result = PTR_ERR(nvme_subsys_class);
+ goto destroy_class;
+ }
return 0;
+destroy_class:
+ class_destroy(nvme_class);
unregister_chrdev:
- __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+ unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
destroy_wq:
destroy_workqueue(nvme_wq);
return result;
@@ -2978,8 +3509,10 @@ destroy_wq:
void nvme_core_exit(void)
{
+ ida_destroy(&nvme_subsystems_ida);
+ class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
- __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+ unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_wq);
}
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 555c976..76b4fe6 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -548,6 +548,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_HOSTNQN, "hostnqn=%s" },
{ NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
{ NVMF_OPT_HOST_ID, "hostid=%s" },
+ { NVMF_OPT_DUP_CONNECT, "duplicate_connect" },
{ NVMF_OPT_ERR, NULL }
};
@@ -566,6 +567,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->nr_io_queues = num_online_cpus();
opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
opts->kato = NVME_DEFAULT_KATO;
+ opts->duplicate_connect = false;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -742,6 +744,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
goto out;
}
break;
+ case NVMF_OPT_DUP_CONNECT:
+ opts->duplicate_connect = true;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -823,7 +828,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
- NVMF_OPT_HOST_ID)
+ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
@@ -841,6 +846,9 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
if (ret)
goto out_free_opts;
+
+ request_module("nvme-%s", opts->transport);
+
/*
* Check the generic options first as we need a valid transport for
* the lookup below. Then clear the generic flags so that transport
@@ -874,12 +882,12 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
goto out_unlock;
}
- if (strcmp(ctrl->subnqn, opts->subsysnqn)) {
+ if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
dev_warn(ctrl->device,
"controller returned incorrect NQN: \"%s\".\n",
- ctrl->subnqn);
+ ctrl->subsys->subnqn);
up_read(&nvmf_transports_rwsem);
- ctrl->ops->delete_ctrl(ctrl);
+ nvme_delete_ctrl_sync(ctrl);
return ERR_PTR(-EINVAL);
}
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index bf33663..42232e7 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -57,6 +57,7 @@ enum {
NVMF_OPT_HOST_TRADDR = 1 << 10,
NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
NVMF_OPT_HOST_ID = 1 << 12,
+ NVMF_OPT_DUP_CONNECT = 1 << 13,
};
/**
@@ -96,6 +97,7 @@ struct nvmf_ctrl_options {
unsigned int nr_io_queues;
unsigned int reconnect_delay;
bool discovery_nqn;
+ bool duplicate_connect;
unsigned int kato;
struct nvmf_host *host;
int max_reconnects;
@@ -131,6 +133,18 @@ struct nvmf_transport_ops {
struct nvmf_ctrl_options *opts);
};
+static inline bool
+nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
+ struct nvmf_ctrl_options *opts)
+{
+ if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
+ strcmp(opts->host->nqn, ctrl->opts->host->nqn) ||
+ memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t)))
+ return false;
+
+ return true;
+}
+
int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index be49d0f..7ab0be5 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -30,27 +30,19 @@
/* *************************** Data Structures/Defines ****************** */
-/*
- * We handle AEN commands ourselves and don't even let the
- * block layer know about them.
- */
-#define NVME_FC_NR_AEN_COMMANDS 1
-#define NVME_FC_AQ_BLKMQ_DEPTH \
- (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
-#define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1)
-
enum nvme_fc_queue_flags {
NVME_FC_Q_CONNECTED = (1 << 0),
};
#define NVMEFC_QUEUE_DELAY 3 /* ms units */
+#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */
+
struct nvme_fc_queue {
struct nvme_fc_ctrl *ctrl;
struct device *dev;
struct blk_mq_hw_ctx *hctx;
void *lldd_handle;
- int queue_size;
size_t cmnd_capsule_len;
u32 qnum;
u32 rqcnt;
@@ -124,6 +116,7 @@ struct nvme_fc_lport {
struct device *dev; /* physical device for dma */
struct nvme_fc_port_template *ops;
struct kref ref;
+ atomic_t act_rport_cnt;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
struct nvme_fc_rport {
@@ -136,6 +129,8 @@ struct nvme_fc_rport {
struct nvme_fc_lport *lport;
spinlock_t lock;
struct kref ref;
+ atomic_t act_ctrl_cnt;
+ unsigned long dev_loss_end;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
enum nvme_fcctrl_flags {
@@ -150,6 +145,7 @@ struct nvme_fc_ctrl {
struct nvme_fc_rport *rport;
u32 cnum;
+ bool assoc_active;
u64 association_id;
struct list_head ctrl_list; /* rport->ctrl_list */
@@ -157,7 +153,6 @@ struct nvme_fc_ctrl {
struct blk_mq_tag_set admin_tag_set;
struct blk_mq_tag_set tag_set;
- struct work_struct delete_work;
struct delayed_work connect_work;
struct kref ref;
@@ -165,7 +160,7 @@ struct nvme_fc_ctrl {
u32 iocnt;
wait_queue_head_t ioabort_wait;
- struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS];
+ struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS];
struct nvme_ctrl ctrl;
};
@@ -213,10 +208,16 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
+/*
+ * These items are short-term. They will eventually be moved into
+ * a generic FC class. See comments in module init.
+ */
+static struct class *fc_class;
+static struct device *fc_udev_device;
+
/* *********************** FC-NVME Port Management ************************ */
-static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *);
static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
struct nvme_fc_queue *, unsigned int);
@@ -235,9 +236,6 @@ nvme_fc_free_lport(struct kref *ref)
list_del(&lport->port_list);
spin_unlock_irqrestore(&nvme_fc_lock, flags);
- /* let the LLDD know we've finished tearing it down */
- lport->ops->localport_delete(&lport->localport);
-
ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
ida_destroy(&lport->endp_cnt);
@@ -260,7 +258,9 @@ nvme_fc_lport_get(struct nvme_fc_lport *lport)
static struct nvme_fc_lport *
-nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo)
+nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo,
+ struct nvme_fc_port_template *ops,
+ struct device *dev)
{
struct nvme_fc_lport *lport;
unsigned long flags;
@@ -272,6 +272,11 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo)
lport->localport.port_name != pinfo->port_name)
continue;
+ if (lport->dev != dev) {
+ lport = ERR_PTR(-EXDEV);
+ goto out_done;
+ }
+
if (lport->localport.port_state != FC_OBJSTATE_DELETED) {
lport = ERR_PTR(-EEXIST);
goto out_done;
@@ -288,6 +293,7 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo)
/* resume the lport */
+ lport->ops = ops;
lport->localport.port_role = pinfo->port_role;
lport->localport.port_id = pinfo->port_id;
lport->localport.port_state = FC_OBJSTATE_ONLINE;
@@ -348,7 +354,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
* expired, we can simply re-enable the localport. Remoteports
* and controller reconnections should resume naturally.
*/
- newrec = nvme_fc_attach_to_unreg_lport(pinfo);
+ newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev);
/* found an lport, but something about its state is bad */
if (IS_ERR(newrec)) {
@@ -384,6 +390,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
INIT_LIST_HEAD(&newrec->port_list);
INIT_LIST_HEAD(&newrec->endp_list);
kref_init(&newrec->ref);
+ atomic_set(&newrec->act_rport_cnt, 0);
newrec->ops = template;
newrec->dev = dev;
ida_init(&newrec->endp_cnt);
@@ -446,12 +453,177 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr)
spin_unlock_irqrestore(&nvme_fc_lock, flags);
+ if (atomic_read(&lport->act_rport_cnt) == 0)
+ lport->ops->localport_delete(&lport->localport);
+
nvme_fc_lport_put(lport);
return 0;
}
EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport);
+/*
+ * TRADDR strings, per FC-NVME are fixed format:
+ * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters
+ * udev event will only differ by prefix of what field is
+ * being specified:
+ * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters
+ * 19 + 43 + null_fudge = 64 characters
+ */
+#define FCNVME_TRADDR_LENGTH 64
+
+static void
+nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport,
+ struct nvme_fc_rport *rport)
+{
+ char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/
+ char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/
+ char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL };
+
+ if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY))
+ return;
+
+ snprintf(hostaddr, sizeof(hostaddr),
+ "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx",
+ lport->localport.node_name, lport->localport.port_name);
+ snprintf(tgtaddr, sizeof(tgtaddr),
+ "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx",
+ rport->remoteport.node_name, rport->remoteport.port_name);
+ kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp);
+}
+
+static void
+nvme_fc_free_rport(struct kref *ref)
+{
+ struct nvme_fc_rport *rport =
+ container_of(ref, struct nvme_fc_rport, ref);
+ struct nvme_fc_lport *lport =
+ localport_to_lport(rport->remoteport.localport);
+ unsigned long flags;
+
+ WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
+ WARN_ON(!list_empty(&rport->ctrl_list));
+
+ /* remove from lport list */
+ spin_lock_irqsave(&nvme_fc_lock, flags);
+ list_del(&rport->endp_list);
+ spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+ ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
+
+ kfree(rport);
+
+ nvme_fc_lport_put(lport);
+}
+
+static void
+nvme_fc_rport_put(struct nvme_fc_rport *rport)
+{
+ kref_put(&rport->ref, nvme_fc_free_rport);
+}
+
+static int
+nvme_fc_rport_get(struct nvme_fc_rport *rport)
+{
+ return kref_get_unless_zero(&rport->ref);
+}
+
+static void
+nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl)
+{
+ switch (ctrl->ctrl.state) {
+ case NVME_CTRL_NEW:
+ case NVME_CTRL_RECONNECTING:
+ /*
+ * As all reconnects were suppressed, schedule a
+ * connect.
+ */
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: connectivity re-established. "
+ "Attempting reconnect\n", ctrl->cnum);
+
+ queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
+ break;
+
+ case NVME_CTRL_RESETTING:
+ /*
+ * Controller is already in the process of terminating the
+ * association. No need to do anything further. The reconnect
+ * step will naturally occur after the reset completes.
+ */
+ break;
+
+ default:
+ /* no action to take - let it delete */
+ break;
+ }
+}
+
+static struct nvme_fc_rport *
+nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport,
+ struct nvme_fc_port_info *pinfo)
+{
+ struct nvme_fc_rport *rport;
+ struct nvme_fc_ctrl *ctrl;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nvme_fc_lock, flags);
+
+ list_for_each_entry(rport, &lport->endp_list, endp_list) {
+ if (rport->remoteport.node_name != pinfo->node_name ||
+ rport->remoteport.port_name != pinfo->port_name)
+ continue;
+
+ if (!nvme_fc_rport_get(rport)) {
+ rport = ERR_PTR(-ENOLCK);
+ goto out_done;
+ }
+
+ spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+ spin_lock_irqsave(&rport->lock, flags);
+
+ /* has it been unregistered */
+ if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) {
+ /* means lldd called us twice */
+ spin_unlock_irqrestore(&rport->lock, flags);
+ nvme_fc_rport_put(rport);
+ return ERR_PTR(-ESTALE);
+ }
+
+ rport->remoteport.port_state = FC_OBJSTATE_ONLINE;
+ rport->dev_loss_end = 0;
+
+ /*
+ * kick off a reconnect attempt on all associations to the
+ * remote port. A successful reconnects will resume i/o.
+ */
+ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
+ nvme_fc_resume_controller(ctrl);
+
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ return rport;
+ }
+
+ rport = NULL;
+
+out_done:
+ spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+ return rport;
+}
+
+static inline void
+__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport,
+ struct nvme_fc_port_info *pinfo)
+{
+ if (pinfo->dev_loss_tmo)
+ rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo;
+ else
+ rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO;
+}
+
/**
* nvme_fc_register_remoteport - transport entry point called by an
* LLDD to register the existence of a NVME
@@ -478,28 +650,52 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
unsigned long flags;
int ret, idx;
+ if (!nvme_fc_lport_get(lport)) {
+ ret = -ESHUTDOWN;
+ goto out_reghost_failed;
+ }
+
+ /*
+ * look to see if there is already a remoteport that is waiting
+ * for a reconnect (within dev_loss_tmo) with the same WWN's.
+ * If so, transition to it and reconnect.
+ */
+ newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo);
+
+ /* found an rport, but something about its state is bad */
+ if (IS_ERR(newrec)) {
+ ret = PTR_ERR(newrec);
+ goto out_lport_put;
+
+ /* found existing rport, which was resumed */
+ } else if (newrec) {
+ nvme_fc_lport_put(lport);
+ __nvme_fc_set_dev_loss_tmo(newrec, pinfo);
+ nvme_fc_signal_discovery_scan(lport, newrec);
+ *portptr = &newrec->remoteport;
+ return 0;
+ }
+
+ /* nothing found - allocate a new remoteport struct */
+
newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz),
GFP_KERNEL);
if (!newrec) {
ret = -ENOMEM;
- goto out_reghost_failed;
- }
-
- if (!nvme_fc_lport_get(lport)) {
- ret = -ESHUTDOWN;
- goto out_kfree_rport;
+ goto out_lport_put;
}
idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL);
if (idx < 0) {
ret = -ENOSPC;
- goto out_lport_put;
+ goto out_kfree_rport;
}
INIT_LIST_HEAD(&newrec->endp_list);
INIT_LIST_HEAD(&newrec->ctrl_list);
INIT_LIST_HEAD(&newrec->ls_req_list);
kref_init(&newrec->ref);
+ atomic_set(&newrec->act_ctrl_cnt, 0);
spin_lock_init(&newrec->lock);
newrec->remoteport.localport = &lport->localport;
newrec->dev = lport->dev;
@@ -511,63 +707,27 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
newrec->remoteport.port_id = pinfo->port_id;
newrec->remoteport.port_state = FC_OBJSTATE_ONLINE;
newrec->remoteport.port_num = idx;
+ __nvme_fc_set_dev_loss_tmo(newrec, pinfo);
spin_lock_irqsave(&nvme_fc_lock, flags);
list_add_tail(&newrec->endp_list, &lport->endp_list);
spin_unlock_irqrestore(&nvme_fc_lock, flags);
+ nvme_fc_signal_discovery_scan(lport, newrec);
+
*portptr = &newrec->remoteport;
return 0;
-out_lport_put:
- nvme_fc_lport_put(lport);
out_kfree_rport:
kfree(newrec);
+out_lport_put:
+ nvme_fc_lport_put(lport);
out_reghost_failed:
*portptr = NULL;
return ret;
}
EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
-static void
-nvme_fc_free_rport(struct kref *ref)
-{
- struct nvme_fc_rport *rport =
- container_of(ref, struct nvme_fc_rport, ref);
- struct nvme_fc_lport *lport =
- localport_to_lport(rport->remoteport.localport);
- unsigned long flags;
-
- WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
- WARN_ON(!list_empty(&rport->ctrl_list));
-
- /* remove from lport list */
- spin_lock_irqsave(&nvme_fc_lock, flags);
- list_del(&rport->endp_list);
- spin_unlock_irqrestore(&nvme_fc_lock, flags);
-
- /* let the LLDD know we've finished tearing it down */
- lport->ops->remoteport_delete(&rport->remoteport);
-
- ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
-
- kfree(rport);
-
- nvme_fc_lport_put(lport);
-}
-
-static void
-nvme_fc_rport_put(struct nvme_fc_rport *rport)
-{
- kref_put(&rport->ref, nvme_fc_free_rport);
-}
-
-static int
-nvme_fc_rport_get(struct nvme_fc_rport *rport)
-{
- return kref_get_unless_zero(&rport->ref);
-}
-
static int
nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
{
@@ -592,6 +752,58 @@ restart:
return 0;
}
+static void
+nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
+{
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: controller connectivity lost. Awaiting "
+ "Reconnect", ctrl->cnum);
+
+ switch (ctrl->ctrl.state) {
+ case NVME_CTRL_NEW:
+ case NVME_CTRL_LIVE:
+ /*
+ * Schedule a controller reset. The reset will terminate the
+ * association and schedule the reconnect timer. Reconnects
+ * will be attempted until either the ctlr_loss_tmo
+ * (max_retries * connect_delay) expires or the remoteport's
+ * dev_loss_tmo expires.
+ */
+ if (nvme_reset_ctrl(&ctrl->ctrl)) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: Couldn't schedule reset. "
+ "Deleting controller.\n",
+ ctrl->cnum);
+ nvme_delete_ctrl(&ctrl->ctrl);
+ }
+ break;
+
+ case NVME_CTRL_RECONNECTING:
+ /*
+ * The association has already been terminated and the
+ * controller is attempting reconnects. No need to do anything
+ * futher. Reconnects will be attempted until either the
+ * ctlr_loss_tmo (max_retries * connect_delay) expires or the
+ * remoteport's dev_loss_tmo expires.
+ */
+ break;
+
+ case NVME_CTRL_RESETTING:
+ /*
+ * Controller is already in the process of terminating the
+ * association. No need to do anything further. The reconnect
+ * step will kick in naturally after the association is
+ * terminated.
+ */
+ break;
+
+ case NVME_CTRL_DELETING:
+ default:
+ /* no action to take - let it delete */
+ break;
+ }
+}
+
/**
* nvme_fc_unregister_remoteport - transport entry point called by an
* LLDD to deregister/remove a previously
@@ -621,19 +833,78 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
}
portptr->port_state = FC_OBJSTATE_DELETED;
- /* tear down all associations to the remote port */
- list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
- __nvme_fc_del_ctrl(ctrl);
+ rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ);
+
+ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+ /* if dev_loss_tmo==0, dev loss is immediate */
+ if (!portptr->dev_loss_tmo) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: controller connectivity lost. "
+ "Deleting controller.\n",
+ ctrl->cnum);
+ nvme_delete_ctrl(&ctrl->ctrl);
+ } else
+ nvme_fc_ctrl_connectivity_loss(ctrl);
+ }
spin_unlock_irqrestore(&rport->lock, flags);
nvme_fc_abort_lsops(rport);
+ if (atomic_read(&rport->act_ctrl_cnt) == 0)
+ rport->lport->ops->remoteport_delete(portptr);
+
+ /*
+ * release the reference, which will allow, if all controllers
+ * go away, which should only occur after dev_loss_tmo occurs,
+ * for the rport to be torn down.
+ */
nvme_fc_rport_put(rport);
+
return 0;
}
EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport);
+/**
+ * nvme_fc_rescan_remoteport - transport entry point called by an
+ * LLDD to request a nvme device rescan.
+ * @remoteport: pointer to the (registered) remote port that is to be
+ * rescanned.
+ *
+ * Returns: N/A
+ */
+void
+nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport)
+{
+ struct nvme_fc_rport *rport = remoteport_to_rport(remoteport);
+
+ nvme_fc_signal_discovery_scan(rport->lport, rport);
+}
+EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport);
+
+int
+nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr,
+ u32 dev_loss_tmo)
+{
+ struct nvme_fc_rport *rport = remoteport_to_rport(portptr);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rport->lock, flags);
+
+ if (portptr->port_state != FC_OBJSTATE_ONLINE) {
+ spin_unlock_irqrestore(&rport->lock, flags);
+ return -EINVAL;
+ }
+
+ /* a dev_loss_tmo of 0 (immediate) is allowed to be set */
+ rport->remoteport.dev_loss_tmo = dev_loss_tmo;
+
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss);
+
/* *********************** FC-NVME DMA Handling **************************** */
@@ -723,7 +994,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
dma_unmap_sg(dev, sg, nents, dir);
}
-
/* *********************** FC-NVME LS Handling **************************** */
static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
@@ -1266,7 +1536,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
unsigned long flags;
int i, ret;
- for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+ for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE)
continue;
@@ -1331,7 +1601,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_command *sqe = &op->cmd_iu.sqe;
__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
union nvme_result result;
- bool complete_rq, terminate_assoc = true;
+ bool terminate_assoc = true;
/*
* WARNING:
@@ -1373,8 +1643,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
- if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
- status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
+ if (atomic_read(&op->state) == FCPOP_STATE_ABORTED ||
+ op->flags & FCOP_FLAGS_TERMIO)
+ status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
else if (freq->status)
status = cpu_to_le16(NVME_SC_INTERNAL << 1);
@@ -1438,23 +1709,27 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
done:
if (op->flags & FCOP_FLAGS_AEN) {
nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
- complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
+ __nvme_fc_fcpop_chk_teardowns(ctrl, op);
atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags = FCOP_FLAGS_AEN; /* clear other flags */
nvme_fc_ctrl_put(ctrl);
goto check_error;
}
- complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
- if (!complete_rq) {
- if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
- status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
- if (blk_queue_dying(rq->q))
- status |= cpu_to_le16(NVME_SC_DNR << 1);
- }
- nvme_end_request(rq, status, result);
- } else
+ /*
+ * Force failures of commands if we're killing the controller
+ * or have an error on a command used to create an new association
+ */
+ if (status &&
+ (blk_queue_dying(rq->q) ||
+ ctrl->ctrl.state == NVME_CTRL_NEW ||
+ ctrl->ctrl.state == NVME_CTRL_RECONNECTING))
+ status |= cpu_to_le16(NVME_SC_DNR << 1);
+
+ if (__nvme_fc_fcpop_chk_teardowns(ctrl, op))
__nvme_fc_final_op_cleanup(rq);
+ else
+ nvme_end_request(rq, status, result);
check_error:
if (terminate_assoc)
@@ -1531,7 +1806,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
int i, ret;
aen_op = ctrl->aen_ops;
- for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+ for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
GFP_KERNEL);
if (!private)
@@ -1541,7 +1816,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
sqe = &cmdiu->sqe;
ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
aen_op, (struct request *)NULL,
- (AEN_CMDID_BASE + i));
+ (NVME_AQ_BLK_MQ_DEPTH + i));
if (ret) {
kfree(private);
return ret;
@@ -1554,7 +1829,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
memset(sqe, 0, sizeof(*sqe));
sqe->common.opcode = nvme_admin_async_event;
/* Note: core layer may overwrite the sqe.command_id value */
- sqe->common.command_id = AEN_CMDID_BASE + i;
+ sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i;
}
return 0;
}
@@ -1566,7 +1841,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
int i;
aen_op = ctrl->aen_ops;
- for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+ for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
if (!aen_op->fcp_req.private)
continue;
@@ -1610,7 +1885,7 @@ nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
}
static void
-nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size)
+nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx)
{
struct nvme_fc_queue *queue;
@@ -1626,8 +1901,6 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size)
else
queue->cmnd_capsule_len = sizeof(struct nvme_command);
- queue->queue_size = queue_size;
-
/*
* Considered whether we should allocate buffers for all SQEs
* and CQEs and dma map them - mapping their respective entries
@@ -1751,7 +2024,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl)
int i;
for (i = 1; i < ctrl->ctrl.queue_count; i++)
- nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize);
+ nvme_fc_init_queue(ctrl, i);
}
static void
@@ -1825,13 +2098,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: resetting controller\n", ctrl->cnum);
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
- dev_err(ctrl->ctrl.device,
- "NVME-FC{%d}: error_recovery: Couldn't change state "
- "to RECONNECTING\n", ctrl->cnum);
- return;
- }
-
nvme_reset_ctrl(&ctrl->ctrl);
}
@@ -1842,13 +2108,14 @@ nvme_fc_timeout(struct request *rq, bool reserved)
struct nvme_fc_ctrl *ctrl = op->ctrl;
int ret;
- if (reserved)
+ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
+ atomic_read(&op->state) == FCPOP_STATE_ABORTED)
return BLK_EH_RESET_TIMER;
ret = __nvme_fc_abort_op(ctrl, op);
if (ret)
- /* io wasn't active to abort consider it done */
- return BLK_EH_HANDLED;
+ /* io wasn't active to abort */
+ return BLK_EH_NOT_HANDLED;
/*
* we can't individually ABTS an io without affecting the queue,
@@ -1859,7 +2126,12 @@ nvme_fc_timeout(struct request *rq, bool reserved)
*/
nvme_fc_error_recovery(ctrl, "io timeout error");
- return BLK_EH_HANDLED;
+ /*
+ * the io abort has been initiated. Have the reset timer
+ * restarted and the abort completion will complete the io
+ * shortly. Avoids a synchronous wait while the abort finishes.
+ */
+ return BLK_EH_RESET_TIMER;
}
static int
@@ -2110,7 +2382,7 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
}
static void
-nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+nvme_fc_submit_async_event(struct nvme_ctrl *arg)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
struct nvme_fc_fcp_op *aen_op;
@@ -2118,9 +2390,6 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
bool terminating = false;
blk_status_t ret;
- if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
- return;
-
spin_lock_irqsave(&ctrl->lock, flags);
if (ctrl->flags & FCCTRL_TERMIO)
terminating = true;
@@ -2129,13 +2398,13 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
if (terminating)
return;
- aen_op = &ctrl->aen_ops[aer_idx];
+ aen_op = &ctrl->aen_ops[0];
ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
NVMEFC_FCP_NODATA);
if (ret)
dev_err(ctrl->ctrl.device,
- "failed async event work [%d]\n", aer_idx);
+ "failed async event work\n");
}
static void
@@ -2337,7 +2606,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl)
nvme_fc_init_io_queues(ctrl);
- ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request);
+ ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
if (ret)
goto out_free_io_queues;
@@ -2360,6 +2629,61 @@ out_free_io_queues:
return ret;
}
+static void
+nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport)
+{
+ struct nvme_fc_lport *lport = rport->lport;
+
+ atomic_inc(&lport->act_rport_cnt);
+}
+
+static void
+nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport)
+{
+ struct nvme_fc_lport *lport = rport->lport;
+ u32 cnt;
+
+ cnt = atomic_dec_return(&lport->act_rport_cnt);
+ if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED)
+ lport->ops->localport_delete(&lport->localport);
+}
+
+static int
+nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl)
+{
+ struct nvme_fc_rport *rport = ctrl->rport;
+ u32 cnt;
+
+ if (ctrl->assoc_active)
+ return 1;
+
+ ctrl->assoc_active = true;
+ cnt = atomic_inc_return(&rport->act_ctrl_cnt);
+ if (cnt == 1)
+ nvme_fc_rport_active_on_lport(rport);
+
+ return 0;
+}
+
+static int
+nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl)
+{
+ struct nvme_fc_rport *rport = ctrl->rport;
+ struct nvme_fc_lport *lport = rport->lport;
+ u32 cnt;
+
+ /* ctrl->assoc_active=false will be set independently */
+
+ cnt = atomic_dec_return(&rport->act_ctrl_cnt);
+ if (cnt == 0) {
+ if (rport->remoteport.port_state == FC_OBJSTATE_DELETED)
+ lport->ops->remoteport_delete(&rport->remoteport);
+ nvme_fc_rport_inactive_on_lport(rport);
+ }
+
+ return 0;
+}
+
/*
* This routine restarts the controller on the host side, and
* on the link side, recreates the controller association.
@@ -2368,26 +2692,31 @@ static int
nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
- u32 segs;
int ret;
bool changed;
++ctrl->ctrl.nr_reconnects;
+ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+ return -ENODEV;
+
+ if (nvme_fc_ctlr_active_on_rport(ctrl))
+ return -ENOTUNIQ;
+
/*
* Create the admin queue
*/
- nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+ nvme_fc_init_queue(ctrl, 0);
ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
- NVME_FC_AQ_BLKMQ_DEPTH);
+ NVME_AQ_BLK_MQ_DEPTH);
if (ret)
goto out_free_queue;
ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
- NVME_FC_AQ_BLKMQ_DEPTH,
- (NVME_FC_AQ_BLKMQ_DEPTH / 4));
+ NVME_AQ_BLK_MQ_DEPTH,
+ (NVME_AQ_BLK_MQ_DEPTH / 4));
if (ret)
goto out_delete_hw_queue;
@@ -2419,9 +2748,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_disconnect_admin_queue;
- segs = min_t(u32, NVME_FC_MAX_SEGMENTS,
- ctrl->lport->ops->max_sgl_segments);
- ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
+ ctrl->ctrl.max_hw_sectors =
+ (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
ret = nvme_init_identify(&ctrl->ctrl);
if (ret)
@@ -2465,11 +2793,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
ctrl->ctrl.nr_reconnects = 0;
- nvme_start_ctrl(&ctrl->ctrl);
+ if (changed)
+ nvme_start_ctrl(&ctrl->ctrl);
return 0; /* Success */
@@ -2482,6 +2810,8 @@ out_delete_hw_queue:
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
out_free_queue:
nvme_fc_free_queue(&ctrl->queues[0]);
+ ctrl->assoc_active = false;
+ nvme_fc_ctlr_inactive_on_rport(ctrl);
return ret;
}
@@ -2497,6 +2827,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
{
unsigned long flags;
+ if (!ctrl->assoc_active)
+ return;
+ ctrl->assoc_active = false;
+
spin_lock_irqsave(&ctrl->lock, flags);
ctrl->flags |= FCCTRL_TERMIO;
ctrl->iocnt = 0;
@@ -2537,7 +2871,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
* use blk_mq_tagset_busy_itr() and the transport routine to
* terminate the exchanges.
*/
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ if (ctrl->ctrl.state != NVME_CTRL_NEW)
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
@@ -2568,102 +2903,64 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
nvme_fc_free_queue(&ctrl->queues[0]);
+
+ nvme_fc_ctlr_inactive_on_rport(ctrl);
}
static void
-nvme_fc_delete_ctrl_work(struct work_struct *work)
+nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
{
- struct nvme_fc_ctrl *ctrl =
- container_of(work, struct nvme_fc_ctrl, delete_work);
+ struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
- cancel_work_sync(&ctrl->ctrl.reset_work);
cancel_delayed_work_sync(&ctrl->connect_work);
- nvme_stop_ctrl(&ctrl->ctrl);
- nvme_remove_namespaces(&ctrl->ctrl);
/*
* kill the association on the link side. this will block
* waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
-
- /*
- * tear down the controller
- * After the last reference on the nvme ctrl is removed,
- * the transport nvme_fc_nvme_ctrl_freed() callback will be
- * invoked. From there, the transport will tear down it's
- * logical queues and association.
- */
- nvme_uninit_ctrl(&ctrl->ctrl);
-
- nvme_put_ctrl(&ctrl->ctrl);
-}
-
-static bool
-__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl)
-{
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
- return true;
-
- if (!queue_work(nvme_wq, &ctrl->delete_work))
- return true;
-
- return false;
-}
-
-static int
-__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
-{
- return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0;
-}
-
-/*
- * Request from nvme core layer to delete the controller
- */
-static int
-nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
-{
- struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
- int ret;
-
- if (!kref_get_unless_zero(&ctrl->ctrl.kref))
- return -EBUSY;
-
- ret = __nvme_fc_del_ctrl(ctrl);
-
- if (!ret)
- flush_workqueue(nvme_wq);
-
- nvme_put_ctrl(&ctrl->ctrl);
-
- return ret;
}
static void
nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
{
- /* If we are resetting/deleting then do nothing */
- if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
- WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
- ctrl->ctrl.state == NVME_CTRL_LIVE);
- return;
- }
+ struct nvme_fc_rport *rport = ctrl->rport;
+ struct nvme_fc_remote_port *portptr = &rport->remoteport;
+ unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ;
+ bool recon = true;
- dev_info(ctrl->ctrl.device,
- "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
- ctrl->cnum, status);
+ if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING)
+ return;
- if (nvmf_should_reconnect(&ctrl->ctrl)) {
+ if (portptr->port_state == FC_OBJSTATE_ONLINE)
dev_info(ctrl->ctrl.device,
- "NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
- ctrl->cnum, ctrl->ctrl.opts->reconnect_delay);
- queue_delayed_work(nvme_wq, &ctrl->connect_work,
- ctrl->ctrl.opts->reconnect_delay * HZ);
+ "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
+ ctrl->cnum, status);
+ else if (time_after_eq(jiffies, rport->dev_loss_end))
+ recon = false;
+
+ if (recon && nvmf_should_reconnect(&ctrl->ctrl)) {
+ if (portptr->port_state == FC_OBJSTATE_ONLINE)
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: Reconnect attempt in %ld "
+ "seconds\n",
+ ctrl->cnum, recon_delay / HZ);
+ else if (time_after(jiffies + recon_delay, rport->dev_loss_end))
+ recon_delay = rport->dev_loss_end - jiffies;
+
+ queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay);
} else {
- dev_warn(ctrl->ctrl.device,
+ if (portptr->port_state == FC_OBJSTATE_ONLINE)
+ dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: Max reconnect attempts (%d) "
"reached. Removing controller\n",
ctrl->cnum, ctrl->ctrl.nr_reconnects);
- WARN_ON(__nvme_fc_schedule_delete_work(ctrl));
+ else
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: dev_loss_tmo (%d) expired "
+ "while waiting for remoteport connectivity. "
+ "Removing controller\n", ctrl->cnum,
+ portptr->dev_loss_tmo);
+ WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
}
}
@@ -2675,15 +2972,28 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
int ret;
nvme_stop_ctrl(&ctrl->ctrl);
+
/* will block will waiting for io to terminate */
nvme_fc_delete_association(ctrl);
- ret = nvme_fc_create_association(ctrl);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ dev_err(ctrl->ctrl.device,
+ "NVME-FC{%d}: error_recovery: Couldn't change state "
+ "to RECONNECTING\n", ctrl->cnum);
+ return;
+ }
+
+ if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
+ ret = nvme_fc_create_association(ctrl);
+ else
+ ret = -ENOTCONN;
+
if (ret)
nvme_fc_reconnect_or_delete(ctrl, ret);
else
dev_info(ctrl->ctrl.device,
- "NVME-FC{%d}: controller reset complete\n", ctrl->cnum);
+ "NVME-FC{%d}: controller reset complete\n",
+ ctrl->cnum);
}
static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
@@ -2695,8 +3005,9 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.reg_write32 = nvmf_reg_write32,
.free_ctrl = nvme_fc_nvme_ctrl_freed,
.submit_async_event = nvme_fc_submit_async_event,
- .delete_ctrl = nvme_fc_del_nvme_ctrl,
+ .delete_ctrl = nvme_fc_delete_ctrl,
.get_address = nvmf_get_address,
+ .reinit_request = nvme_fc_reinit_request,
};
static void
@@ -2728,6 +3039,33 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = {
};
+/*
+ * Fails a controller request if it matches an existing controller
+ * (association) with the same tuple:
+ * <Host NQN, Host ID, local FC port, remote FC port, SUBSYS NQN>
+ *
+ * The ports don't need to be compared as they are intrinsically
+ * already matched by the port pointers supplied.
+ */
+static bool
+nvme_fc_existing_controller(struct nvme_fc_rport *rport,
+ struct nvmf_ctrl_options *opts)
+{
+ struct nvme_fc_ctrl *ctrl;
+ unsigned long flags;
+ bool found = false;
+
+ spin_lock_irqsave(&rport->lock, flags);
+ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+ found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts);
+ if (found)
+ break;
+ }
+ spin_unlock_irqrestore(&rport->lock, flags);
+
+ return found;
+}
+
static struct nvme_ctrl *
nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
@@ -2742,6 +3080,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_fail;
}
+ if (!opts->duplicate_connect &&
+ nvme_fc_existing_controller(rport, opts)) {
+ ret = -EALREADY;
+ goto out_fail;
+ }
+
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl) {
ret = -ENOMEM;
@@ -2760,12 +3104,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->rport = rport;
ctrl->dev = lport->dev;
ctrl->cnum = idx;
+ ctrl->assoc_active = false;
init_waitqueue_head(&ctrl->ioabort_wait);
get_device(ctrl->dev);
kref_init(&ctrl->ref);
- INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
spin_lock_init(&ctrl->lock);
@@ -2787,7 +3131,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
- ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
+ ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
@@ -2797,6 +3141,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->admin_tag_set.driver_data = ctrl;
ctrl->admin_tag_set.nr_hw_queues = 1;
ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+ ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
if (ret)
@@ -2878,7 +3223,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
return ERR_PTR(ret);
}
- kref_get(&ctrl->ctrl.kref);
+ nvme_get_ctrl(&ctrl->ctrl);
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
@@ -3026,7 +3371,50 @@ static struct nvmf_transport_ops nvme_fc_transport = {
static int __init nvme_fc_init_module(void)
{
- return nvmf_register_transport(&nvme_fc_transport);
+ int ret;
+
+ /*
+ * NOTE:
+ * It is expected that in the future the kernel will combine
+ * the FC-isms that are currently under scsi and now being
+ * added to by NVME into a new standalone FC class. The SCSI
+ * and NVME protocols and their devices would be under this
+ * new FC class.
+ *
+ * As we need something to post FC-specific udev events to,
+ * specifically for nvme probe events, start by creating the
+ * new device class. When the new standalone FC class is
+ * put in place, this code will move to a more generic
+ * location for the class.
+ */
+ fc_class = class_create(THIS_MODULE, "fc");
+ if (IS_ERR(fc_class)) {
+ pr_err("couldn't register class fc\n");
+ return PTR_ERR(fc_class);
+ }
+
+ /*
+ * Create a device for the FC-centric udev events
+ */
+ fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL,
+ "fc_udev_device");
+ if (IS_ERR(fc_udev_device)) {
+ pr_err("couldn't create fc_udev device!\n");
+ ret = PTR_ERR(fc_udev_device);
+ goto out_destroy_class;
+ }
+
+ ret = nvmf_register_transport(&nvme_fc_transport);
+ if (ret)
+ goto out_destroy_device;
+
+ return 0;
+
+out_destroy_device:
+ device_destroy(fc_class, MKDEV(0, 0));
+out_destroy_class:
+ class_destroy(fc_class);
+ return ret;
}
static void __exit nvme_fc_exit_module(void)
@@ -3039,6 +3427,9 @@ static void __exit nvme_fc_exit_module(void)
ida_destroy(&nvme_fc_local_port_cnt);
ida_destroy(&nvme_fc_ctrl_cnt);
+
+ device_destroy(fc_class, MKDEV(0, 0));
+ class_destroy(fc_class);
}
module_init(nvme_fc_init_module);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 1f79e3f..ba3d7f3 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -305,7 +305,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
int ret;
c.identity.opcode = nvme_nvm_admin_identity;
- c.identity.nsid = cpu_to_le32(ns->ns_id);
+ c.identity.nsid = cpu_to_le32(ns->head->ns_id);
c.identity.chnl_off = 0;
nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL);
@@ -344,7 +344,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
int ret = 0;
c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
- c.l2p.nsid = cpu_to_le32(ns->ns_id);
+ c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
entries = kmalloc(len, GFP_KERNEL);
if (!entries)
return -ENOMEM;
@@ -402,7 +402,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
int ret = 0;
c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl;
- c.get_bb.nsid = cpu_to_le32(ns->ns_id);
+ c.get_bb.nsid = cpu_to_le32(ns->head->ns_id);
c.get_bb.spba = cpu_to_le64(ppa.ppa);
bb_tbl = kzalloc(tblsz, GFP_KERNEL);
@@ -452,7 +452,7 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas,
int ret = 0;
c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl;
- c.set_bb.nsid = cpu_to_le32(ns->ns_id);
+ c.set_bb.nsid = cpu_to_le32(ns->head->ns_id);
c.set_bb.spba = cpu_to_le64(ppas->ppa);
c.set_bb.nlb = cpu_to_le16(nr_ppas - 1);
c.set_bb.value = type;
@@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
struct nvme_nvm_command *c)
{
c->ph_rw.opcode = rqd->opcode;
- c->ph_rw.nsid = cpu_to_le32(ns->ns_id);
+ c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id);
c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa);
c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
c->ph_rw.control = cpu_to_le16(rqd->flags);
@@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
blk_mq_free_request(rq);
}
-static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+static struct request *nvme_nvm_alloc_request(struct request_queue *q,
+ struct nvm_rq *rqd,
+ struct nvme_nvm_command *cmd)
{
- struct request_queue *q = dev->q;
struct nvme_ns *ns = q->queuedata;
struct request *rq;
- struct bio *bio = rqd->bio;
- struct nvme_nvm_command *cmd;
-
- cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
- if (!cmd)
- return -ENOMEM;
nvme_nvm_rqtocmd(rqd, ns, cmd);
rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY);
- if (IS_ERR(rq)) {
- kfree(cmd);
- return PTR_ERR(rq);
- }
+ if (IS_ERR(rq))
+ return rq;
+
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
- if (bio) {
- blk_init_request_from_bio(rq, bio);
+ if (rqd->bio) {
+ blk_init_request_from_bio(rq, rqd->bio);
} else {
rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
rq->__data_len = 0;
}
+ return rq;
+}
+
+static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+ struct request_queue *q = dev->q;
+ struct nvme_nvm_command *cmd;
+ struct request *rq;
+
+ cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ rq = nvme_nvm_alloc_request(q, rqd, cmd);
+ if (IS_ERR(rq)) {
+ kfree(cmd);
+ return PTR_ERR(rq);
+ }
+
rq->end_io_data = rqd;
blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
@@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
return 0;
}
+static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+ struct request_queue *q = dev->q;
+ struct request *rq;
+ struct nvme_nvm_command cmd;
+ int ret = 0;
+
+ memset(&cmd, 0, sizeof(struct nvme_nvm_command));
+
+ rq = nvme_nvm_alloc_request(q, rqd, &cmd);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ /* I/Os can fail and the error is signaled through rqd. Callers must
+ * handle the error accordingly.
+ */
+ blk_execute_rq(q, NULL, rq, 0);
+ if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+
+ rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
+ rqd->error = nvme_req(rq)->status;
+
+ blk_mq_free_request(rq);
+
+ return ret;
+}
+
static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
{
struct nvme_ns *ns = nvmdev->q->queuedata;
@@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
.set_bb_tbl = nvme_nvm_set_bb_tbl,
.submit_io = nvme_nvm_submit_io,
+ .submit_io_sync = nvme_nvm_submit_io_sync,
.create_dma_pool = nvme_nvm_create_dma_pool,
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
@@ -600,8 +642,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
rq->timeout = timeout ? timeout : ADMIN_TIMEOUT;
- rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
-
if (ppa_buf && ppa_len) {
ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma);
if (!ppa_list) {
@@ -691,7 +731,7 @@ static int nvme_nvm_submit_vio(struct nvme_ns *ns,
memset(&c, 0, sizeof(c));
c.ph_rw.opcode = vio.opcode;
- c.ph_rw.nsid = cpu_to_le32(ns->ns_id);
+ c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id);
c.ph_rw.control = cpu_to_le16(vio.control);
c.ph_rw.length = cpu_to_le16(vio.nppas);
@@ -728,7 +768,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
memset(&c, 0, sizeof(c));
c.common.opcode = vcmd.opcode;
- c.common.nsid = cpu_to_le32(ns->ns_id);
+ c.common.nsid = cpu_to_le32(ns->head->ns_id);
c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2);
c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
/* cdw11-12 */
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
new file mode 100644
index 0000000..78d9215
--- /dev/null
+++ b/drivers/nvme/host/multipath.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/moduleparam.h>
+#include "nvme.h"
+
+static bool multipath = true;
+module_param(multipath, bool, 0644);
+MODULE_PARM_DESC(multipath,
+ "turn on native support for multiple controllers per subsystem");
+
+void nvme_failover_req(struct request *req)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ns->head->requeue_lock, flags);
+ blk_steal_bios(&ns->head->requeue_list, req);
+ spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+ blk_mq_end_request(req, 0);
+
+ nvme_reset_ctrl(ns->ctrl);
+ kblockd_schedule_work(&ns->head->requeue_work);
+}
+
+bool nvme_req_needs_failover(struct request *req)
+{
+ if (!(req->cmd_flags & REQ_NVME_MPATH))
+ return false;
+
+ switch (nvme_req(req)->status & 0x7ff) {
+ /*
+ * Generic command status:
+ */
+ case NVME_SC_INVALID_OPCODE:
+ case NVME_SC_INVALID_FIELD:
+ case NVME_SC_INVALID_NS:
+ case NVME_SC_LBA_RANGE:
+ case NVME_SC_CAP_EXCEEDED:
+ case NVME_SC_RESERVATION_CONFLICT:
+ return false;
+
+ /*
+ * I/O command set specific error. Unfortunately these values are
+ * reused for fabrics commands, but those should never get here.
+ */
+ case NVME_SC_BAD_ATTRIBUTES:
+ case NVME_SC_INVALID_PI:
+ case NVME_SC_READ_ONLY:
+ case NVME_SC_ONCS_NOT_SUPPORTED:
+ WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
+ nvme_fabrics_command);
+ return false;
+
+ /*
+ * Media and Data Integrity Errors:
+ */
+ case NVME_SC_WRITE_FAULT:
+ case NVME_SC_READ_ERROR:
+ case NVME_SC_GUARD_CHECK:
+ case NVME_SC_APPTAG_CHECK:
+ case NVME_SC_REFTAG_CHECK:
+ case NVME_SC_COMPARE_FAILED:
+ case NVME_SC_ACCESS_DENIED:
+ case NVME_SC_UNWRITTEN_BLOCK:
+ return false;
+ }
+
+ /* Everything else could be a path failure, so should be retried */
+ return true;
+}
+
+void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (ns->head->disk)
+ kblockd_schedule_work(&ns->head->requeue_work);
+ }
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *ns;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (ns->ctrl->state == NVME_CTRL_LIVE) {
+ rcu_assign_pointer(head->current_path, ns);
+ return ns;
+ }
+ }
+
+ return NULL;
+}
+
+inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+
+ if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+ ns = __nvme_find_path(head);
+ return ns;
+}
+
+static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
+ struct bio *bio)
+{
+ struct nvme_ns_head *head = q->queuedata;
+ struct device *dev = disk_to_dev(head->disk);
+ struct nvme_ns *ns;
+ blk_qc_t ret = BLK_QC_T_NONE;
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = nvme_find_path(head);
+ if (likely(ns)) {
+ bio->bi_disk = ns->disk;
+ bio->bi_opf |= REQ_NVME_MPATH;
+ ret = direct_make_request(bio);
+ } else if (!list_empty_careful(&head->list)) {
+ dev_warn_ratelimited(dev, "no path available - requeing I/O\n");
+
+ spin_lock_irq(&head->requeue_lock);
+ bio_list_add(&head->requeue_list, bio);
+ spin_unlock_irq(&head->requeue_lock);
+ } else {
+ dev_warn_ratelimited(dev, "no path - failing I/O\n");
+
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ }
+
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
+static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
+{
+ struct nvme_ns_head *head = q->queuedata;
+ struct nvme_ns *ns;
+ bool found = false;
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = srcu_dereference(head->current_path, &head->srcu);
+ if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+ found = ns->queue->poll_fn(q, qc);
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return found;
+}
+
+static void nvme_requeue_work(struct work_struct *work)
+{
+ struct nvme_ns_head *head =
+ container_of(work, struct nvme_ns_head, requeue_work);
+ struct bio *bio, *next;
+
+ spin_lock_irq(&head->requeue_lock);
+ next = bio_list_get(&head->requeue_list);
+ spin_unlock_irq(&head->requeue_lock);
+
+ while ((bio = next) != NULL) {
+ next = bio->bi_next;
+ bio->bi_next = NULL;
+
+ /*
+ * Reset disk to the mpath node and resubmit to select a new
+ * path.
+ */
+ bio->bi_disk = head->disk;
+ generic_make_request(bio);
+ }
+}
+
+int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
+{
+ struct request_queue *q;
+ bool vwc = false;
+
+ bio_list_init(&head->requeue_list);
+ spin_lock_init(&head->requeue_lock);
+ INIT_WORK(&head->requeue_work, nvme_requeue_work);
+
+ /*
+ * Add a multipath node if the subsystems supports multiple controllers.
+ * We also do this for private namespaces as the namespace sharing data could
+ * change after a rescan.
+ */
+ if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
+ return 0;
+
+ q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
+ if (!q)
+ goto out;
+ q->queuedata = head;
+ blk_queue_make_request(q, nvme_ns_head_make_request);
+ q->poll_fn = nvme_ns_head_poll;
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+ /* set to a default value for 512 until disk is validated */
+ blk_queue_logical_block_size(q, 512);
+
+ /* we need to propagate up the VMC settings */
+ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+ vwc = true;
+ blk_queue_write_cache(q, vwc, vwc);
+
+ head->disk = alloc_disk(0);
+ if (!head->disk)
+ goto out_cleanup_queue;
+ head->disk->fops = &nvme_ns_head_ops;
+ head->disk->private_data = head;
+ head->disk->queue = q;
+ head->disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(head->disk->disk_name, "nvme%dn%d",
+ ctrl->subsys->instance, head->instance);
+ return 0;
+
+out_cleanup_queue:
+ blk_cleanup_queue(q);
+out:
+ return -ENOMEM;
+}
+
+void nvme_mpath_add_disk(struct nvme_ns_head *head)
+{
+ if (!head->disk)
+ return;
+ device_add_disk(&head->subsys->dev, head->disk);
+ if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
+ &nvme_ns_id_attr_group))
+ pr_warn("%s: failed to create sysfs group for identification\n",
+ head->disk->disk_name);
+}
+
+void nvme_mpath_add_disk_links(struct nvme_ns *ns)
+{
+ struct kobject *slave_disk_kobj, *holder_disk_kobj;
+
+ if (!ns->head->disk)
+ return;
+
+ slave_disk_kobj = &disk_to_dev(ns->disk)->kobj;
+ if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj,
+ kobject_name(slave_disk_kobj)))
+ return;
+
+ holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj;
+ if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj,
+ kobject_name(holder_disk_kobj)))
+ sysfs_remove_link(ns->head->disk->slave_dir,
+ kobject_name(slave_disk_kobj));
+}
+
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+{
+ if (!head->disk)
+ return;
+ sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
+ &nvme_ns_id_attr_group);
+ del_gendisk(head->disk);
+ blk_set_queue_dying(head->disk->queue);
+ /* make sure all pending bios are cleaned up */
+ kblockd_schedule_work(&head->requeue_work);
+ flush_work(&head->requeue_work);
+ blk_cleanup_queue(head->disk->queue);
+ put_disk(head->disk);
+}
+
+void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
+{
+ if (!ns->head->disk)
+ return;
+
+ sysfs_remove_link(ns->disk->part0.holder_dir,
+ kobject_name(&disk_to_dev(ns->head->disk)->kobj));
+ sysfs_remove_link(ns->head->disk->slave_dir,
+ kobject_name(&disk_to_dev(ns->disk)->kobj));
+}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d3f3c44..c0873a6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -15,16 +15,17 @@
#define _NVME_H
#include <linux/nvme.h>
+#include <linux/cdev.h>
#include <linux/pci.h>
#include <linux/kref.h>
#include <linux/blk-mq.h>
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
-extern unsigned char nvme_io_timeout;
+extern unsigned int nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
-extern unsigned char admin_timeout;
+extern unsigned int admin_timeout;
#define ADMIN_TIMEOUT (admin_timeout * HZ)
#define NVME_DEFAULT_KATO 5
@@ -94,6 +95,11 @@ struct nvme_request {
u16 status;
};
+/*
+ * Mark a bio as coming in through the mpath node.
+ */
+#define REQ_NVME_MPATH REQ_DRV
+
enum {
NVME_REQ_CANCELLED = (1 << 0),
};
@@ -127,24 +133,23 @@ struct nvme_ctrl {
struct request_queue *admin_q;
struct request_queue *connect_q;
struct device *dev;
- struct kref kref;
int instance;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
struct mutex namespaces_mutex;
+ struct device ctrl_device;
struct device *device; /* char device */
- struct list_head node;
- struct ida ns_ida;
+ struct cdev cdev;
struct work_struct reset_work;
+ struct work_struct delete_work;
+
+ struct nvme_subsystem *subsys;
+ struct list_head subsys_entry;
struct opal_dev *opal_dev;
char name[12];
- char serial[20];
- char model[40];
- char firmware_rev[8];
- char subnqn[NVMF_NQN_SIZE];
u16 cntlid;
u32 ctrl_config;
@@ -155,23 +160,23 @@ struct nvme_ctrl {
u32 page_size;
u32 max_hw_sectors;
u16 oncs;
- u16 vid;
u16 oacs;
u16 nssa;
u16 nr_streams;
atomic_t abort_limit;
- u8 event_limit;
u8 vwc;
u32 vs;
u32 sgls;
u16 kas;
u8 npss;
u8 apsta;
+ u32 aen_result;
unsigned int shutdown_timeout;
unsigned int kato;
bool subsystem;
unsigned long quirks;
struct nvme_id_power_state psd[32];
+ struct nvme_effects_log *effects;
struct work_struct scan_work;
struct work_struct async_event_work;
struct delayed_work ka_work;
@@ -197,21 +202,72 @@ struct nvme_ctrl {
struct nvmf_ctrl_options *opts;
};
+struct nvme_subsystem {
+ int instance;
+ struct device dev;
+ /*
+ * Because we unregister the device on the last put we need
+ * a separate refcount.
+ */
+ struct kref ref;
+ struct list_head entry;
+ struct mutex lock;
+ struct list_head ctrls;
+ struct list_head nsheads;
+ char subnqn[NVMF_NQN_SIZE];
+ char serial[20];
+ char model[40];
+ char firmware_rev[8];
+ u8 cmic;
+ u16 vendor_id;
+ struct ida ns_ida;
+};
+
+/*
+ * Container structure for uniqueue namespace identifiers.
+ */
+struct nvme_ns_ids {
+ u8 eui64[8];
+ u8 nguid[16];
+ uuid_t uuid;
+};
+
+/*
+ * Anchor structure for namespaces. There is one for each namespace in a
+ * NVMe subsystem that any of our controllers can see, and the namespace
+ * structure for each controller is chained of it. For private namespaces
+ * there is a 1:1 relation to our namespace structures, that is ->list
+ * only ever has a single entry for private namespaces.
+ */
+struct nvme_ns_head {
+#ifdef CONFIG_NVME_MULTIPATH
+ struct gendisk *disk;
+ struct nvme_ns __rcu *current_path;
+ struct bio_list requeue_list;
+ spinlock_t requeue_lock;
+ struct work_struct requeue_work;
+#endif
+ struct list_head list;
+ struct srcu_struct srcu;
+ struct nvme_subsystem *subsys;
+ unsigned ns_id;
+ struct nvme_ns_ids ids;
+ struct list_head entry;
+ struct kref ref;
+ int instance;
+};
+
struct nvme_ns {
struct list_head list;
struct nvme_ctrl *ctrl;
struct request_queue *queue;
struct gendisk *disk;
+ struct list_head siblings;
struct nvm_dev *ndev;
struct kref kref;
- int instance;
+ struct nvme_ns_head *head;
- u8 eui[8];
- u8 nguid[16];
- uuid_t uuid;
-
- unsigned ns_id;
int lba_shift;
u16 ms;
u16 sgs;
@@ -234,9 +290,10 @@ struct nvme_ctrl_ops {
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
void (*free_ctrl)(struct nvme_ctrl *ctrl);
- void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx);
- int (*delete_ctrl)(struct nvme_ctrl *ctrl);
+ void (*submit_async_event)(struct nvme_ctrl *ctrl);
+ void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+ int (*reinit_request)(void *data, struct request *rq);
};
static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
@@ -278,6 +335,16 @@ static inline void nvme_end_request(struct request *req, __le16 status,
blk_mq_complete_request(req);
}
+static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl)
+{
+ get_device(ctrl->device);
+}
+
+static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
+{
+ put_device(ctrl->device);
+}
+
void nvme_complete_rq(struct request *req);
void nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
@@ -299,10 +366,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send);
-#define NVME_NR_AERS 1
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
union nvme_result *res);
-void nvme_queue_async_events(struct nvme_ctrl *ctrl);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
@@ -311,21 +376,79 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
void nvme_start_freeze(struct nvme_ctrl *ctrl);
+int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
- struct nvme_command *cmd, unsigned int flags, int qid);
+ struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- unsigned timeout, int qid, int at_head, int flags);
+ unsigned timeout, int qid, int at_head,
+ blk_mq_req_flags_t flags);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
+int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
+
+extern const struct attribute_group nvme_ns_id_attr_group;
+extern const struct block_device_operations nvme_ns_head_ops;
+
+#ifdef CONFIG_NVME_MULTIPATH
+void nvme_failover_req(struct request *req);
+bool nvme_req_needs_failover(struct request *req);
+void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
+int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
+void nvme_mpath_add_disk(struct nvme_ns_head *head);
+void nvme_mpath_add_disk_links(struct nvme_ns *ns);
+void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+void nvme_mpath_remove_disk_links(struct nvme_ns *ns);
+
+static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+ struct nvme_ns_head *head = ns->head;
+
+ if (head && ns == srcu_dereference(head->current_path, &head->srcu))
+ rcu_assign_pointer(head->current_path, NULL);
+}
+struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+#else
+static inline void nvme_failover_req(struct request *req)
+{
+}
+static inline bool nvme_req_needs_failover(struct request *req)
+{
+ return false;
+}
+static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
+{
+}
+static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head)
+{
+ return 0;
+}
+static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
+{
+}
+static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+{
+}
+static inline void nvme_mpath_add_disk_links(struct nvme_ns *ns)
+{
+}
+static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
+{
+}
+static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+}
+#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3f5a04c..a11cfd4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -13,7 +13,6 @@
*/
#include <linux/aer.h>
-#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
@@ -26,12 +25,9 @@
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
-#include <linux/poison.h>
#include <linux/t10-pi.h>
-#include <linux/timer.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
-#include <asm/unaligned.h>
#include <linux/sed-opal.h>
#include "nvme.h"
@@ -39,11 +35,7 @@
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
-/*
- * We handle AEN commands ourselves and don't even let the
- * block layer know about them.
- */
-#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS)
+#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
@@ -57,6 +49,12 @@ module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
+static unsigned int sgl_threshold = SZ_32K;
+module_param(sgl_threshold, uint, 0644);
+MODULE_PARM_DESC(sgl_threshold,
+ "Use SGLs when average request segment size is larger or equal to "
+ "this size. Use 0 to disable SGLs.");
+
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
.set = io_queue_depth_set,
@@ -178,6 +176,7 @@ struct nvme_queue {
struct nvme_iod {
struct nvme_request req;
struct nvme_queue *nvmeq;
+ bool use_sgl;
int aborted;
int npages; /* In the PRP list. 0 means small pool in use */
int nents; /* Used in scatterlist */
@@ -331,17 +330,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
-static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
- unsigned int size, unsigned int nseg)
+/*
+ * Calculates the number of pages needed for the SGL segments. For example a 4k
+ * page can accommodate 256 SGL descriptors.
+ */
+static int nvme_pci_npages_sgl(unsigned int num_seg)
{
- return sizeof(__le64 *) * nvme_npages(size, dev) +
- sizeof(struct scatterlist) * nseg;
+ return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
}
-static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
+ unsigned int size, unsigned int nseg, bool use_sgl)
{
- return sizeof(struct nvme_iod) +
- nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
+ size_t alloc_size;
+
+ if (use_sgl)
+ alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
+ else
+ alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
+
+ return alloc_size + sizeof(struct scatterlist) * nseg;
+}
+
+static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
+{
+ unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
+ NVME_INT_BYTES(dev), NVME_INT_PAGES,
+ use_sgl);
+
+ return sizeof(struct nvme_iod) + alloc_size;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -425,10 +442,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
nvmeq->sq_tail = tail;
}
-static __le64 **iod_list(struct request *req)
+static void **nvme_pci_iod_list(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
+ return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}
static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
@@ -438,7 +455,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
unsigned int size = blk_rq_payload_bytes(rq);
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
- iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
+ size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
+ iod->use_sgl);
+
+ iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
if (!iod->sg)
return BLK_STS_RESOURCE;
} else {
@@ -456,18 +476,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- const int last_prp = dev->ctrl.page_size / 8 - 1;
+ const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
+ dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
+
int i;
- __le64 **list = iod_list(req);
- dma_addr_t prp_dma = iod->first_dma;
if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+ dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+ dma_addr);
+
for (i = 0; i < iod->npages; i++) {
- __le64 *prp_list = list[i];
- dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
- prp_dma = next_prp_dma;
+ void *addr = nvme_pci_iod_list(req)[i];
+
+ if (iod->use_sgl) {
+ struct nvme_sgl_desc *sg_list = addr;
+
+ next_dma_addr =
+ le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
+ } else {
+ __le64 *prp_list = addr;
+
+ next_dma_addr = le64_to_cpu(prp_list[last_prp]);
+ }
+
+ dma_pool_free(dev->prp_page_pool, addr, dma_addr);
+ dma_addr = next_dma_addr;
}
if (iod->sg != iod->inline_sg)
@@ -555,7 +588,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
}
}
-static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+ struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct dma_pool *pool;
@@ -566,14 +600,16 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
u32 page_size = dev->ctrl.page_size;
int offset = dma_addr & (page_size - 1);
__le64 *prp_list;
- __le64 **list = iod_list(req);
+ void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
int nprps, i;
+ iod->use_sgl = false;
+
length -= (page_size - offset);
if (length <= 0) {
iod->first_dma = 0;
- return BLK_STS_OK;
+ goto done;
}
dma_len -= (page_size - offset);
@@ -587,7 +623,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
if (length <= page_size) {
iod->first_dma = dma_addr;
- return BLK_STS_OK;
+ goto done;
}
nprps = DIV_ROUND_UP(length, page_size);
@@ -634,6 +670,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
dma_len = sg_dma_len(sg);
}
+done:
+ cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+
return BLK_STS_OK;
bad_sgl:
@@ -643,6 +683,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
return BLK_STS_IOERR;
}
+static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
+ struct scatterlist *sg)
+{
+ sge->addr = cpu_to_le64(sg_dma_address(sg));
+ sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->type = NVME_SGL_FMT_DATA_DESC << 4;
+}
+
+static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
+ dma_addr_t dma_addr, int entries)
+{
+ sge->addr = cpu_to_le64(dma_addr);
+ if (entries < SGES_PER_PAGE) {
+ sge->length = cpu_to_le32(entries * sizeof(*sge));
+ sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
+ } else {
+ sge->length = cpu_to_le32(PAGE_SIZE);
+ sge->type = NVME_SGL_FMT_SEG_DESC << 4;
+ }
+}
+
+static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+ struct request *req, struct nvme_rw_command *cmd)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ int length = blk_rq_payload_bytes(req);
+ struct dma_pool *pool;
+ struct nvme_sgl_desc *sg_list;
+ struct scatterlist *sg = iod->sg;
+ int entries = iod->nents, i = 0;
+ dma_addr_t sgl_dma;
+
+ iod->use_sgl = true;
+
+ /* setting the transfer type as SGL */
+ cmd->flags = NVME_CMD_SGL_METABUF;
+
+ if (length == sg_dma_len(sg)) {
+ nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+ return BLK_STS_OK;
+ }
+
+ if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+ pool = dev->prp_small_pool;
+ iod->npages = 0;
+ } else {
+ pool = dev->prp_page_pool;
+ iod->npages = 1;
+ }
+
+ sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+ if (!sg_list) {
+ iod->npages = -1;
+ return BLK_STS_RESOURCE;
+ }
+
+ nvme_pci_iod_list(req)[0] = sg_list;
+ iod->first_dma = sgl_dma;
+
+ nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
+
+ do {
+ if (i == SGES_PER_PAGE) {
+ struct nvme_sgl_desc *old_sg_desc = sg_list;
+ struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
+
+ sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+ if (!sg_list)
+ return BLK_STS_RESOURCE;
+
+ i = 0;
+ nvme_pci_iod_list(req)[iod->npages++] = sg_list;
+ sg_list[i++] = *link;
+ nvme_pci_sgl_set_seg(link, sgl_dma, entries);
+ }
+
+ nvme_pci_sgl_set_data(&sg_list[i++], sg);
+
+ length -= sg_dma_len(sg);
+ sg = sg_next(sg);
+ entries--;
+ } while (length > 0);
+
+ WARN_ON(entries > 0);
+ return BLK_STS_OK;
+}
+
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int avg_seg_size;
+
+ avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
+ blk_rq_nr_phys_segments(req));
+
+ if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
+ return false;
+ if (!iod->nvmeq->qid)
+ return false;
+ if (!sgl_threshold || avg_seg_size < sgl_threshold)
+ return false;
+ return true;
+}
+
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
@@ -662,7 +806,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
DMA_ATTR_NO_WARN))
goto out;
- ret = nvme_setup_prps(dev, req);
+ if (nvme_pci_use_sgls(dev, req))
+ ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ else
+ ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+
if (ret != BLK_STS_OK)
goto out_unmap;
@@ -682,8 +830,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
goto out_unmap;
}
- cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
- cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
if (blk_integrity_rq(req))
cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
return BLK_STS_OK;
@@ -804,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
* for them but rather special case them here.
*/
if (unlikely(nvmeq->qid == 0 &&
- cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) {
+ cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
nvme_complete_async_event(&nvmeq->dev->ctrl,
cqe->status, &cqe->result);
return;
@@ -897,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return __nvme_poll(nvmeq, tag);
}
-static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
+static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
struct nvme_queue *nvmeq = dev->queues[0];
@@ -905,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
- c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx;
+ c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
spin_lock_irq(&nvmeq->q_lock);
__nvme_submit_cmd(nvmeq, &c);
@@ -930,7 +1076,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
/*
- * Note: we (ab)use the fact the the prp fields survive if no data
+ * Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
memset(&c, 0, sizeof(c));
@@ -951,7 +1097,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
int flags = NVME_QUEUE_PHYS_CONTIG;
/*
- * Note: we (ab)use the fact the the prp fields survive if no data
+ * Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
memset(&c, 0, sizeof(c));
@@ -1372,14 +1518,10 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.ops = &nvme_mq_admin_ops;
dev->admin_tagset.nr_hw_queues = 1;
- /*
- * Subtract one to leave an empty queue entry for 'Full Queue'
- * condition. See NVM-Express 1.2 specification, section 4.1.2.
- */
- dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
+ dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(dev->dev);
- dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
+ dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
dev->admin_tagset.driver_data = dev;
@@ -1906,7 +2048,11 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = nvme_cmd_size(dev);
+ dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
+ if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
+ dev->tagset.cmd_size = max(dev->tagset.cmd_size,
+ nvme_pci_cmd_size(dev, true));
+ }
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
@@ -2132,9 +2278,9 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
{
dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
- kref_get(&dev->ctrl.kref);
+ nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
- if (!schedule_work(&dev->remove_work))
+ if (!queue_work(nvme_wq, &dev->remove_work))
nvme_put_ctrl(&dev->ctrl);
}
@@ -2557,6 +2703,7 @@ static int __init nvme_init(void)
static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
+ flush_workqueue(nvme_wq);
_nvme_check_size();
}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0ebb539..4f9bf2f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -41,17 +41,9 @@
#define NVME_RDMA_MAX_INLINE_SEGMENTS 1
-/*
- * We handle AEN commands ourselves and don't even let the
- * block layer know about them.
- */
-#define NVME_RDMA_NR_AEN_COMMANDS 1
-#define NVME_RDMA_AQ_BLKMQ_DEPTH \
- (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
-
struct nvme_rdma_device {
- struct ib_device *dev;
- struct ib_pd *pd;
+ struct ib_device *dev;
+ struct ib_pd *pd;
struct kref ref;
struct list_head entry;
};
@@ -79,8 +71,8 @@ struct nvme_rdma_request {
};
enum nvme_rdma_queue_flags {
- NVME_RDMA_Q_LIVE = 0,
- NVME_RDMA_Q_DELETING = 1,
+ NVME_RDMA_Q_ALLOCATED = 0,
+ NVME_RDMA_Q_LIVE = 1,
};
struct nvme_rdma_queue {
@@ -105,7 +97,6 @@ struct nvme_rdma_ctrl {
/* other member variables */
struct blk_mq_tag_set tag_set;
- struct work_struct delete_work;
struct work_struct err_work;
struct nvme_rdma_qe async_event_sqe;
@@ -274,6 +265,9 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq)
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int ret = 0;
+ if (WARN_ON_ONCE(!req->mr))
+ return 0;
+
ib_dereg_mr(req->mr);
req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
@@ -434,11 +428,9 @@ out_err:
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
- struct nvme_rdma_device *dev;
- struct ib_device *ibdev;
+ struct nvme_rdma_device *dev = queue->device;
+ struct ib_device *ibdev = dev->dev;
- dev = queue->device;
- ibdev = dev->dev;
rdma_destroy_qp(queue->cm_id);
ib_free_cq(queue->ib_cq);
@@ -493,7 +485,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
return 0;
out_destroy_qp:
- ib_destroy_qp(queue->qp);
+ rdma_destroy_qp(queue->cm_id);
out_destroy_ib_cq:
ib_free_cq(queue->ib_cq);
out_put_dev:
@@ -544,11 +536,11 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
ret = nvme_rdma_wait_for_cm(queue);
if (ret) {
dev_info(ctrl->ctrl.device,
- "rdma_resolve_addr wait failed (%d).\n", ret);
+ "rdma connection establishment failed (%d)\n", ret);
goto out_destroy_cm_id;
}
- clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
+ set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
return 0;
@@ -568,7 +560,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
{
- if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
+ if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
return;
if (nvme_rdma_queue_idx(queue) == 0) {
@@ -676,11 +668,10 @@ out_free_queues:
return ret;
}
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin)
+static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
+ struct blk_mq_tag_set *set)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- struct blk_mq_tag_set *set = admin ?
- &ctrl->admin_tag_set : &ctrl->tag_set;
blk_mq_free_tag_set(set);
nvme_rdma_dev_put(ctrl->device);
@@ -697,7 +688,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set = &ctrl->admin_tag_set;
memset(set, 0, sizeof(*set));
set->ops = &nvme_rdma_admin_mq_ops;
- set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
+ set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
set->numa_node = NUMA_NO_NODE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
@@ -705,6 +696,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = ADMIN_TIMEOUT;
+ set->flags = BLK_MQ_F_NO_SCHED;
} else {
set = &ctrl->tag_set;
memset(set, 0, sizeof(*set));
@@ -748,7 +740,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
nvme_rdma_stop_queue(&ctrl->queues[0]);
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, true);
+ nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
}
nvme_rdma_free_queue(&ctrl->queues[0]);
}
@@ -780,8 +772,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_tagset;
}
} else {
- error = blk_mq_reinit_tagset(&ctrl->admin_tag_set,
- nvme_rdma_reinit_request);
+ error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
if (error)
goto out_free_queue;
}
@@ -825,7 +816,7 @@ out_cleanup_queue:
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, true);
+ nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
out_free_queue:
nvme_rdma_free_queue(&ctrl->queues[0]);
return error;
@@ -837,7 +828,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
nvme_rdma_stop_io_queues(ctrl);
if (remove) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, false);
+ nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
@@ -863,8 +854,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
goto out_free_tag_set;
}
} else {
- ret = blk_mq_reinit_tagset(&ctrl->tag_set,
- nvme_rdma_reinit_request);
+ ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
if (ret)
goto out_free_io_queues;
@@ -883,7 +873,7 @@ out_cleanup_connect_q:
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, false);
+ nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
@@ -922,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
ctrl->ctrl.opts->reconnect_delay * HZ);
} else {
dev_info(ctrl->ctrl.device, "Removing controller...\n");
- queue_work(nvme_wq, &ctrl->delete_work);
+ nvme_delete_ctrl(&ctrl->ctrl);
}
}
@@ -935,10 +925,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
++ctrl->ctrl.nr_reconnects;
- if (ctrl->ctrl.queue_count > 1)
- nvme_rdma_destroy_io_queues(ctrl, false);
-
- nvme_rdma_destroy_admin_queue(ctrl, false);
ret = nvme_rdma_configure_admin_queue(ctrl, false);
if (ret)
goto requeue;
@@ -946,7 +932,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
if (ctrl->ctrl.queue_count > 1) {
ret = nvme_rdma_configure_io_queues(ctrl, false);
if (ret)
- goto requeue;
+ goto destroy_admin;
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
@@ -956,14 +942,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
return;
}
- ctrl->ctrl.nr_reconnects = 0;
-
nvme_start_ctrl(&ctrl->ctrl);
- dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
+ dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
+ ctrl->ctrl.nr_reconnects);
+
+ ctrl->ctrl.nr_reconnects = 0;
return;
+destroy_admin:
+ nvme_rdma_destroy_admin_queue(ctrl, false);
requeue:
dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
ctrl->ctrl.nr_reconnects);
@@ -979,17 +968,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
- nvme_rdma_stop_io_queues(ctrl);
- }
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->queues[0]);
-
- /* We must take care of fastfail/requeue all our inflight requests */
- if (ctrl->ctrl.queue_count > 1)
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
+ nvme_rdma_destroy_io_queues(ctrl, false);
+ }
+
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
+ nvme_rdma_destroy_admin_queue(ctrl, false);
/*
* queues are not a live anymore, so restart the queues to fail fast
@@ -1065,7 +1052,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
if (!blk_rq_bytes(rq))
return;
- if (req->mr->need_inval) {
+ if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) {
res = nvme_rdma_inv_rkey(queue, req);
if (unlikely(res < 0)) {
dev_err(ctrl->ctrl.device,
@@ -1314,7 +1301,7 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
return queue->ctrl->tag_set.tags[queue_idx - 1];
}
-static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
struct nvme_rdma_queue *queue = &ctrl->queues[0];
@@ -1324,14 +1311,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
struct ib_sge sge;
int ret;
- if (WARN_ON_ONCE(aer_idx != 0))
- return;
-
ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
memset(cmd, 0, sizeof(*cmd));
cmd->common.opcode = nvme_admin_async_event;
- cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
+ cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
cmd->common.flags |= NVME_CMD_SGL_METABUF;
nvme_rdma_set_sg_null(cmd);
@@ -1393,7 +1377,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
* for them but rather special case them here.
*/
if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
- cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
+ cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
@@ -1590,6 +1574,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ dev_warn(req->queue->ctrl->ctrl.device,
+ "I/O %d QID %d timeout, reset controller\n",
+ rq->tag, nvme_rdma_queue_idx(req->queue));
+
/* queue error recovery */
nvme_rdma_error_recovery(req->queue->ctrl);
@@ -1767,50 +1755,9 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
nvme_rdma_destroy_admin_queue(ctrl, shutdown);
}
-static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl)
+static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
{
- nvme_remove_namespaces(&ctrl->ctrl);
- nvme_rdma_shutdown_ctrl(ctrl, true);
- nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
-}
-
-static void nvme_rdma_del_ctrl_work(struct work_struct *work)
-{
- struct nvme_rdma_ctrl *ctrl = container_of(work,
- struct nvme_rdma_ctrl, delete_work);
-
- nvme_stop_ctrl(&ctrl->ctrl);
- nvme_rdma_remove_ctrl(ctrl);
-}
-
-static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
-{
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
- return -EBUSY;
-
- if (!queue_work(nvme_wq, &ctrl->delete_work))
- return -EBUSY;
-
- return 0;
-}
-
-static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
- int ret = 0;
-
- /*
- * Keep a reference until all work is flushed since
- * __nvme_rdma_del_ctrl can free the ctrl mem
- */
- if (!kref_get_unless_zero(&ctrl->ctrl.kref))
- return -EBUSY;
- ret = __nvme_rdma_del_ctrl(ctrl);
- if (!ret)
- flush_work(&ctrl->delete_work);
- nvme_put_ctrl(&ctrl->ctrl);
- return ret;
+ nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
}
static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
@@ -1834,7 +1781,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- WARN_ON_ONCE(!changed);
+ if (!changed) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
+ return;
+ }
nvme_start_ctrl(&ctrl->ctrl);
@@ -1842,7 +1793,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
out_fail:
dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
- nvme_rdma_remove_ctrl(ctrl);
+ nvme_remove_namespaces(&ctrl->ctrl);
+ nvme_rdma_shutdown_ctrl(ctrl, true);
+ nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
}
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
@@ -1854,10 +1808,88 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.reg_write32 = nvmf_reg_write32,
.free_ctrl = nvme_rdma_free_ctrl,
.submit_async_event = nvme_rdma_submit_async_event,
- .delete_ctrl = nvme_rdma_del_ctrl,
+ .delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
+ .reinit_request = nvme_rdma_reinit_request,
};
+static inline bool
+__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
+ struct nvmf_ctrl_options *opts)
+{
+ char *stdport = __stringify(NVME_RDMA_IP_PORT);
+
+
+ if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
+ strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
+ return false;
+
+ if (opts->mask & NVMF_OPT_TRSVCID &&
+ ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
+ if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
+ return false;
+ } else if (opts->mask & NVMF_OPT_TRSVCID) {
+ if (strcmp(opts->trsvcid, stdport))
+ return false;
+ } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
+ if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
+ return false;
+ }
+ /* else, it's a match as both have stdport. Fall to next checks */
+
+ /*
+ * checking the local address is rough. In most cases, one
+ * is not specified and the host port is selected by the stack.
+ *
+ * Assume no match if:
+ * local address is specified and address is not the same
+ * local address is not specified but remote is, or vice versa
+ * (admin using specific host_traddr when it matters).
+ */
+ if (opts->mask & NVMF_OPT_HOST_TRADDR &&
+ ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+ if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
+ return false;
+ } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
+ ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
+ return false;
+ /*
+ * if neither controller had an host port specified, assume it's
+ * a match as everything else matched.
+ */
+
+ return true;
+}
+
+/*
+ * Fails a connection request if it matches an existing controller
+ * (association) with the same tuple:
+ * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
+ *
+ * if local address is not specified in the request, it will match an
+ * existing controller with all the other parameters the same and no
+ * local port address specified as well.
+ *
+ * The ports don't need to be compared as they are intrinsically
+ * already matched by the port pointers supplied.
+ */
+static bool
+nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
+{
+ struct nvme_rdma_ctrl *ctrl;
+ bool found = false;
+
+ mutex_lock(&nvme_rdma_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
+ found = __nvme_rdma_options_match(ctrl, opts);
+ if (found)
+ break;
+ }
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
+
+ return found;
+}
+
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
@@ -1894,6 +1926,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
}
}
+ if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
+ ret = -EALREADY;
+ goto out_free_ctrl;
+ }
+
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
0 /* no quirks, we're perfect! */);
if (ret)
@@ -1902,7 +1939,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
INIT_DELAYED_WORK(&ctrl->reconnect_work,
nvme_rdma_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
- INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
@@ -1961,7 +1997,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
- kref_get(&ctrl->ctrl.kref);
+ nvme_get_ctrl(&ctrl->ctrl);
mutex_lock(&nvme_rdma_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
@@ -2006,7 +2042,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
dev_info(ctrl->ctrl.device,
"Removing ctrl: NQN \"%s\", addr %pISp\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
- __nvme_rdma_del_ctrl(ctrl);
+ nvme_delete_ctrl(&ctrl->ctrl);
}
mutex_unlock(&nvme_rdma_ctrl_mutex);
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c4a0bf3..90dcdc4 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -35,17 +35,14 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd)
static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
struct nvme_smart_log *slog)
{
- u16 status;
struct nvmet_ns *ns;
u64 host_reads, host_writes, data_units_read, data_units_written;
- status = NVME_SC_SUCCESS;
ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid);
if (!ns) {
- status = NVME_SC_INVALID_NS;
pr_err("nvmet : Could not find namespace id : %d\n",
le32_to_cpu(req->cmd->get_log_page.nsid));
- goto out;
+ return NVME_SC_INVALID_NS;
}
host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
@@ -58,20 +55,18 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
put_unaligned_le64(host_writes, &slog->host_writes[0]);
put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
nvmet_put_namespace(ns);
-out:
- return status;
+
+ return NVME_SC_SUCCESS;
}
static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
struct nvme_smart_log *slog)
{
- u16 status;
u64 host_reads = 0, host_writes = 0;
u64 data_units_read = 0, data_units_written = 0;
struct nvmet_ns *ns;
struct nvmet_ctrl *ctrl;
- status = NVME_SC_SUCCESS;
ctrl = req->sq->ctrl;
rcu_read_lock();
@@ -91,7 +86,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
put_unaligned_le64(host_writes, &slog->host_writes[0]);
put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
- return status;
+ return NVME_SC_SUCCESS;
}
static u16 nvmet_get_smart_log(struct nvmet_req *req,
@@ -144,10 +139,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
}
smart_log = buf;
status = nvmet_get_smart_log(req, smart_log);
- if (status) {
- memset(buf, '\0', data_len);
+ if (status)
goto err;
- }
break;
case NVME_LOG_FW_SLOT:
/*
@@ -300,7 +293,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
}
/*
- * nuse = ncap = nsze isn't aways true, but we have no way to find
+ * nuse = ncap = nsze isn't always true, but we have no way to find
* that out from the underlying device.
*/
id->ncap = id->nuse = id->nsze =
@@ -424,7 +417,7 @@ out:
}
/*
- * A "mimimum viable" abort implementation: the command is mandatory in the
+ * A "minimum viable" abort implementation: the command is mandatory in the
* spec, but we are not required to do any useful work. We couldn't really
* do a useful abort, so don't bother even with waiting for the command
* to be exectuted and return immediately telling the command to abort
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 645ba7e..b54748a 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -57,6 +57,17 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
return 0;
}
+static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
+{
+ struct nvmet_ns *ns;
+
+ if (list_empty(&subsys->namespaces))
+ return 0;
+
+ ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
+ return ns->nsid;
+}
+
static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
{
return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
@@ -334,6 +345,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
ns->enabled = false;
list_del_rcu(&ns->dev_link);
+ if (ns->nsid == subsys->max_nsid)
+ subsys->max_nsid = nvmet_max_nsid(subsys);
mutex_unlock(&subsys->lock);
/*
@@ -497,6 +510,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
req->ops = ops;
req->sg = NULL;
req->sg_cnt = 0;
+ req->transfer_len = 0;
req->rsp->status = 0;
/* no support for fused commands yet */
@@ -546,6 +560,15 @@ void nvmet_req_uninit(struct nvmet_req *req)
}
EXPORT_SYMBOL_GPL(nvmet_req_uninit);
+void nvmet_req_execute(struct nvmet_req *req)
+{
+ if (unlikely(req->data_len != req->transfer_len))
+ nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
+ else
+ req->execute(req);
+}
+EXPORT_SYMBOL_GPL(nvmet_req_execute);
+
static inline bool nvmet_cc_en(u32 cc)
{
return (cc >> NVME_CC_EN_SHIFT) & 0x1;
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 58e010b..739b8fe 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -76,7 +76,6 @@ struct nvmet_fc_fcp_iod {
dma_addr_t rspdma;
struct scatterlist *data_sg;
int data_sg_cnt;
- u32 total_length;
u32 offset;
enum nvmet_fcp_datadir io_dir;
bool active;
@@ -150,6 +149,7 @@ struct nvmet_fc_tgt_assoc {
struct list_head a_list;
struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1];
struct kref ref;
+ struct work_struct del_work;
};
@@ -232,6 +232,7 @@ static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport);
static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport);
static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
struct nvmet_fc_fcp_iod *fod);
+static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc);
/* *********************** FC-NVME DMA Handling **************************** */
@@ -802,6 +803,16 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport,
return NULL;
}
+static void
+nvmet_fc_delete_assoc(struct work_struct *work)
+{
+ struct nvmet_fc_tgt_assoc *assoc =
+ container_of(work, struct nvmet_fc_tgt_assoc, del_work);
+
+ nvmet_fc_delete_target_assoc(assoc);
+ nvmet_fc_tgt_a_put(assoc);
+}
+
static struct nvmet_fc_tgt_assoc *
nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport)
{
@@ -826,6 +837,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport)
assoc->a_id = idx;
INIT_LIST_HEAD(&assoc->a_list);
kref_init(&assoc->ref);
+ INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc);
while (needrandom) {
get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID);
@@ -1118,8 +1130,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
nvmet_fc_tgtport_put(tgtport);
if (found_ctrl) {
- nvmet_fc_delete_target_assoc(assoc);
- nvmet_fc_tgt_a_put(assoc);
+ schedule_work(&assoc->del_work);
return;
}
@@ -1688,7 +1699,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
u32 page_len, length;
int i = 0;
- length = fod->total_length;
+ length = fod->req.transfer_len;
nent = DIV_ROUND_UP(length, PAGE_SIZE);
sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
if (!sg)
@@ -1777,7 +1788,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
u32 rsn, rspcnt, xfr_length;
if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP)
- xfr_length = fod->total_length;
+ xfr_length = fod->req.transfer_len;
else
xfr_length = fod->offset;
@@ -1803,7 +1814,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
rspcnt = atomic_inc_return(&fod->queue->zrspcnt);
if (!(rspcnt % fod->queue->ersp_ratio) ||
sqe->opcode == nvme_fabrics_command ||
- xfr_length != fod->total_length ||
+ xfr_length != fod->req.transfer_len ||
(le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
(sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head)))
@@ -1880,7 +1891,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC;
tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE,
- (fod->total_length - fod->offset));
+ (fod->req.transfer_len - fod->offset));
fcpreq->transfer_length = tlen;
fcpreq->transferred_length = 0;
fcpreq->fcp_error = 0;
@@ -1894,7 +1905,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
* combined xfr with response.
*/
if ((op == NVMET_FCOP_READDATA) &&
- ((fod->offset + fcpreq->transfer_length) == fod->total_length) &&
+ ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) &&
(tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) {
fcpreq->op = NVMET_FCOP_READDATA_RSP;
nvmet_fc_prep_fcp_rsp(tgtport, fod);
@@ -1974,7 +1985,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
}
fod->offset += fcpreq->transferred_length;
- if (fod->offset != fod->total_length) {
+ if (fod->offset != fod->req.transfer_len) {
spin_lock_irqsave(&fod->flock, flags);
fod->writedataactive = true;
spin_unlock_irqrestore(&fod->flock, flags);
@@ -1986,9 +1997,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
}
/* data transfer complete, resume with nvmet layer */
-
- fod->req.execute(&fod->req);
-
+ nvmet_req_execute(&fod->req);
break;
case NVMET_FCOP_READDATA:
@@ -2011,7 +2020,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
}
fod->offset += fcpreq->transferred_length;
- if (fod->offset != fod->total_length) {
+ if (fod->offset != fod->req.transfer_len) {
/* transfer the next chunk */
nvmet_fc_transfer_fcp_data(tgtport, fod,
NVMET_FCOP_READDATA);
@@ -2148,7 +2157,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done;
- fod->total_length = be32_to_cpu(cmdiu->data_len);
+ fod->req.transfer_len = be32_to_cpu(cmdiu->data_len);
if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) {
fod->io_dir = NVMET_FCP_WRITE;
if (!nvme_is_write(&cmdiu->sqe))
@@ -2159,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
goto transport_error;
} else {
fod->io_dir = NVMET_FCP_NODATA;
- if (fod->total_length)
+ if (fod->req.transfer_len)
goto transport_error;
}
@@ -2167,9 +2176,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
fod->req.rsp = &fod->rspiubuf.cqe;
fod->req.port = fod->queue->port;
- /* ensure nvmet handlers will set cmd handler callback */
- fod->req.execute = NULL;
-
/* clear any response payload */
memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf));
@@ -2189,7 +2195,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
/* keep a running counter of tail position */
atomic_inc(&fod->queue->sqtail);
- if (fod->total_length) {
+ if (fod->req.transfer_len) {
ret = nvmet_fc_alloc_tgt_pgs(fod);
if (ret) {
nvmet_req_complete(&fod->req, ret);
@@ -2212,9 +2218,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
* can invoke the nvmet_layer now. If read data, cmd completion will
* push the data
*/
-
- fod->req.execute(&fod->req);
-
+ nvmet_req_execute(&fod->req);
return;
transport_error:
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 0d4c23d..0a4372a 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -33,18 +33,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
req->ns->blksize_shift;
}
-static void nvmet_inline_bio_init(struct nvmet_req *req)
-{
- struct bio *bio = &req->inline_bio;
-
- bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC);
-}
-
static void nvmet_execute_rw(struct nvmet_req *req)
{
int sg_cnt = req->sg_cnt;
+ struct bio *bio = &req->inline_bio;
struct scatterlist *sg;
- struct bio *bio;
sector_t sector;
blk_qc_t cookie;
int op, op_flags = 0, i;
@@ -66,8 +59,7 @@ static void nvmet_execute_rw(struct nvmet_req *req)
sector = le64_to_cpu(req->cmd->rw.slba);
sector <<= (req->ns->blksize_shift - 9);
- nvmet_inline_bio_init(req);
- bio = &req->inline_bio;
+ bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
bio_set_dev(bio, req->ns->bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
@@ -94,16 +86,14 @@ static void nvmet_execute_rw(struct nvmet_req *req)
cookie = submit_bio(bio);
- blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie);
+ blk_poll(bdev_get_queue(req->ns->bdev), cookie);
}
static void nvmet_execute_flush(struct nvmet_req *req)
{
- struct bio *bio;
-
- nvmet_inline_bio_init(req);
- bio = &req->inline_bio;
+ struct bio *bio = &req->inline_bio;
+ bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
bio_set_dev(bio, req->ns->bdev);
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 92628c4..96d3904 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -23,14 +23,6 @@
#define NVME_LOOP_MAX_SEGMENTS 256
-/*
- * We handle AEN commands ourselves and don't even let the
- * block layer know about them.
- */
-#define NVME_LOOP_NR_AEN_COMMANDS 1
-#define NVME_LOOP_AQ_BLKMQ_DEPTH \
- (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS)
-
struct nvme_loop_iod {
struct nvme_request nvme_req;
struct nvme_command cmd;
@@ -53,7 +45,6 @@ struct nvme_loop_ctrl {
struct nvme_ctrl ctrl;
struct nvmet_ctrl *target_ctrl;
- struct work_struct delete_work;
};
static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -113,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
* for them but rather special case them here.
*/
if (unlikely(nvme_loop_queue_idx(queue) == 0 &&
- cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
+ cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
} else {
@@ -136,7 +127,7 @@ static void nvme_loop_execute_work(struct work_struct *work)
struct nvme_loop_iod *iod =
container_of(work, struct nvme_loop_iod, work);
- iod->req.execute(&iod->req);
+ nvmet_req_execute(&iod->req);
}
static enum blk_eh_timer_return
@@ -185,6 +176,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
iod->req.sg = iod->sg_table.sgl;
iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
+ iod->req.transfer_len = blk_rq_bytes(req);
}
blk_mq_start_request(req);
@@ -193,7 +185,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
{
struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg);
struct nvme_loop_queue *queue = &ctrl->queues[0];
@@ -201,7 +193,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
memset(&iod->cmd, 0, sizeof(iod->cmd));
iod->cmd.common.opcode = nvme_admin_async_event;
- iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH;
+ iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
@@ -357,7 +349,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
- ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH;
+ ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
@@ -365,6 +357,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
ctrl->admin_tag_set.driver_data = ctrl;
ctrl->admin_tag_set.nr_hw_queues = 1;
ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+ ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
ctrl->queues[0].ctrl = ctrl;
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
@@ -438,41 +431,9 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
nvme_loop_destroy_admin_queue(ctrl);
}
-static void nvme_loop_del_ctrl_work(struct work_struct *work)
+static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl)
{
- struct nvme_loop_ctrl *ctrl = container_of(work,
- struct nvme_loop_ctrl, delete_work);
-
- nvme_stop_ctrl(&ctrl->ctrl);
- nvme_remove_namespaces(&ctrl->ctrl);
- nvme_loop_shutdown_ctrl(ctrl);
- nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
-}
-
-static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl)
-{
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
- return -EBUSY;
-
- if (!queue_work(nvme_wq, &ctrl->delete_work))
- return -EBUSY;
-
- return 0;
-}
-
-static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl)
-{
- struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl);
- int ret;
-
- ret = __nvme_loop_del_ctrl(ctrl);
- if (ret)
- return ret;
-
- flush_work(&ctrl->delete_work);
-
- return 0;
+ nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl));
}
static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
@@ -482,7 +443,7 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
mutex_lock(&nvme_loop_ctrl_mutex);
list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) {
if (ctrl->ctrl.cntlid == nctrl->cntlid)
- __nvme_loop_del_ctrl(ctrl);
+ nvme_delete_ctrl(&ctrl->ctrl);
}
mutex_unlock(&nvme_loop_ctrl_mutex);
}
@@ -538,7 +499,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
.reg_write32 = nvmf_reg_write32,
.free_ctrl = nvme_loop_free_ctrl,
.submit_async_event = nvme_loop_submit_async_event,
- .delete_ctrl = nvme_loop_del_ctrl,
+ .delete_ctrl = nvme_loop_delete_ctrl_host,
};
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
@@ -600,7 +561,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
- INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work);
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
@@ -641,7 +601,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device,
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
- kref_get(&ctrl->ctrl.kref);
+ nvme_get_ctrl(&ctrl->ctrl);
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
@@ -730,7 +690,7 @@ static void __exit nvme_loop_cleanup_module(void)
mutex_lock(&nvme_loop_ctrl_mutex);
list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list)
- __nvme_loop_del_ctrl(ctrl);
+ nvme_delete_ctrl(&ctrl->ctrl);
mutex_unlock(&nvme_loop_ctrl_mutex);
flush_workqueue(nvme_wq);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 87e429b..417f6c0 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -223,7 +223,10 @@ struct nvmet_req {
struct bio inline_bio;
struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC];
int sg_cnt;
+ /* data length as parsed from the command: */
size_t data_len;
+ /* data length as parsed from the SGL descriptor: */
+ size_t transfer_len;
struct nvmet_port *port;
@@ -266,6 +269,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
void nvmet_req_uninit(struct nvmet_req *req);
+void nvmet_req_execute(struct nvmet_req *req);
void nvmet_req_complete(struct nvmet_req *req, u16 status);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
@@ -314,7 +318,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf,
u32 nvmet_get_log_page_len(struct nvme_command *cmd);
#define NVMET_QUEUE_SIZE 1024
-#define NVMET_NR_QUEUES 64
+#define NVMET_NR_QUEUES 128
#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
#define NVMET_KAS 10
#define NVMET_DISC_KATO 120
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 76d2bb7..4991290 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -148,14 +148,14 @@ static inline u32 get_unaligned_le24(const u8 *p)
static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
return nvme_is_write(rsp->req.cmd) &&
- rsp->req.data_len &&
+ rsp->req.transfer_len &&
!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}
static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
return !nvme_is_write(rsp->req.cmd) &&
- rsp->req.data_len &&
+ rsp->req.transfer_len &&
!rsp->req.rsp->status &&
!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}
@@ -577,7 +577,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
- rsp->req.execute(&rsp->req);
+ nvmet_req_execute(&rsp->req);
}
static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
@@ -609,6 +609,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
nvmet_rdma_use_inline_sg(rsp, len, off);
rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
+ rsp->req.transfer_len += len;
return 0;
}
@@ -636,6 +637,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
nvmet_data_dir(&rsp->req));
if (ret < 0)
return NVME_SC_INTERNAL;
+ rsp->req.transfer_len += len;
rsp->n_rdma += ret;
if (invalidate) {
@@ -693,7 +695,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
queue->cm_id->port_num, &rsp->read_cqe, NULL))
nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
} else {
- rsp->req.execute(&rsp->req);
+ nvmet_req_execute(&rsp->req);
}
return true;
@@ -1512,15 +1514,17 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = {
static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
- struct nvmet_rdma_queue *queue;
+ struct nvmet_rdma_queue *queue, *tmp;
/* Device is being removed, delete all queues using this device */
mutex_lock(&nvmet_rdma_queue_mutex);
- list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
+ list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
+ queue_list) {
if (queue->dev->device != ib_device)
continue;
pr_info("Removing queue %d\n", queue->idx);
+ list_del_init(&queue->queue_list);
__nvmet_rdma_queue_disconnect(queue);
}
mutex_unlock(&nvmet_rdma_queue_mutex);
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 4136633..7669553 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -130,7 +130,8 @@ config CHR_DEV_OSST
config BLK_DEV_SR
tristate "SCSI CDROM support"
- depends on SCSI
+ depends on SCSI && BLK_DEV
+ select CDROM
---help---
If you want to use a CD or DVD drive attached to your computer
by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index c17677f..3e02bc3 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -3246,6 +3246,11 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport)
continue;
if (ndlp->rport)
ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo;
+#if (IS_ENABLED(CONFIG_NVME_FC))
+ if (ndlp->nrport)
+ nvme_fc_set_remoteport_devloss(ndlp->nrport->remoteport,
+ vport->cfg_devloss_tmo);
+#endif
}
spin_unlock_irq(shost->host_lock);
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index bcc1694..54de24c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -252,9 +252,9 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
struct scsi_request *rq;
int ret = DRIVER_ERROR << 24;
- req = blk_get_request(sdev->request_queue,
+ req = blk_get_request_flags(sdev->request_queue,
data_direction == DMA_TO_DEVICE ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
+ REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT);
if (IS_ERR(req))
return ret;
rq = scsi_req(req);
@@ -268,7 +268,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
rq->retries = retries;
req->timeout = timeout;
req->cmd_flags |= flags;
- req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
+ req->rq_flags |= rq_flags | RQF_QUIET;
/*
* head injection *required* here otherwise quiesce won't work
@@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
/*
* If the devices is blocked we defer normal commands.
*/
- if (!(req->rq_flags & RQF_PREEMPT))
+ if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_DEFER;
break;
default:
@@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
* special commands. In particular any user initiated
* command is not allowed.
*/
- if (!(req->rq_flags & RQF_PREEMPT))
+ if (req && !(req->rq_flags & RQF_PREEMPT))
ret = BLKPREP_KILL;
break;
}
@@ -1940,6 +1940,33 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request);
}
+static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct scsi_device *sdev = q->queuedata;
+
+ atomic_dec(&sdev->device_busy);
+ put_device(&sdev->sdev_gendev);
+}
+
+static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct scsi_device *sdev = q->queuedata;
+
+ if (!get_device(&sdev->sdev_gendev))
+ goto out;
+ if (!scsi_dev_queue_ready(q, sdev))
+ goto out_put_device;
+
+ return true;
+
+out_put_device:
+ put_device(&sdev->sdev_gendev);
+out:
+ return false;
+}
+
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -1953,16 +1980,11 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
ret = prep_to_mq(scsi_prep_state_check(sdev, req));
if (ret != BLK_STS_OK)
- goto out;
+ goto out_put_budget;
ret = BLK_STS_RESOURCE;
- if (!get_device(&sdev->sdev_gendev))
- goto out;
-
- if (!scsi_dev_queue_ready(q, sdev))
- goto out_put_device;
if (!scsi_target_queue_ready(shost, sdev))
- goto out_dec_device_busy;
+ goto out_put_budget;
if (!scsi_host_queue_ready(q, shost, sdev))
goto out_dec_target_busy;
@@ -1993,15 +2015,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
out_dec_host_busy:
- atomic_dec(&shost->host_busy);
+ atomic_dec(&shost->host_busy);
out_dec_target_busy:
if (scsi_target(sdev)->can_queue > 0)
atomic_dec(&scsi_target(sdev)->target_busy);
-out_dec_device_busy:
- atomic_dec(&sdev->device_busy);
-out_put_device:
- put_device(&sdev->sdev_gendev);
-out:
+out_put_budget:
+ scsi_mq_put_budget(hctx);
switch (ret) {
case BLK_STS_OK:
break;
@@ -2205,6 +2224,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev)
}
static const struct blk_mq_ops scsi_mq_ops = {
+ .get_budget = scsi_mq_get_budget,
+ .put_budget = scsi_mq_put_budget,
.queue_rq = scsi_queue_rq,
.complete = scsi_softirq_done,
.timeout = scsi_timeout,
@@ -2919,21 +2940,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
int
scsi_device_quiesce(struct scsi_device *sdev)
{
+ struct request_queue *q = sdev->request_queue;
int err;
+ /*
+ * It is allowed to call scsi_device_quiesce() multiple times from
+ * the same context but concurrent scsi_device_quiesce() calls are
+ * not allowed.
+ */
+ WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
+
+ blk_set_preempt_only(q);
+
+ blk_mq_freeze_queue(q);
+ /*
+ * Ensure that the effect of blk_set_preempt_only() will be visible
+ * for percpu_ref_tryget() callers that occur after the queue
+ * unfreeze even if the queue was already frozen before this function
+ * was called. See also https://lwn.net/Articles/573497/.
+ */
+ synchronize_rcu();
+ blk_mq_unfreeze_queue(q);
+
mutex_lock(&sdev->state_mutex);
err = scsi_device_set_state(sdev, SDEV_QUIESCE);
+ if (err == 0)
+ sdev->quiesced_by = current;
+ else
+ blk_clear_preempt_only(q);
mutex_unlock(&sdev->state_mutex);
- if (err)
- return err;
-
- scsi_run_queue(sdev->request_queue);
- while (atomic_read(&sdev->device_busy)) {
- msleep_interruptible(200);
- scsi_run_queue(sdev->request_queue);
- }
- return 0;
+ return err;
}
EXPORT_SYMBOL(scsi_device_quiesce);
@@ -2953,9 +2990,11 @@ void scsi_device_resume(struct scsi_device *sdev)
* device deleted during suspend)
*/
mutex_lock(&sdev->state_mutex);
- if (sdev->sdev_state == SDEV_QUIESCE &&
- scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
- scsi_run_queue(sdev->request_queue);
+ WARN_ON_ONCE(!sdev->quiesced_by);
+ sdev->quiesced_by = NULL;
+ blk_clear_preempt_only(sdev->request_queue);
+ if (sdev->sdev_state == SDEV_QUIESCE)
+ scsi_device_set_state(sdev, SDEV_RUNNING);
mutex_unlock(&sdev->state_mutex);
}
EXPORT_SYMBOL(scsi_device_resume);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index aa28874..f098877 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd)
if (sfp->parentdp->device->type == TYPE_SCANNER)
return 0;
- return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE);
+ return blk_verify_command(cmd, filp->f_mode);
}
static int
OpenPOWER on IntegriCloud