summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/nvme-core.c203
-rw-r--r--drivers/block/nvme-scsi.c36
-rw-r--r--drivers/block/rbd.c252
3 files changed, 355 insertions, 136 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index a842c71..02351e2 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -10,10 +10,6 @@
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <linux/nvme.h>
@@ -46,16 +42,26 @@
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>
-#define NVME_Q_DEPTH 1024
+#include <trace/events/block.h>
+
+#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
-#define ADMIN_TIMEOUT (60 * HZ)
-#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT)
+#define ADMIN_TIMEOUT (admin_timeout * HZ)
+#define IOD_TIMEOUT (retry_time * HZ)
+
+static unsigned char admin_timeout = 60;
+module_param(admin_timeout, byte, 0644);
+MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
-unsigned char io_timeout = 30;
-module_param(io_timeout, byte, 0644);
+unsigned char nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
+static unsigned char retry_time = 30;
+module_param(retry_time, byte, 0644);
+MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O");
+
static int nvme_major;
module_param(nvme_major, int, 0);
@@ -67,6 +73,7 @@ static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;
+static struct notifier_block nvme_nb;
static void nvme_reset_failed_dev(struct work_struct *ws);
@@ -199,16 +206,13 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
-#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
-#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE)
+#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE)
static void special_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
if (ctx == CMD_CTX_CANCELLED)
return;
- if (ctx == CMD_CTX_FLUSH)
- return;
if (ctx == CMD_CTX_ABORT) {
++nvmeq->dev->abort_limit;
return;
@@ -247,8 +251,9 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
void *ctx;
struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
- if (cmdid >= nvmeq->q_depth) {
- *fn = special_completion;
+ if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) {
+ if (fn)
+ *fn = special_completion;
return CMD_CTX_INVALID;
}
if (fn)
@@ -281,9 +286,17 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
{
+ struct nvme_queue *nvmeq;
unsigned queue_id = get_cpu_var(*dev->io_queue);
+
rcu_read_lock();
- return rcu_dereference(dev->queues[queue_id]);
+ nvmeq = rcu_dereference(dev->queues[queue_id]);
+ if (nvmeq)
+ return nvmeq;
+
+ rcu_read_unlock();
+ put_cpu_var(*dev->io_queue);
+ return NULL;
}
static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
@@ -295,8 +308,15 @@ static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
__acquires(RCU)
{
+ struct nvme_queue *nvmeq;
+
rcu_read_lock();
- return rcu_dereference(dev->queues[q_idx]);
+ nvmeq = rcu_dereference(dev->queues[q_idx]);
+ if (nvmeq)
+ return nvmeq;
+
+ rcu_read_unlock();
+ return NULL;
}
static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
@@ -387,25 +407,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
static void nvme_start_io_acct(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
- const int rw = bio_data_dir(bio);
- int cpu = part_stat_lock();
- part_round_stats(cpu, &disk->part0);
- part_stat_inc(cpu, &disk->part0, ios[rw]);
- part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
- part_inc_in_flight(&disk->part0, rw);
- part_stat_unlock();
+ if (blk_queue_io_stat(disk->queue)) {
+ const int rw = bio_data_dir(bio);
+ int cpu = part_stat_lock();
+ part_round_stats(cpu, &disk->part0);
+ part_stat_inc(cpu, &disk->part0, ios[rw]);
+ part_stat_add(cpu, &disk->part0, sectors[rw],
+ bio_sectors(bio));
+ part_inc_in_flight(&disk->part0, rw);
+ part_stat_unlock();
+ }
}
static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
- const int rw = bio_data_dir(bio);
- unsigned long duration = jiffies - start_time;
- int cpu = part_stat_lock();
- part_stat_add(cpu, &disk->part0, ticks[rw], duration);
- part_round_stats(cpu, &disk->part0);
- part_dec_in_flight(&disk->part0, rw);
- part_stat_unlock();
+ if (blk_queue_io_stat(disk->queue)) {
+ const int rw = bio_data_dir(bio);
+ unsigned long duration = jiffies - start_time;
+ int cpu = part_stat_lock();
+ part_stat_add(cpu, &disk->part0, ticks[rw], duration);
+ part_round_stats(cpu, &disk->part0);
+ part_dec_in_flight(&disk->part0, rw);
+ part_stat_unlock();
+ }
}
static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
@@ -414,6 +439,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_iod *iod = ctx;
struct bio *bio = iod->private;
u16 status = le16_to_cpup(&cqe->status) >> 1;
+ int error = 0;
if (unlikely(status)) {
if (!(status & NVME_SC_DNR ||
@@ -426,6 +452,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
wake_up(&nvmeq->sq_full);
return;
}
+ error = -EIO;
}
if (iod->nents) {
dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents,
@@ -433,10 +460,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
nvme_end_io_acct(bio, iod->start_time);
}
nvme_free_iod(nvmeq->dev, iod);
- if (status)
- bio_endio(bio, -EIO);
- else
- bio_endio(bio, 0);
+
+ trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error);
+ bio_endio(bio, error);
}
/* length is in bytes. gfp flags indicates whether we may sleep. */
@@ -525,6 +551,8 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
if (!split)
return -ENOMEM;
+ trace_block_split(bdev_get_queue(bio->bi_bdev), bio,
+ split->bi_iter.bi_sector);
bio_chain(split, bio);
if (!waitqueue_active(&nvmeq->sq_full))
@@ -627,16 +655,6 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
return 0;
}
-int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
-{
- int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
- special_completion, NVME_IO_TIMEOUT);
- if (unlikely(cmdid < 0))
- return cmdid;
-
- return nvme_submit_flush(nvmeq, ns, cmdid);
-}
-
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
{
struct bio *bio = iod->private;
@@ -652,7 +670,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
if (bio->bi_rw & REQ_DISCARD)
return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
- if ((bio->bi_rw & REQ_FLUSH) && !iod->nents)
+ if (bio->bi_rw & REQ_FLUSH)
return nvme_submit_flush(nvmeq, ns, cmdid);
control = 0;
@@ -686,6 +704,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod)
return 0;
}
+static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio)
+{
+ struct bio *split = bio_clone(bio, GFP_ATOMIC);
+ if (!split)
+ return -ENOMEM;
+
+ split->bi_iter.bi_size = 0;
+ split->bi_phys_segments = 0;
+ bio->bi_rw &= ~REQ_FLUSH;
+ bio_chain(split, bio);
+
+ if (!waitqueue_active(&nvmeq->sq_full))
+ add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+ bio_list_add(&nvmeq->sq_cong, split);
+ bio_list_add(&nvmeq->sq_cong, bio);
+ wake_up_process(nvme_thread);
+
+ return 0;
+}
+
/*
* Called with local interrupts disabled and the q_lock held. May not sleep.
*/
@@ -696,11 +734,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
int psegs = bio_phys_segments(ns->queue, bio);
int result;
- if ((bio->bi_rw & REQ_FLUSH) && psegs) {
- result = nvme_submit_flush_data(nvmeq, ns);
- if (result)
- return result;
- }
+ if ((bio->bi_rw & REQ_FLUSH) && psegs)
+ return nvme_split_flush_data(nvmeq, bio);
iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
if (!iod)
@@ -795,7 +830,6 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
int result = -EBUSY;
if (!nvmeq) {
- put_nvmeq(NULL);
bio_endio(bio, -EIO);
return;
}
@@ -870,10 +904,8 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
struct nvme_queue *nvmeq;
nvmeq = lock_nvmeq(dev, q_idx);
- if (!nvmeq) {
- unlock_nvmeq(nvmeq);
+ if (!nvmeq)
return -ENODEV;
- }
cmdinfo.task = current;
cmdinfo.status = -EINTR;
@@ -898,9 +930,10 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
if (cmdinfo.status == -EINTR) {
nvmeq = lock_nvmeq(dev, q_idx);
- if (nvmeq)
+ if (nvmeq) {
nvme_abort_command(nvmeq, cmdid);
- unlock_nvmeq(nvmeq);
+ unlock_nvmeq(nvmeq);
+ }
return -EINTR;
}
@@ -1358,7 +1391,8 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(&dev->pci_dev->dev,
- "Device not ready; aborting initialisation\n");
+ "Device not ready; aborting %s\n", enabled ?
+ "initialisation" : "reset");
return -ENODEV;
}
}
@@ -1481,7 +1515,11 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
goto put_pages;
}
+ err = -ENOMEM;
iod = nvme_alloc_iod(count, length, GFP_KERNEL);
+ if (!iod)
+ goto put_pages;
+
sg = iod->sg;
sg_init_table(sg, count);
for (i = 0; i < count; i++) {
@@ -1494,7 +1532,6 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
sg_mark_end(&sg[i - 1]);
iod->nents = count;
- err = -ENOMEM;
nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
if (!nents)
@@ -1894,6 +1931,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
if (dev->max_hw_sectors)
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+ if (dev->vwc & NVME_CTRL_VWC_PRESENT)
+ blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
disk->major = nvme_major;
disk->first_minor = 0;
@@ -2062,8 +2101,13 @@ static int set_queue_count(struct nvme_dev *dev, int count)
status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
&result);
- if (status)
- return status < 0 ? -EIO : -EBUSY;
+ if (status < 0)
+ return status;
+ if (status > 0) {
+ dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
+ status);
+ return -EBUSY;
+ }
return min(result & 0xffff, result >> 16) + 1;
}
@@ -2072,14 +2116,25 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}
+static void nvme_cpu_workfn(struct work_struct *work)
+{
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work);
+ if (dev->initialized)
+ nvme_assign_io_queues(dev);
+}
+
static int nvme_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- struct nvme_dev *dev = container_of(self, struct nvme_dev, nb);
+ struct nvme_dev *dev;
+
switch (action) {
case CPU_ONLINE:
case CPU_DEAD:
- nvme_assign_io_queues(dev);
+ spin_lock(&dev_list_lock);
+ list_for_each_entry(dev, &dev_list, node)
+ schedule_work(&dev->cpu_work);
+ spin_unlock(&dev_list_lock);
break;
}
return NOTIFY_OK;
@@ -2148,11 +2203,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
nvme_free_queues(dev, nr_io_queues + 1);
nvme_assign_io_queues(dev);
- dev->nb.notifier_call = &nvme_cpu_notify;
- result = register_hotcpu_notifier(&dev->nb);
- if (result)
- goto free_queues;
-
return 0;
free_queues:
@@ -2184,6 +2234,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
res = nvme_identify(dev, 0, 1, dma_addr);
if (res) {
+ dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
res = -EIO;
goto out;
}
@@ -2192,6 +2243,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
nn = le32_to_cpup(&ctrl->nn);
dev->oncs = le16_to_cpup(&ctrl->oncs);
dev->abort_limit = ctrl->acl + 1;
+ dev->vwc = ctrl->vwc;
memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
@@ -2450,8 +2502,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
int i;
dev->initialized = 0;
- unregister_hotcpu_notifier(&dev->nb);
-
nvme_dev_list_remove(dev);
if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
@@ -2722,6 +2772,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_LIST_HEAD(&dev->namespaces);
dev->reset_workfn = nvme_reset_failed_dev;
INIT_WORK(&dev->reset_work, nvme_reset_workfn);
+ INIT_WORK(&dev->cpu_work, nvme_cpu_workfn);
dev->pci_dev = pdev;
pci_set_drvdata(pdev, dev);
result = nvme_set_instance(dev);
@@ -2801,6 +2852,7 @@ static void nvme_remove(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
flush_work(&dev->reset_work);
+ flush_work(&dev->cpu_work);
misc_deregister(&dev->miscdev);
nvme_dev_remove(dev);
nvme_dev_shutdown(dev);
@@ -2889,11 +2941,18 @@ static int __init nvme_init(void)
else if (result > 0)
nvme_major = result;
- result = pci_register_driver(&nvme_driver);
+ nvme_nb.notifier_call = &nvme_cpu_notify;
+ result = register_hotcpu_notifier(&nvme_nb);
if (result)
goto unregister_blkdev;
+
+ result = pci_register_driver(&nvme_driver);
+ if (result)
+ goto unregister_hotcpu;
return 0;
+ unregister_hotcpu:
+ unregister_hotcpu_notifier(&nvme_nb);
unregister_blkdev:
unregister_blkdev(nvme_major, "nvme");
kill_workq:
@@ -2904,9 +2963,11 @@ static int __init nvme_init(void)
static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
+ unregister_hotcpu_notifier(&nvme_nb);
unregister_blkdev(nvme_major, "nvme");
destroy_workqueue(nvme_workq);
BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
+ _nvme_check_size();
}
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 2c3f5be..a4cd6d6 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -1,6 +1,6 @@
/*
* NVM Express device driver
- * Copyright (c) 2011, Intel Corporation.
+ * Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -10,10 +10,6 @@
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
/*
@@ -243,8 +239,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */
#define READ_CAP_16_RESP_SIZE 32
/* NVMe Namespace and Command Defines */
-#define NVME_GET_SMART_LOG_PAGE 0x02
-#define NVME_GET_FEAT_TEMP_THRESH 0x04
#define BYTES_TO_DWORDS 4
#define NVME_MAX_FIRMWARE_SLOT 7
@@ -686,6 +680,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
u8 resp_data_format = 0x02;
u8 protect;
u8 cmdque = 0x01 << 1;
+ u8 fw_offset = sizeof(dev->firmware_rev);
mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
&dma_addr, GFP_KERNEL);
@@ -721,7 +716,11 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */
strncpy(&inq_response[8], "NVMe ", 8);
strncpy(&inq_response[16], dev->model, 16);
- strncpy(&inq_response[32], dev->firmware_rev, 4);
+
+ while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4)
+ fw_offset--;
+ fw_offset -= 4;
+ strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4);
xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
@@ -1018,8 +1017,8 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
c.common.opcode = nvme_admin_get_log_page;
c.common.nsid = cpu_to_le32(0xFFFFFFFF);
c.common.prp1 = cpu_to_le64(dma_addr);
- c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) /
- BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE);
+ c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
+ BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
res = nvme_submit_admin_cmd(dev, &c, NULL);
if (res != NVME_SC_SUCCESS) {
temp_c = LOG_TEMP_UNKNOWN;
@@ -1086,8 +1085,8 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
c.common.opcode = nvme_admin_get_log_page;
c.common.nsid = cpu_to_le32(0xFFFFFFFF);
c.common.prp1 = cpu_to_le64(dma_addr);
- c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) /
- BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE);
+ c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
+ BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
res = nvme_submit_admin_cmd(dev, &c, NULL);
if (res != NVME_SC_SUCCESS) {
temp_c_cur = LOG_TEMP_UNKNOWN;
@@ -1477,7 +1476,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
goto out_dma;
}
id_ctrl = mem;
- lowest_pow_st = id_ctrl->npss - 1;
+ lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1));
switch (pc) {
case NVME_POWER_STATE_START_VALID:
@@ -1494,20 +1493,19 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
break;
case NVME_POWER_STATE_IDLE:
/* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */
- /* min of desired state and (lps-1) because lps is STOP */
if (pcmod == 0x0)
- ps_desired = min(POWER_STATE_1, (lowest_pow_st - 1));
+ ps_desired = POWER_STATE_1;
else if (pcmod == 0x1)
- ps_desired = min(POWER_STATE_2, (lowest_pow_st - 1));
+ ps_desired = POWER_STATE_2;
else if (pcmod == 0x2)
- ps_desired = min(POWER_STATE_3, (lowest_pow_st - 1));
+ ps_desired = POWER_STATE_3;
break;
case NVME_POWER_STATE_STANDBY:
/* Action unspecified if POWER CONDITION MODIFIER != [0,1] */
if (pcmod == 0x0)
- ps_desired = max(0, (lowest_pow_st - 2));
+ ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2));
else if (pcmod == 0x1)
- ps_desired = max(0, (lowest_pow_st - 1));
+ ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1));
break;
case NVME_POWER_STATE_LU_CONTROL:
default:
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4c95b50..b2c98c1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -541,7 +541,6 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
return -ENOENT;
(void) get_device(&rbd_dev->dev);
- set_device_ro(bdev, rbd_dev->mapping.read_only);
return 0;
}
@@ -559,10 +558,76 @@ static void rbd_release(struct gendisk *disk, fmode_t mode)
put_device(&rbd_dev->dev);
}
+static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
+{
+ int ret = 0;
+ int val;
+ bool ro;
+ bool ro_changed = false;
+
+ /* get_user() may sleep, so call it before taking rbd_dev->lock */
+ if (get_user(val, (int __user *)(arg)))
+ return -EFAULT;
+
+ ro = val ? true : false;
+ /* Snapshot doesn't allow to write*/
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
+ return -EROFS;
+
+ spin_lock_irq(&rbd_dev->lock);
+ /* prevent others open this device */
+ if (rbd_dev->open_count > 1) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (rbd_dev->mapping.read_only != ro) {
+ rbd_dev->mapping.read_only = ro;
+ ro_changed = true;
+ }
+
+out:
+ spin_unlock_irq(&rbd_dev->lock);
+ /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
+ if (ret == 0 && ro_changed)
+ set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
+
+ return ret;
+}
+
+static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ int ret = 0;
+
+ switch (cmd) {
+ case BLKROSET:
+ ret = rbd_ioctl_set_ro(rbd_dev, arg);
+ break;
+ default:
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return rbd_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
static const struct block_device_operations rbd_bd_ops = {
.owner = THIS_MODULE,
.open = rbd_open,
.release = rbd_release,
+ .ioctl = rbd_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = rbd_compat_ioctl,
+#endif
};
/*
@@ -1366,6 +1431,14 @@ static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
+static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
+{
+ struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+
+ return obj_request->img_offset <
+ round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
+}
+
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p (was %d)\n", __func__, obj_request,
@@ -1382,6 +1455,13 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+ dout("%s: img %p (was %d)\n", __func__, img_request,
+ atomic_read(&img_request->kref.refcount));
+ kref_get(&img_request->kref);
+}
+
static bool img_request_child_test(struct rbd_img_request *img_request);
static void rbd_parent_request_destroy(struct kref *kref);
static void rbd_img_request_destroy(struct kref *kref);
@@ -2142,6 +2222,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
img_request->next_completion = which;
out:
spin_unlock_irq(&img_request->completion_lock);
+ rbd_img_request_put(img_request);
if (!more)
rbd_img_request_complete(img_request);
@@ -2242,6 +2323,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
goto out_unwind;
obj_request->osd_req = osd_req;
obj_request->callback = rbd_img_obj_callback;
+ rbd_img_request_get(img_request);
if (write_request) {
osd_req_op_alloc_hint_init(osd_req, which,
@@ -2674,7 +2756,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
*/
if (!img_request_write_test(img_request) ||
!img_request_layered_test(img_request) ||
- rbd_dev->parent_overlap <= obj_request->img_offset ||
+ !obj_request_overlaps_parent(obj_request) ||
((known = obj_request_known_test(obj_request)) &&
obj_request_exists_test(obj_request))) {
@@ -2872,56 +2954,55 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
}
/*
- * Request sync osd watch/unwatch. The value of "start" determines
- * whether a watch request is being initiated or torn down.
+ * Initiate a watch request, synchronously.
*/
-static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct rbd_obj_request *obj_request;
int ret;
- rbd_assert(start ^ !!rbd_dev->watch_event);
- rbd_assert(start ^ !!rbd_dev->watch_request);
+ rbd_assert(!rbd_dev->watch_event);
+ rbd_assert(!rbd_dev->watch_request);
- if (start) {
- ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
- &rbd_dev->watch_event);
- if (ret < 0)
- return ret;
- rbd_assert(rbd_dev->watch_event != NULL);
- }
+ ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+ &rbd_dev->watch_event);
+ if (ret < 0)
+ return ret;
+
+ rbd_assert(rbd_dev->watch_event);
- ret = -ENOMEM;
obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
- OBJ_REQUEST_NODATA);
- if (!obj_request)
+ OBJ_REQUEST_NODATA);
+ if (!obj_request) {
+ ret = -ENOMEM;
goto out_cancel;
+ }
obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
obj_request);
- if (!obj_request->osd_req)
- goto out_cancel;
+ if (!obj_request->osd_req) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
- if (start)
- ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
- else
- ceph_osdc_unregister_linger_request(osdc,
- rbd_dev->watch_request->osd_req);
+ ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
- rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
+ rbd_dev->watch_event->cookie, 0, 1);
rbd_osd_req_format_write(obj_request);
ret = rbd_obj_request_submit(osdc, obj_request);
if (ret)
- goto out_cancel;
+ goto out_linger;
+
ret = rbd_obj_request_wait(obj_request);
if (ret)
- goto out_cancel;
+ goto out_linger;
+
ret = obj_request->result;
if (ret)
- goto out_cancel;
+ goto out_linger;
/*
* A watch request is set to linger, so the underlying osd
@@ -2931,36 +3012,84 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
* it. We'll drop that reference (below) after we've
* unregistered it.
*/
- if (start) {
- rbd_dev->watch_request = obj_request;
+ rbd_dev->watch_request = obj_request;
- return 0;
+ return 0;
+
+out_linger:
+ ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req);
+out_put:
+ rbd_obj_request_put(obj_request);
+out_cancel:
+ ceph_osdc_cancel_event(rbd_dev->watch_event);
+ rbd_dev->watch_event = NULL;
+
+ return ret;
+}
+
+/*
+ * Tear down a watch request, synchronously.
+ */
+static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ int ret;
+
+ rbd_assert(rbd_dev->watch_event);
+ rbd_assert(rbd_dev->watch_request);
+
+ obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+ OBJ_REQUEST_NODATA);
+ if (!obj_request) {
+ ret = -ENOMEM;
+ goto out_cancel;
+ }
+
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
+ obj_request);
+ if (!obj_request->osd_req) {
+ ret = -ENOMEM;
+ goto out_put;
}
+ osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
+ rbd_dev->watch_event->cookie, 0, 0);
+ rbd_osd_req_format_write(obj_request);
+
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out_put;
+
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out_put;
+
+ ret = obj_request->result;
+ if (ret)
+ goto out_put;
+
/* We have successfully torn down the watch request */
+ ceph_osdc_unregister_linger_request(osdc,
+ rbd_dev->watch_request->osd_req);
rbd_obj_request_put(rbd_dev->watch_request);
rbd_dev->watch_request = NULL;
+
+out_put:
+ rbd_obj_request_put(obj_request);
out_cancel:
- /* Cancel the event if we're tearing down, or on error */
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
- if (obj_request)
- rbd_obj_request_put(obj_request);
return ret;
}
-static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
-{
- return __rbd_dev_header_watch_sync(rbd_dev, true);
-}
-
static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
{
int ret;
- ret = __rbd_dev_header_watch_sync(rbd_dev, false);
+ ret = __rbd_dev_header_unwatch_sync(rbd_dev);
if (ret) {
rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
ret);
@@ -3058,7 +3187,6 @@ static void rbd_request_fn(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
{
struct rbd_device *rbd_dev = q->queuedata;
- bool read_only = rbd_dev->mapping.read_only;
struct request *rq;
int result;
@@ -3094,7 +3222,7 @@ static void rbd_request_fn(struct request_queue *q)
if (write_request) {
result = -EROFS;
- if (read_only)
+ if (rbd_dev->mapping.read_only)
goto end_request;
rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
}
@@ -4683,6 +4811,38 @@ out_err:
}
/*
+ * Return pool id (>= 0) or a negative error code.
+ */
+static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
+{
+ u64 newest_epoch;
+ unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
+ int tries = 0;
+ int ret;
+
+again:
+ ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
+ if (ret == -ENOENT && tries++ < 1) {
+ ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
+ &newest_epoch);
+ if (ret < 0)
+ return ret;
+
+ if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
+ ceph_monc_request_next_osdmap(&rbdc->client->monc);
+ (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
+ newest_epoch, timeout);
+ goto again;
+ } else {
+ /* the osdmap we have is new enough */
+ return -ENOENT;
+ }
+ }
+
+ return ret;
+}
+
+/*
* An rbd format 2 image has a unique identifier, distinct from the
* name given to it by the user. Internally, that identifier is
* what's used to specify the names of objects related to the image.
@@ -4752,7 +4912,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
image_id = ceph_extract_encoded_string(&p, p + ret,
NULL, GFP_NOIO);
- ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
+ ret = PTR_ERR_OR_ZERO(image_id);
if (!ret)
rbd_dev->image_format = 2;
} else {
@@ -4907,6 +5067,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
if (ret)
goto err_out_disk;
set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
+ set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
ret = rbd_bus_add_dev(rbd_dev);
if (ret)
@@ -5053,7 +5214,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
struct rbd_options *rbd_opts = NULL;
struct rbd_spec *spec = NULL;
struct rbd_client *rbdc;
- struct ceph_osd_client *osdc;
bool read_only;
int rc = -ENOMEM;
@@ -5075,8 +5235,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
}
/* pick the pool */
- osdc = &rbdc->client->osdc;
- rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
+ rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
if (rc < 0)
goto err_out_client;
spec->pool_id = (u64)rc;
@@ -5387,6 +5546,7 @@ err_out_slab:
static void __exit rbd_exit(void)
{
+ ida_destroy(&rbd_dev_id_ida);
rbd_sysfs_cleanup();
if (single_major)
unregister_blkdev(rbd_major, RBD_DRV_NAME);
OpenPOWER on IntegriCloud