From 77032ca66f86fa631b38b2e7feeb2b953e59f2f0 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 24 Nov 2015 17:30:34 -0600
Subject: Return EBUSY from BLKRRPART for mounted whole-dev fs

Today, blockdev --rereadpt /dev/sda will fail with EBUSY if any
partition of sda is mounted (and will fail with EINVAL if pointed
at a partition).  But it will pass if the entire block device is
formatted with a filesystem and mounted.  I don't think this makes
sense; partitioning should surely not ever change out from under
a mounted device.

So check for bdev->bd_super, and fail that with -EBUSY as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partition-generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3b03015..746935a 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
 	struct hd_struct *part;
 	int res;
 
-	if (bdev->bd_part_count)
+	if (bdev->bd_part_count || bdev->bd_super)
 		return -EBUSY;
 	res = invalidate_partition(disk, 0);
 	if (res)
-- 
cgit v1.1


From c4699e70d1db14119708dae76dac7c43e1e12988 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Sat, 28 Nov 2015 16:49:22 +0100
Subject: lightnvm: Simplify config when disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We shouldn't compile an object file to get empty implementations;
conforms to linux coding style on conditional compilation.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/Makefile   |  3 ++-
 drivers/nvme/host/lightnvm.c | 13 -------------
 drivers/nvme/host/nvme.h     | 14 ++++++++++++++
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 219dc206..a5fe239 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,4 +1,5 @@
 
 obj-$(CONFIG_BLK_DEV_NVME)     += nvme.o
 
-nvme-y		+= pci.o scsi.o lightnvm.o
+lightnvm-$(CONFIG_NVM)	:= lightnvm.o
+nvme-y		+= pci.o scsi.o $(lightnvm-y)
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 9202d1a..07451d6 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -22,8 +22,6 @@
 
 #include "nvme.h"
 
-#ifdef CONFIG_NVM
-
 #include <linux/nvme.h>
 #include <linux/bitops.h>
 #include <linux/lightnvm.h>
@@ -588,14 +586,3 @@ int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
 
 	return 0;
 }
-#else
-int nvme_nvm_register(struct request_queue *q, char *disk_name)
-{
-	return 0;
-}
-void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {};
-int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
-{
-	return 0;
-}
-#endif /* CONFIG_NVM */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fdb4e5b..044253d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -136,8 +136,22 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr);
 int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg);
 int nvme_sg_get_version_num(int __user *ip);
 
+#ifdef CONFIG_NVM
 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
 int nvme_nvm_register(struct request_queue *q, char *disk_name);
 void nvme_nvm_unregister(struct request_queue *q, char *disk_name);
+#else
+static inline int nvme_nvm_register(struct request_queue *q, char *disk_name)
+{
+	return 0;
+}
+
+static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {};
+
+static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	return 0;
+}
+#endif /* CONFIG_NVM */
 
 #endif /* _NVME_H */
-- 
cgit v1.1


From 8261bd48c6c9c36cd2c2e343a69e76a3be2b04a4 Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Sat, 28 Nov 2015 16:49:23 +0100
Subject: lightnvm: free memory when gennvm register fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

free allocated nvm block and gennvm lun structures when
gennvm register fails, otherwise it will cause memory leak.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index e20e74e..3969a98 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -207,6 +207,14 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 	return 0;
 }
 
+static void gennvm_free(struct nvm_dev *dev)
+{
+	gennvm_blocks_free(dev);
+	gennvm_luns_free(dev);
+	kfree(dev->mp);
+	dev->mp = NULL;
+}
+
 static int gennvm_register(struct nvm_dev *dev)
 {
 	struct gen_nvm *gn;
@@ -234,16 +242,13 @@ static int gennvm_register(struct nvm_dev *dev)
 
 	return 1;
 err:
-	kfree(gn);
+	gennvm_free(dev);
 	return ret;
 }
 
 static void gennvm_unregister(struct nvm_dev *dev)
 {
-	gennvm_blocks_free(dev);
-	gennvm_luns_free(dev);
-	kfree(dev->mp);
-	dev->mp = NULL;
+	gennvm_free(dev);
 }
 
 static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev,
-- 
cgit v1.1


From 76e25081b6ae60fb094328dedf900ec15f10a9fd Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Sat, 28 Nov 2015 16:49:24 +0100
Subject: lightnvm: fix ioctl memory leaks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If copy_to_user() fails we returned error but we missed releasing
devices.

Signed-off-by: Sudip Mukherjee <sudip@vectorindia.org>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 5178645..ea50fa5 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -680,8 +680,10 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
 	info->tgtsize = tgt_iter;
 	up_write(&nvm_lock);
 
-	if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info)))
+	if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) {
+		kfree(info);
 		return -EFAULT;
+	}
 
 	kfree(info);
 	return 0;
@@ -724,8 +726,11 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
 
 	devices->nr_devices = i;
 
-	if (copy_to_user(arg, devices, sizeof(struct nvm_ioctl_get_devices)))
+	if (copy_to_user(arg, devices,
+			 sizeof(struct nvm_ioctl_get_devices))) {
+		kfree(devices);
 		return -EFAULT;
+	}
 
 	kfree(devices);
 	return 0;
-- 
cgit v1.1


From d160147b5c96ef5ec842c604ccd79f5f03306677 Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Sat, 28 Nov 2015 16:49:25 +0100
Subject: lightnvm: do device max sectors boundary check first
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

do device max_phys_sect boundary check first, otherwise
we will allocate dma_pools for devices whose max sectors
are beyond lightnvm support and register them.

Signed-off-by: Wenwei Tao <ww.tao0320@gmail.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index ea50fa5..ea6dba5 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -308,6 +308,12 @@ int nvm_register(struct request_queue *q, char *disk_name,
 	if (ret)
 		goto err_init;
 
+	if (dev->ops->max_phys_sect > 256) {
+		pr_info("nvm: max sectors supported is 256.\n");
+		ret = -EINVAL;
+		goto err_init;
+	}
+
 	if (dev->ops->max_phys_sect > 1) {
 		dev->ppalist_pool = dev->ops->create_dma_pool(dev->q,
 								"ppalist");
@@ -316,10 +322,6 @@ int nvm_register(struct request_queue *q, char *disk_name,
 			ret = -ENOMEM;
 			goto err_init;
 		}
-	} else if (dev->ops->max_phys_sect > 256) {
-		pr_info("nvm: max sectors supported is 256.\n");
-		ret = -EINVAL;
-		goto err_init;
 	}
 
 	down_write(&nvm_lock);
-- 
cgit v1.1


From 09f2e716096811081b204c6afd6264c2e64d1210 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Sat, 28 Nov 2015 16:49:26 +0100
Subject: lightnvm: refactor and change vendor id for qemu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The QEMU NVMe implementation uses Intel vendor, Intel device id, and the
first vendor specific byte to identify a LightNVM compatible nvme
instance.

Instead of using the Intel specific, use a preallocated from CNEX Labs
instead. This lets us uniquely identify a QEMU lightnvm device without
breaking other vendor specific work in the qemu device driver.

Reported-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/lightnvm.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 07451d6..b9e5cc7 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -569,18 +569,25 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name)
 	nvm_unregister(disk_name);
 }
 
+/* move to shared place when used in multiple places. */
+#define PCI_VENDOR_ID_CNEX 0x1d1d
+#define PCI_DEVICE_ID_CNEX_WL 0x2807
+#define PCI_DEVICE_ID_CNEX_QEMU 0x1f1f
+
 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
 	struct nvme_dev *dev = ns->dev;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	/* QEMU NVMe simulator - PCI ID + Vendor specific bit */
-	if (pdev->vendor == PCI_VENDOR_ID_INTEL && pdev->device == 0x5845 &&
+	if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
+				pdev->device == PCI_DEVICE_ID_CNEX_QEMU &&
 							id->vs[0] == 0x1)
 		return 1;
 
 	/* CNEX Labs - PCI ID + Vendor specific bit */
-	if (pdev->vendor == 0x1d1d && pdev->device == 0x2807 &&
+	if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
+				pdev->device == PCI_DEVICE_ID_CNEX_WL &&
 							id->vs[0] == 0x1)
 		return 1;
 
-- 
cgit v1.1


From 08236c6bb2980561fba657c58fdc76f2865f236c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Sat, 28 Nov 2015 16:49:27 +0100
Subject: lightnvm: unconverted ppa returned in get_bb_tbl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The get_bb_tbl function takes ppa as a generic address, which is
converted to the ppa device address within the device driver. When
the update_bbtbl callback is called from get_bb_tbl, the device
specific ppa is used, instead of the generic ppa.

Make sure to pass the generic ppa.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c    | 3 +--
 drivers/nvme/host/lightnvm.c | 4 +++-
 include/linux/lightnvm.h     | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 3969a98..35dde84 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -75,7 +75,6 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks,
 	struct nvm_block *blk;
 	int i;
 
-	ppa = dev_to_generic_addr(gn->dev, ppa);
 	lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun];
 
 	for (i = 0; i < nr_blocks; i++) {
@@ -187,7 +186,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn)
 			ppa.g.lun = lun->vlun.id;
 			ppa = generic_to_dev_addr(dev, ppa);
 
-			ret = dev->ops->get_bb_tbl(dev->q, ppa,
+			ret = dev->ops->get_bb_tbl(dev, ppa,
 						dev->blks_per_lun,
 						gennvm_block_bb, gn);
 			if (ret)
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index b9e5cc7..06c3364 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -355,10 +355,11 @@ out:
 	return ret;
 }
 
-static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa,
+static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 				int nr_blocks, nvm_bb_update_fn *update_bbtbl,
 				void *priv)
 {
+	struct request_queue *q = nvmdev->q;
 	struct nvme_ns *ns = q->queuedata;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_nvm_command c = {};
@@ -402,6 +403,7 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa,
 		goto out;
 	}
 
+	ppa = dev_to_generic_addr(nvmdev, ppa);
 	ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv);
 	if (ret) {
 		ret = -EINTR;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 3db5552..c6916ae 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -179,7 +179,7 @@ typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *);
 typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *);
 typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32,
 				nvm_l2p_update_fn *, void *);
-typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int,
+typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int,
 				nvm_bb_update_fn *, void *);
 typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int);
 typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *);
-- 
cgit v1.1


From d0a712ceb83ebaea32d520825ee7b997f59b168f Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Sat, 28 Nov 2015 16:49:28 +0100
Subject: lightnvm: missing nvm_lock acquire
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To avoid race conditions, traverse dev, media manager,
and target lists and also register, unregister entries
to/from them, should be always under the nvm_lock control.

Signed-off-by: Wenwei Tao <ww.tao0320@gmail.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c | 75 +++++++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 33 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index ea6dba5..86ce887 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -123,6 +123,26 @@ void nvm_unregister_mgr(struct nvmm_type *mt)
 }
 EXPORT_SYMBOL(nvm_unregister_mgr);
 
+/* register with device with a supported manager */
+static int register_mgr(struct nvm_dev *dev)
+{
+	struct nvmm_type *mt;
+	int ret = 0;
+
+	list_for_each_entry(mt, &nvm_mgrs, list) {
+		ret = mt->register_mgr(dev);
+		if (ret > 0) {
+			dev->mt = mt;
+			break; /* successfully initialized */
+		}
+	}
+
+	if (!ret)
+		pr_info("nvm: no compatible nvm manager found.\n");
+
+	return ret;
+}
+
 static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 {
 	struct nvm_dev *dev;
@@ -221,7 +241,6 @@ static void nvm_free(struct nvm_dev *dev)
 
 static int nvm_init(struct nvm_dev *dev)
 {
-	struct nvmm_type *mt;
 	int ret = -EINVAL;
 
 	if (!dev->q || !dev->ops)
@@ -252,21 +271,13 @@ static int nvm_init(struct nvm_dev *dev)
 		goto err;
 	}
 
-	/* register with device with a supported manager */
-	list_for_each_entry(mt, &nvm_mgrs, list) {
-		ret = mt->register_mgr(dev);
-		if (ret < 0)
-			goto err; /* initialization failed */
-		if (ret > 0) {
-			dev->mt = mt;
-			break; /* successfully initialized */
-		}
-	}
-
-	if (!ret) {
-		pr_info("nvm: no compatible manager found.\n");
+	down_write(&nvm_lock);
+	ret = register_mgr(dev);
+	up_write(&nvm_lock);
+	if (ret < 0)
+		goto err;
+	if (!ret)
 		return 0;
-	}
 
 	pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
 			dev->name, dev->sec_per_pg, dev->nr_planes,
@@ -337,15 +348,17 @@ EXPORT_SYMBOL(nvm_register);
 
 void nvm_unregister(char *disk_name)
 {
-	struct nvm_dev *dev = nvm_find_nvm_dev(disk_name);
+	struct nvm_dev *dev;
 
+	down_write(&nvm_lock);
+	dev = nvm_find_nvm_dev(disk_name);
 	if (!dev) {
 		pr_err("nvm: could not find device %s to unregister\n",
 								disk_name);
+		up_write(&nvm_lock);
 		return;
 	}
 
-	down_write(&nvm_lock);
 	list_del(&dev->devices);
 	up_write(&nvm_lock);
 
@@ -363,38 +376,30 @@ static int nvm_create_target(struct nvm_dev *dev,
 {
 	struct nvm_ioctl_create_simple *s = &create->conf.s;
 	struct request_queue *tqueue;
-	struct nvmm_type *mt;
 	struct gendisk *tdisk;
 	struct nvm_tgt_type *tt;
 	struct nvm_target *t;
 	void *targetdata;
 	int ret = 0;
 
+	down_write(&nvm_lock);
 	if (!dev->mt) {
-		/* register with device with a supported NVM manager */
-		list_for_each_entry(mt, &nvm_mgrs, list) {
-			ret = mt->register_mgr(dev);
-			if (ret < 0)
-				return ret; /* initialization failed */
-			if (ret > 0) {
-				dev->mt = mt;
-				break; /* successfully initialized */
-			}
-		}
-
-		if (!ret) {
-			pr_info("nvm: no compatible nvm manager found.\n");
-			return -ENODEV;
+		ret = register_mgr(dev);
+		if (!ret)
+			ret = -ENODEV;
+		if (ret < 0) {
+			up_write(&nvm_lock);
+			return ret;
 		}
 	}
 
 	tt = nvm_find_target_type(create->tgttype);
 	if (!tt) {
 		pr_err("nvm: target type %s not found\n", create->tgttype);
+		up_write(&nvm_lock);
 		return -EINVAL;
 	}
 
-	down_write(&nvm_lock);
 	list_for_each_entry(t, &dev->online_targets, list) {
 		if (!strcmp(create->tgtname, t->disk->disk_name)) {
 			pr_err("nvm: target name already exists.\n");
@@ -478,7 +483,9 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 	struct nvm_dev *dev;
 	struct nvm_ioctl_create_simple *s;
 
+	down_write(&nvm_lock);
 	dev = nvm_find_nvm_dev(create->dev);
+	up_write(&nvm_lock);
 	if (!dev) {
 		pr_err("nvm: device not found\n");
 		return -EINVAL;
@@ -537,7 +544,9 @@ static int nvm_configure_show(const char *val)
 		return -EINVAL;
 	}
 
+	down_write(&nvm_lock);
 	dev = nvm_find_nvm_dev(devname);
+	up_write(&nvm_lock);
 	if (!dev) {
 		pr_err("nvm: device not found\n");
 		return -EINVAL;
-- 
cgit v1.1


From bf4e6b4e757488dee1b6a581f49c7ac34cd217f8 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 26 Nov 2015 08:46:57 +0100
Subject: block: Always check queue limits for cloned requests

When a cloned request is retried on other queues it always needs
to be checked against the queue limits of that queue.
Otherwise the calculations for nr_phys_segments might be wrong,
leading to a crash in scsi_init_sgtable().

To clarify this the patch renames blk_rq_check_limits()
to blk_cloned_rq_check_limits() and removes the symbol
export, as the new function should only be used for
cloned requests and never exported.

Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Ewan Milne <emilne@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Fixes: e2a60da74 ("block: Clean up special command handling logic")
Cc: stable@vger.kernel.org # 3.7+
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 21 +++++++--------------
 include/linux/blkdev.h |  1 -
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5131993b..a0af404 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2114,7 +2114,8 @@ blk_qc_t submit_bio(int rw, struct bio *bio)
 EXPORT_SYMBOL(submit_bio);
 
 /**
- * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * blk_cloned_rq_check_limits - Helper function to check a cloned request
+ *                              for new the queue limits
  * @q:  the queue
  * @rq: the request being checked
  *
@@ -2125,20 +2126,13 @@ EXPORT_SYMBOL(submit_bio);
  *    after it is inserted to @q, it should be checked against @q before
  *    the insertion using this generic function.
  *
- *    This function should also be useful for request stacking drivers
- *    in some cases below, so export this function.
  *    Request stacking drivers like request-based dm may change the queue
- *    limits while requests are in the queue (e.g. dm's table swapping).
- *    Such request stacking drivers should check those requests against
- *    the new queue limits again when they dispatch those requests,
- *    although such checkings are also done against the old queue limits
- *    when submitting requests.
+ *    limits when retrying requests on other queues. Those requests need
+ *    to be checked against the new queue limits again during dispatch.
  */
-int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+static int blk_cloned_rq_check_limits(struct request_queue *q,
+				      struct request *rq)
 {
-	if (!rq_mergeable(rq))
-		return 0;
-
 	if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
@@ -2158,7 +2152,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 
 /**
  * blk_insert_cloned_request - Helper for stacking drivers to submit a request
@@ -2170,7 +2163,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	unsigned long flags;
 	int where = ELEVATOR_INSERT_BACK;
 
-	if (blk_rq_check_limits(q, rq))
+	if (blk_cloned_rq_check_limits(q, rq))
 		return -EIO;
 
 	if (rq->rq_disk &&
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c0d2b79..c06f8ea 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -773,7 +773,6 @@ extern void blk_rq_set_block_pc(struct request *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
-extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
-- 
cgit v1.1


From 74cedf9b6c603f2278a05bc91b140b32b434d0b5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 30 Nov 2015 10:15:42 -0700
Subject: direct-io: Fix negative return from dio read beyond eof

Assume a filesystem with 4KB blocks. When a file has size 1000 bytes and
we issue direct IO read at offset 1024, blockdev_direct_IO() reads the
tail of the last block and the logic for handling short DIO reads in
dio_complete() results in a return value -24 (1000 - 1024) which
obviously confuses userspace.

Fix the problem by bailing out early once we sample i_size and can
reliably check that direct IO read starts beyond i_size.

Reported-by: Avi Kivity <avi@scylladb.com>
Fixes: 9fe55eea7e4b444bafc42fa0000cc2d1d2847275
CC: stable@vger.kernel.org
CC: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/direct-io.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index cb5337d..1c75a3a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1169,6 +1169,15 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		}
 	}
 
+	/* Once we sampled i_size check for reads beyond EOF */
+	dio->i_size = i_size_read(inode);
+	if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
+		if (dio->flags & DIO_LOCKING)
+			mutex_unlock(&inode->i_mutex);
+		kmem_cache_free(dio_cache, dio);
+		goto out;
+	}
+
 	/*
 	 * For file extending writes updating i_size before data writeouts
 	 * complete can expose uninitialized blocks in dumb filesystems.
@@ -1222,7 +1231,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	sdio.next_block_for_io = -1;
 
 	dio->iocb = iocb;
-	dio->i_size = i_size_read(inode);
 
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
-- 
cgit v1.1


From a88d32af18b8a6616128c971f766eaf545966405 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Mon, 30 Nov 2015 16:05:49 +0800
Subject: blk-merge: fix computing bio->bi_seg_front_size in case of single
 segment

When bio has only one physical segment, we should set bio's
bi_seg_front_size as the real(final) size of the single segment.

Fixes: 02e707424c2ea(blk-merge: fix blk_bio_segment_split)
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Tested-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 41a55ba..e01405a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -103,6 +103,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			bvprv = bv;
 			bvprvp = &bvprv;
 			sectors += bv.bv_len >> 9;
+
+			if (nsegs == 1 && seg_size > front_seg_size)
+				front_seg_size = seg_size;
 			continue;
 		}
 new_segment:
-- 
cgit v1.1


From 3c395a969acc690f331d5fb91ec99ea8eb5155dd Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@unimore.it>
Date: Tue, 1 Dec 2015 11:48:17 +0100
Subject: null_blk: set a separate timer for each command

For the Timer IRQ mode (i.e., when command completions are delayed),
there is one timer for each CPU. Each of these timers
. has a completion queue associated with it, containing all the
  command completions to be executed when the timer fires;
. is set, and a new completion-to-execute is inserted into its
  completion queue, every time the dispatch code for a new command
  happens to be executed on the CPU related to the timer.

This implies that, if the dispatch of a new command happens to be
executed on a CPU whose timer has already been set, but has not yet
fired, then the timer is set again, to the completion time of the
newly arrived command. When the timer eventually fires, all its queued
completions are executed.

This way of handling delayed command completions entails the following
problem: if more than one command completion is inserted into the
queue of a timer before the timer fires, then the expiration time for
the timer is moved forward every time each of these completions is
enqueued. As a consequence, only the last completion enqueued enjoys a
correct execution time, while all previous completions are unjustly
delayed until the last completion is executed (and at that time they
are executed all together).

Specifically, if all the above completions are enqueued almost at the
same time, then the problem is negligible. On the opposite end, if
every completion is enqueued a while after the previous completion was
enqueued (in the extreme case, it is enqueued only right before the
timer would have expired), then every enqueued completion, except for
the last one, experiences an inflated delay, proportional to the number
of completions enqueued after it. In the end, commands, and thus I/O
requests, may be completed at an arbitrarily lower rate than the
desired one.

This commit addresses this issue by replacing per-CPU timers with
per-command timers, i.e., by associating an individual timer with each
command.

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 79 +++++++++++++++---------------------------------
 1 file changed, 24 insertions(+), 55 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 5c8ba54..08932f5 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -18,6 +18,7 @@ struct nullb_cmd {
 	struct bio *bio;
 	unsigned int tag;
 	struct nullb_queue *nq;
+	struct hrtimer timer;
 };
 
 struct nullb_queue {
@@ -49,17 +50,6 @@ static int null_major;
 static int nullb_indexes;
 static struct kmem_cache *ppa_cache;
 
-struct completion_queue {
-	struct llist_head list;
-	struct hrtimer timer;
-};
-
-/*
- * These are per-cpu for now, they will need to be configured by the
- * complete_queues parameter and appropriately mapped.
- */
-static DEFINE_PER_CPU(struct completion_queue, completion_queues);
-
 enum {
 	NULL_IRQ_NONE		= 0,
 	NULL_IRQ_SOFTIRQ	= 1,
@@ -180,6 +170,8 @@ static void free_cmd(struct nullb_cmd *cmd)
 	put_tag(cmd->nq, cmd->tag);
 }
 
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
+
 static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
 {
 	struct nullb_cmd *cmd;
@@ -190,6 +182,11 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
 		cmd = &nq->cmds[tag];
 		cmd->tag = tag;
 		cmd->nq = nq;
+		if (irqmode == NULL_IRQ_TIMER) {
+			hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
+				     HRTIMER_MODE_REL);
+			cmd->timer.function = null_cmd_timer_expired;
+		}
 		return cmd;
 	}
 
@@ -238,47 +235,28 @@ static void end_cmd(struct nullb_cmd *cmd)
 
 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
 {
-	struct completion_queue *cq;
-	struct llist_node *entry;
-	struct nullb_cmd *cmd;
-
-	cq = &per_cpu(completion_queues, smp_processor_id());
-
-	while ((entry = llist_del_all(&cq->list)) != NULL) {
-		entry = llist_reverse_order(entry);
-		do {
-			struct request_queue *q = NULL;
+	struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
+	struct request_queue *q = NULL;
 
-			cmd = container_of(entry, struct nullb_cmd, ll_list);
-			entry = entry->next;
-			if (cmd->rq)
-				q = cmd->rq->q;
-			end_cmd(cmd);
+	if (cmd->rq)
+		q = cmd->rq->q;
 
-			if (q && !q->mq_ops && blk_queue_stopped(q)) {
-				spin_lock(q->queue_lock);
-				if (blk_queue_stopped(q))
-					blk_start_queue(q);
-				spin_unlock(q->queue_lock);
-			}
-		} while (entry);
+	if (q && !q->mq_ops && blk_queue_stopped(q)) {
+		spin_lock(q->queue_lock);
+		if (blk_queue_stopped(q))
+			blk_start_queue(q);
+		spin_unlock(q->queue_lock);
 	}
+	end_cmd(cmd);
 
 	return HRTIMER_NORESTART;
 }
 
 static void null_cmd_end_timer(struct nullb_cmd *cmd)
 {
-	struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());
-
-	cmd->ll_list.next = NULL;
-	if (llist_add(&cmd->ll_list, &cq->list)) {
-		ktime_t kt = ktime_set(0, completion_nsec);
+	ktime_t kt = ktime_set(0, completion_nsec);
 
-		hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL_PINNED);
-	}
-
-	put_cpu();
+	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
 }
 
 static void null_softirq_done_fn(struct request *rq)
@@ -376,6 +354,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
 {
 	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
+	if (irqmode == NULL_IRQ_TIMER) {
+		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cmd->timer.function = null_cmd_timer_expired;
+	}
 	cmd->rq = bd->rq;
 	cmd->nq = hctx->driver_data;
 
@@ -813,19 +795,6 @@ static int __init null_init(void)
 
 	mutex_init(&lock);
 
-	/* Initialize a separate list for each CPU for issuing softirqs */
-	for_each_possible_cpu(i) {
-		struct completion_queue *cq = &per_cpu(completion_queues, i);
-
-		init_llist_head(&cq->list);
-
-		if (irqmode != NULL_IRQ_TIMER)
-			continue;
-
-		hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		cq->timer.function = null_cmd_timer_expired;
-	}
-
 	null_major = register_blkdev(0, "nullb");
 	if (null_major < 0)
 		return null_major;
-- 
cgit v1.1


From cf8ecc5a8455266f8d516426b2acd36f9bdfa061 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini@google.com>
Date: Tue, 1 Dec 2015 11:48:18 +0100
Subject: null_blk: guarantee device restart in all irq modes

In single-queue (block layer) mode,the function null_rq_prep_fn stops
the device if alloc_cmd fails. Then, once stopped, the device must be
restarted on the next command completion, so that the request(s) for
which alloc_cmd failed can be requeued. Otherwise the device hangs.

Unfortunately, device restart is currently performed only for delayed
completions, i.e., in irqmode==2. This fact causes hangs, for the
above reasons, with the other irqmodes in combination with single-queue
block layer.

This commits addresses this issue by making sure that, if stopped, the
device is properly restarted for all irqmodes on completions.

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna AVanzini <avanzini@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 08932f5..cf65619 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -217,6 +217,8 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
 
 static void end_cmd(struct nullb_cmd *cmd)
 {
+	struct request_queue *q = NULL;
+
 	switch (queue_mode)  {
 	case NULL_Q_MQ:
 		blk_mq_end_request(cmd->rq, 0);
@@ -227,27 +229,28 @@ static void end_cmd(struct nullb_cmd *cmd)
 		break;
 	case NULL_Q_BIO:
 		bio_endio(cmd->bio);
-		break;
+		goto free_cmd;
 	}
 
-	free_cmd(cmd);
-}
-
-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
-{
-	struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
-	struct request_queue *q = NULL;
-
 	if (cmd->rq)
 		q = cmd->rq->q;
 
+	/* Restart queue if needed, as we are freeing a tag */
 	if (q && !q->mq_ops && blk_queue_stopped(q)) {
-		spin_lock(q->queue_lock);
+		unsigned long flags;
+
+		spin_lock_irqsave(q->queue_lock, flags);
 		if (blk_queue_stopped(q))
 			blk_start_queue(q);
-		spin_unlock(q->queue_lock);
+		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
-	end_cmd(cmd);
+free_cmd:
+	free_cmd(cmd);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
+{
+	end_cmd(container_of(timer, struct nullb_cmd, timer));
 
 	return HRTIMER_NORESTART;
 }
-- 
cgit v1.1


From dbac117542b7e814245c43daa638a3626230cb2a Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini@google.com>
Date: Tue, 1 Dec 2015 11:48:19 +0100
Subject: null_blk: change type of completion_nsec to unsigned long

This commit at least doubles the maximum value for
completion_nsec. This helps in special cases where one wants/needs to
emulate an extremely slow I/O (for example to spot bugs).

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cf65619..0c3940e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -132,8 +132,8 @@ static const struct kernel_param_ops null_irqmode_param_ops = {
 device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
 MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
 
-static int completion_nsec = 10000;
-module_param(completion_nsec, int, S_IRUGO);
+static unsigned long completion_nsec = 10000;
+module_param(completion_nsec, ulong, S_IRUGO);
 MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
 
 static int hw_queue_depth = 64;
-- 
cgit v1.1


From 1f390c1fde3a96974784be53cb3a645da3e4849c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stephan=20G=C3=BCnther?= <guenther@tum.de>
Date: Tue, 1 Dec 2015 13:23:22 -0700
Subject: nvme: temporary fix for Apple controller reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recent patches added basic support for the Apple NVMe controller but
still cause resets and data corruption on that particular controller
when a specific pattern of read/flush commands occurs. Limiting the
queue depth to 2 works around that issue.

This patch enforces that limit only for the Apple controller and is
considered a temporary fix until we find the root source of that
problem.

Signed-off-by: Stephan Günther <guenther@tum.de>
Signed-off-by: Maurice Leclaire <leclaire@in.tum.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/pci.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f3b53af..9e294ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2708,6 +2708,18 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
 	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
 	dev->dbs = ((void __iomem *)dev->bar) + 4096;
+
+	/*
+	 * Temporary fix for the Apple controller found in the MacBook8,1 and
+	 * some MacBook7,1 to avoid controller resets and data loss.
+	 */
+	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
+		dev->q_depth = 2;
+		dev_warn(dev->dev, "detected Apple NVMe controller, set "
+			"queue depth=%u to work around controller resets\n",
+			dev->q_depth);
+	}
+
 	if (readl(&dev->bar->vs) >= NVME_VS(1, 2))
 		dev->cmb = nvme_map_cmb(dev);
 
-- 
cgit v1.1