summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsmh <smh@FreeBSD.org>2015-12-11 02:06:03 +0000
committersmh <smh@FreeBSD.org>2015-12-11 02:06:03 +0000
commit0026debd97a47e3b952758c1d81371421ab2c7bf (patch)
treeb548318abda7d9be5d710d5b0d1939fd5d567cf2
parentd153dbf650bab72803c26528301ccdbda7ed4b0d (diff)
downloadFreeBSD-src-0026debd97a47e3b952758c1d81371421ab2c7bf.zip
FreeBSD-src-0026debd97a47e3b952758c1d81371421ab2c7bf.tar.gz
Limit stripesize reported from nvd(4) to 4K
Intel NVMe controllers have a slow path for I/Os that span a 128KB stripe boundary but ZFS limits ashift, which is derived from d_stripesize, to 13 (8KB) so we limit the stripesize reported to geom(8) to 4KB. This may result in a small number of additional I/Os to require splitting in nvme(4), however the NVMe I/O path is very efficient so these additional I/Os will cause very minimal (if any) difference in performance or CPU utilisation. This can be controlled by the new sysctl kern.nvme.max_optimal_sectorsize. MFC after: 1 week Sponsored by: Multiplay Differential Revision: https://reviews.freebsd.org/D4446
-rw-r--r--sys/dev/nvd/nvd.c2
-rw-r--r--sys/dev/nvme/nvme.h1
-rw-r--r--sys/dev/nvme/nvme_ns.c18
-rw-r--r--sys/dev/nvme/nvme_sysctl.c16
4 files changed, 36 insertions, 1 deletions
diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c
index 5d75876..f459e06 100644
--- a/sys/dev/nvd/nvd.c
+++ b/sys/dev/nvd/nvd.c
@@ -279,7 +279,7 @@ nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
disk->d_sectorsize = nvme_ns_get_sector_size(ns);
disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
- disk->d_stripesize = nvme_ns_get_stripesize(ns);
+ disk->d_stripesize = nvme_ns_get_optimal_sector_size(ns);
if (TAILQ_EMPTY(&disk_head))
disk->d_unit = 0;
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index 7e41e77..227a89e 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -870,6 +870,7 @@ const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
const struct nvme_namespace_data *
nvme_ns_get_data(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_optimal_sector_size(struct nvme_namespace *ns);
uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c
index 754d074..4580e66 100644
--- a/sys/dev/nvme/nvme_ns.c
+++ b/sys/dev/nvme/nvme_ns.c
@@ -45,6 +45,8 @@ __FBSDID("$FreeBSD$");
#include "nvme_private.h"
+extern int nvme_max_optimal_sectorsize;
+
static void nvme_bio_child_inbed(struct bio *parent, int bio_error);
static void nvme_bio_child_done(void *arg,
const struct nvme_completion *cpl);
@@ -217,6 +219,22 @@ nvme_ns_get_stripesize(struct nvme_namespace *ns)
return (ns->stripesize);
}
+uint32_t
+nvme_ns_get_optimal_sector_size(struct nvme_namespace *ns)
+{
+ uint32_t stripesize;
+
+ stripesize = nvme_ns_get_stripesize(ns);
+
+ if (stripesize == 0)
+ return nvme_ns_get_sector_size(ns);
+
+ if (nvme_max_optimal_sectorsize == 0)
+ return (stripesize);
+
+ return (MIN(stripesize, nvme_max_optimal_sectorsize));
+}
+
static void
nvme_ns_bio_done(void *arg, const struct nvme_completion *status)
{
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index 0ebbbf7..8b99111 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -33,6 +33,22 @@ __FBSDID("$FreeBSD$");
#include "nvme_private.h"
+SYSCTL_NODE(_kern, OID_AUTO, nvme, CTLFLAG_RD, 0, "NVM Express");
+/*
+ * Intel NVMe controllers have a slow path for I/Os that span a 128KB
+ * stripe boundary but ZFS limits ashift, which is derived from
+ * d_stripesize, to 13 (8KB) so we limit the stripesize reported to
+ * geom(8) to 4KB by default.
+ *
+ * This may result in a small number of additional I/Os to require
+ * splitting in nvme(4), however the NVMe I/O path is very efficient
+ * so these additional I/Os will cause very minimal (if any) difference
+ * in performance or CPU utilisation.
+ */
+int nvme_max_optimal_sectorsize = 1<<12;
+SYSCTL_INT(_kern_nvme, OID_AUTO, max_optimal_sectorsize, CTLFLAG_RWTUN,
+ &nvme_max_optimal_sectorsize, 0, "The maximum optimal sectorsize reported");
+
/*
* CTLTYPE_S64 and sysctl_handle_64 were added in r217616. Define these
* explicitly here for older kernels that don't include the r217616
OpenPOWER on IntegriCloud