path: root/sys
author     jimharris <jimharris@FreeBSD.org>  2012-09-17 19:23:01 +0000
committer  jimharris <jimharris@FreeBSD.org>  2012-09-17 19:23:01 +0000
commit     99662f533f035115d9b35d04f784c856a6bf7cc3 (patch)
tree       c8c46274e99382fa4ac1e233a8ae5469c7b912c3 /sys
parent     cde424c3369d2f37888aae925d9cfb3ce31e544b (diff)
download   FreeBSD-src-99662f533f035115d9b35d04f784c856a6bf7cc3.zip
           FreeBSD-src-99662f533f035115d9b35d04f784c856a6bf7cc3.tar.gz
This is the first of several commits which will add NVM Express (NVMe)
support to FreeBSD.  A full description of the overall functionality
being added is below.  nvmexpress.org defines NVM Express as "an optimized
register interface, command set and feature set for PCI Express (PCIe)-based
Solid-State Drives (SSDs)."

This commit adds nvme(4) and nvd(4) driver source code and Makefiles
to the tree.

Full NVMe functionality description:

Add nvme(4) and nvd(4) drivers and nvmecontrol(8) for NVM Express (NVMe)
device support.

There will continue to be ongoing work on NVM Express support, but there
is more than enough here to allow for evaluation of pre-production NVM
Express devices as well as soliciting feedback.  Questions and feedback
are welcome.

nvme(4) implements NVMe hardware abstraction and is a provider of NVMe
namespaces.  The closest equivalent of an NVMe namespace is a SCSI LUN.
nvd(4) is an NVMe consumer, surfacing NVMe namespaces as GEOM disks.
nvmecontrol(8) is used for NVMe configuration and management.

The following are currently supported:

nvme(4)
- full mandatory NVM command set support
- per-CPU IO queues (enabled by default but configurable)
- per-queue sysctls for statistics and full command/completion queue
  dumps for debugging
- registration API for NVMe namespace consumers
- I/O error handling (except for timeouts - see below)
- compilation switches for support back to stable-7

nvd(4)
- BIO_DELETE and BIO_FLUSH (if supported by controller)
- proper BIO_ORDERED handling

nvmecontrol(8)
- devlist: list NVMe controllers and their namespaces
- identify: display controller or namespace identify data in
  human-readable or hex format
- perftest: quick and dirty performance test to measure raw
  performance of an NVMe device without userspace/physio/GEOM overhead

The following are still work in progress and will be completed over the
next 3-6 months in rough priority order:
- complete man pages
- firmware download and activation
- asynchronous error requests
- command timeout error handling
- controller resets
- nvmecontrol(8) log page retrieval

This has been primarily tested on amd64, with light testing on i386.  I
would be happy to provide assistance to anyone interested in porting
this to other architectures, but am not currently planning to do this
work myself.  Big-endian and dmamap sync for command/completion queues
are the main areas that would need to be addressed.

The nvme(4) driver currently has references to Chatham, which is an
Intel-developed prototype board that is not fully spec compliant.
These references will all be removed over time.

Sponsored by:        Intel
Contributions from:  Joe Golio/EMC <joseph dot golio at emc dot com>
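For readers evaluating the consumer registration API mentioned above (declared
in nvme.h below and used by nvd.c), a namespace consumer is simply a kernel
module that registers a per-namespace callback.  The following is an
illustrative sketch only - it is not part of this commit, and the module name
and callback are hypothetical - modeled on how nvd(4) registers itself:

    /*
     * Minimal nvme(4) namespace consumer sketch (hypothetical, for
     * illustration).  Registers a callback that is invoked once per
     * namespace on each attached controller, then unregisters on unload.
     */
    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/module.h>
    #include <sys/systm.h>

    #include <dev/nvme/nvme.h>

    static struct nvme_consumer *example_handle;

    /* Matches nvme_consumer_cb_fn_t: called for each NVMe namespace. */
    static void
    example_ns_attach(void *arg, struct nvme_namespace *ns)
    {
            printf("nvme_example: namespace with %u-byte sectors, "
                "%ju bytes total\n",
                nvme_ns_get_sector_size(ns),
                (uintmax_t)nvme_ns_get_size(ns));
    }

    static int
    example_modevent(module_t mod, int type, void *arg)
    {
            switch (type) {
            case MOD_LOAD:
                    example_handle =
                        nvme_register_consumer(example_ns_attach, NULL);
                    return (example_handle != NULL ? 0 : ENXIO);
            case MOD_UNLOAD:
                    nvme_unregister_consumer(example_handle);
                    return (0);
            default:
                    return (EOPNOTSUPP);
            }
    }

    static moduledata_t example_mod = {
            "nvme_example",
            example_modevent,
            NULL
    };

    DECLARE_MODULE(nvme_example, example_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
    MODULE_DEPEND(nvme_example, nvme, 1, 1, 1);

nvd(4), added in this commit, is the in-tree consumer of this API and is the
authoritative reference for how namespaces are surfaced as GEOM disks.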
Diffstat (limited to 'sys')
-rw-r--r--  sys/dev/nvd/nvd.c              318
-rw-r--r--  sys/dev/nvme/nvme.c            408
-rw-r--r--  sys/dev/nvme/nvme.h            738
-rw-r--r--  sys/dev/nvme/nvme_ctrlr.c      787
-rw-r--r--  sys/dev/nvme/nvme_ctrlr_cmd.c  312
-rw-r--r--  sys/dev/nvme/nvme_ns.c         364
-rw-r--r--  sys/dev/nvme/nvme_ns_cmd.c     120
-rw-r--r--  sys/dev/nvme/nvme_private.h    369
-rw-r--r--  sys/dev/nvme/nvme_qpair.c      422
-rw-r--r--  sys/dev/nvme/nvme_sysctl.c     187
-rw-r--r--  sys/dev/nvme/nvme_test.c       305
-rw-r--r--  sys/dev/nvme/nvme_uio.c        180
12 files changed, 4510 insertions, 0 deletions
diff --git a/sys/dev/nvd/nvd.c b/sys/dev/nvd/nvd.c
new file mode 100644
index 0000000..2017d79
--- /dev/null
+++ b/sys/dev/nvd/nvd.c
@@ -0,0 +1,318 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <geom/geom.h>
+#include <geom/geom_disk.h>
+
+#include <dev/nvme/nvme.h>
+
+struct nvd_disk;
+
+static disk_ioctl_t nvd_ioctl;
+static disk_strategy_t nvd_strategy;
+
+static void create_geom_disk(void *, struct nvme_namespace *ns);
+static void destroy_geom_disk(struct nvd_disk *ndisk);
+
+static int nvd_load(void);
+static void nvd_unload(void);
+
+MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");
+
+struct nvme_consumer *consumer_handle;
+
+struct nvd_disk {
+
+ struct bio_queue_head bioq;
+ struct task bioqtask;
+ struct mtx bioqlock;
+
+ struct disk *disk;
+ struct taskqueue *tq;
+ struct nvme_namespace *ns;
+
+ uint32_t cur_depth;
+
+ TAILQ_ENTRY(nvd_disk) tailq;
+};
+
+TAILQ_HEAD(, nvd_disk) nvd_head;
+
+static int nvd_modevent(module_t mod, int type, void *arg)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = nvd_load();
+ break;
+ case MOD_UNLOAD:
+ nvd_unload();
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+moduledata_t nvd_mod = {
+ "nvd",
+ (modeventhand_t)nvd_modevent,
+ 0
+};
+
+DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
+MODULE_VERSION(nvd, 1);
+MODULE_DEPEND(nvd, nvme, 1, 1, 1);
+
+static int
+nvd_load()
+{
+
+ TAILQ_INIT(&nvd_head);
+ consumer_handle = nvme_register_consumer(create_geom_disk, NULL);
+
+ return (consumer_handle != NULL ? 0 : -1);
+}
+
+static void
+nvd_unload()
+{
+ struct nvd_disk *nvd;
+
+ while (!TAILQ_EMPTY(&nvd_head)) {
+ nvd = TAILQ_FIRST(&nvd_head);
+ TAILQ_REMOVE(&nvd_head, nvd, tailq);
+ destroy_geom_disk(nvd);
+ free(nvd, M_NVD);
+ }
+
+ nvme_unregister_consumer(consumer_handle);
+}
+
+static void
+nvd_strategy(struct bio *bp)
+{
+ struct nvd_disk *ndisk;
+
+ ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;
+
+ mtx_lock(&ndisk->bioqlock);
+ bioq_insert_tail(&ndisk->bioq, bp);
+ mtx_unlock(&ndisk->bioqlock);
+ taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
+}
+
+static int
+nvd_ioctl(struct disk *ndisk, u_long cmd, void *data, int fflag,
+ struct thread *td)
+{
+ int ret = 0;
+
+ switch (cmd) {
+ default:
+ ret = EIO;
+ }
+
+ return (ret);
+}
+
+static void
+nvd_done(void *arg, const struct nvme_completion *status)
+{
+ struct bio *bp;
+ struct nvd_disk *ndisk;
+
+ bp = (struct bio *)arg;
+
+ ndisk = bp->bio_disk->d_drv1;
+
+ if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == NVME_QD)
+ taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
+
+ /*
+ * TODO: add more extensive translation of NVMe status codes
+ * to different bio error codes (i.e. EIO, EINVAL, etc.)
+ */
+ if (status->sf_sc || status->sf_sct) {
+ bp->bio_error = EIO;
+ bp->bio_flags |= BIO_ERROR;
+ bp->bio_resid = bp->bio_bcount;
+ } else
+ bp->bio_resid = 0;
+
+ biodone(bp);
+}
+
+static void
+nvd_bioq_process(void *arg, int pending)
+{
+ struct nvd_disk *ndisk = arg;
+ struct bio *bp;
+ int err;
+
+ for (;;) {
+ if (atomic_load_acq_int(&ndisk->cur_depth) >= NVME_QD)
+ break;
+
+ mtx_lock(&ndisk->bioqlock);
+ bp = bioq_takefirst(&ndisk->bioq);
+ mtx_unlock(&ndisk->bioqlock);
+ if (bp == NULL)
+ break;
+
+#ifdef BIO_ORDERED
+ /*
+ * BIO_ORDERED flag dictates that all outstanding bios
+ * must be completed before processing the bio with
+ * BIO_ORDERED flag set.
+ */
+ if (bp->bio_flags & BIO_ORDERED) {
+ while (ndisk->cur_depth > 0) {
+ pause("nvd flush", 1);
+ }
+ }
+#endif
+
+ bp->bio_driver1 = NULL;
+ atomic_add_acq_int(&ndisk->cur_depth, 1);
+
+ err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
+
+ if (err) {
+ atomic_add_acq_int(&ndisk->cur_depth, -1);
+ bp->bio_error = EIO;
+ bp->bio_flags |= BIO_ERROR;
+ bp->bio_resid = bp->bio_bcount;
+ biodone(bp);
+ }
+
+#ifdef BIO_ORDERED
+ /*
+ * BIO_ORDERED flag dictates that the bio with BIO_ORDERED
+ * flag set must be completed before proceeding with
+ * additional bios.
+ */
+ if (bp->bio_flags & BIO_ORDERED) {
+ while (ndisk->cur_depth > 0) {
+ pause("nvd flush", 1);
+ }
+ }
+#endif
+ }
+}
+
+static void
+create_geom_disk(void *arg, struct nvme_namespace *ns)
+{
+ struct nvd_disk *ndisk;
+ struct disk *disk;
+
+ ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_NOWAIT);
+
+ disk = disk_alloc();
+ disk->d_strategy = nvd_strategy;
+ disk->d_ioctl = nvd_ioctl;
+ disk->d_name = "nvd";
+ disk->d_drv1 = ndisk;
+
+ disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
+ disk->d_sectorsize = nvme_ns_get_sector_size(ns);
+ disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
+
+ if (TAILQ_EMPTY(&nvd_head))
+ disk->d_unit = 0;
+ else
+ disk->d_unit = TAILQ_FIRST(&nvd_head)->disk->d_unit + 1;
+
+ disk->d_flags = 0;
+
+ if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
+ disk->d_flags |= DISKFLAG_CANDELETE;
+
+ if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
+ disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
+
+ strlcpy(disk->d_ident, nvme_ns_get_serial_number(ns),
+ sizeof(disk->d_ident));
+
+#if __FreeBSD_version >= 900034
+ strlcpy(disk->d_descr, nvme_ns_get_model_number(ns),
+ sizeof(disk->d_descr));
+#endif
+
+ disk_create(disk, DISK_VERSION);
+
+ ndisk->ns = ns;
+ ndisk->disk = disk;
+ ndisk->cur_depth = 0;
+
+ mtx_init(&ndisk->bioqlock, "NVD bioq lock", NULL, MTX_DEF);
+ bioq_init(&ndisk->bioq);
+
+ TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
+ ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
+ taskqueue_thread_enqueue, &ndisk->tq);
+ taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");
+
+ TAILQ_INSERT_HEAD(&nvd_head, ndisk, tailq);
+}
+
+static void
+destroy_geom_disk(struct nvd_disk *ndisk)
+{
+ struct bio *bp;
+
+ taskqueue_free(ndisk->tq);
+ disk_destroy(ndisk->disk);
+
+ mtx_lock(&ndisk->bioqlock);
+ for (;;) {
+ bp = bioq_takefirst(&ndisk->bioq);
+ if (bp == NULL)
+ break;
+ bp->bio_error = EIO;
+ bp->bio_flags |= BIO_ERROR;
+ bp->bio_resid = bp->bio_bcount;
+
+ biodone(bp);
+ }
+ mtx_unlock(&ndisk->bioqlock);
+
+ mtx_destroy(&ndisk->bioqlock);
+}
diff --git a/sys/dev/nvme/nvme.c b/sys/dev/nvme/nvme.c
new file mode 100644
index 0000000..f7f925d
--- /dev/null
+++ b/sys/dev/nvme/nvme.c
@@ -0,0 +1,408 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/module.h>
+
+#include <dev/pci/pcivar.h>
+
+#include "nvme_private.h"
+
+struct nvme_consumer {
+ nvme_consumer_cb_fn_t cb_fn;
+ void *cb_arg;
+};
+
+struct nvme_consumer nvme_consumer[NVME_MAX_CONSUMERS];
+
+MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations");
+
+static int nvme_probe(device_t);
+static int nvme_attach(device_t);
+static int nvme_detach(device_t);
+
+static devclass_t nvme_devclass;
+
+static device_method_t nvme_pci_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, nvme_probe),
+ DEVMETHOD(device_attach, nvme_attach),
+ DEVMETHOD(device_detach, nvme_detach),
+ { 0, 0 }
+};
+
+static driver_t nvme_pci_driver = {
+ "nvme",
+ nvme_pci_methods,
+ sizeof(struct nvme_controller),
+};
+
+DRIVER_MODULE(nvme, pci, nvme_pci_driver, nvme_devclass, 0, 0);
+MODULE_VERSION(nvme, 1);
+
+static struct _pcsid
+{
+ u_int32_t type;
+ const char *desc;
+} pci_ids[] = {
+ { 0x01118086, "NVMe Controller" },
+ { CHATHAM_PCI_ID, "Chatham Prototype NVMe Controller" },
+ { IDT_PCI_ID, "IDT NVMe Controller" },
+ { 0x00000000, NULL }
+};
+
+static int
+nvme_probe (device_t device)
+{
+ u_int32_t type = pci_get_devid(device);
+ struct _pcsid *ep = pci_ids;
+
+ while (ep->type && ep->type != type)
+ ++ep;
+
+ if (ep->desc) {
+ device_set_desc(device, ep->desc);
+ return (BUS_PROBE_DEFAULT);
+ } else
+ return (ENXIO);
+}
+
+static void
+nvme_load(void)
+{
+}
+
+static void
+nvme_unload(void)
+{
+}
+
+static void
+nvme_shutdown(void)
+{
+ device_t *devlist;
+ struct nvme_controller *ctrlr;
+ union cc_register cc;
+ union csts_register csts;
+ int dev, devcount;
+
+ if (devclass_get_devices(nvme_devclass, &devlist, &devcount))
+ return;
+
+ for (dev = 0; dev < devcount; dev++) {
+ /*
+ * Only notify controller of shutdown when a real shutdown is
+ * in process, not when a module unload occurs. It seems at
+ * least some controllers (Chatham at least) don't let you
+ * re-enable the controller after shutdown notification has
+ * been received.
+ */
+ ctrlr = DEVICE2SOFTC(devlist[dev]);
+ cc.raw = nvme_mmio_read_4(ctrlr, cc);
+ cc.bits.shn = NVME_SHN_NORMAL;
+ nvme_mmio_write_4(ctrlr, cc, cc.raw);
+ csts.raw = nvme_mmio_read_4(ctrlr, csts);
+ while (csts.bits.shst != NVME_SHST_COMPLETE) {
+ DELAY(5);
+ csts.raw = nvme_mmio_read_4(ctrlr, csts);
+ }
+ }
+
+ free(devlist, M_TEMP);
+}
+
+static int
+nvme_modevent(module_t mod, int type, void *arg)
+{
+
+ switch (type) {
+ case MOD_LOAD:
+ nvme_load();
+ break;
+ case MOD_UNLOAD:
+ nvme_unload();
+ break;
+ case MOD_SHUTDOWN:
+ nvme_shutdown();
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+moduledata_t nvme_mod = {
+ "nvme",
+ (modeventhand_t)nvme_modevent,
+ 0
+};
+
+DECLARE_MODULE(nvme, nvme_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+
+void
+nvme_dump_command(struct nvme_command *cmd)
+{
+ printf("opc:%x f:%x r1:%x cid:%x nsid:%x r2:%x r3:%x "
+ "mptr:%qx prp1:%qx prp2:%qx cdw:%x %x %x %x %x %x\n",
+ cmd->opc, cmd->fuse, cmd->rsvd1, cmd->cid, cmd->nsid,
+ cmd->rsvd2, cmd->rsvd3,
+ (long long unsigned int)cmd->mptr,
+ (long long unsigned int)cmd->prp1,
+ (long long unsigned int)cmd->prp2,
+ cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
+ cmd->cdw15);
+}
+
+void
+nvme_dump_completion(struct nvme_completion *cpl)
+{
+ printf("cdw0:%08x sqhd:%04x sqid:%04x "
+ "cid:%04x p:%x sc:%02x sct:%x m:%x dnr:%x\n",
+ cpl->cdw0, cpl->sqhd, cpl->sqid,
+ cpl->cid, cpl->p, cpl->sf_sc, cpl->sf_sct, cpl->sf_m,
+ cpl->sf_dnr);
+}
+
+void
+nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
+{
+ struct nvme_tracker *tr;
+ struct nvme_qpair *qpair;
+ struct nvme_prp_list *prp_list;
+ uint32_t cur_nseg;
+
+ KASSERT(error == 0, ("nvme_payload_map error != 0\n"));
+
+ tr = (struct nvme_tracker *)arg;
+ qpair = tr->qpair;
+
+ /*
+ * Note that we specified PAGE_SIZE for alignment and max
+ * segment size when creating the bus dma tags. So here
+ * we can safely just transfer each segment to its
+ * associated PRP entry.
+ */
+ tr->cmd.prp1 = seg[0].ds_addr;
+
+ if (nseg == 2) {
+ tr->cmd.prp2 = seg[1].ds_addr;
+ } else if (nseg > 2) {
+ KASSERT(tr->prp_list,
+ ("prp_list needed but not attached to tracker\n"));
+ cur_nseg = 1;
+ prp_list = tr->prp_list;
+ tr->cmd.prp2 = (uint64_t)prp_list->bus_addr;
+ while (cur_nseg < nseg) {
+ prp_list->prp[cur_nseg-1] =
+ (uint64_t)seg[cur_nseg].ds_addr;
+ cur_nseg++;
+ }
+ }
+
+ nvme_qpair_submit_cmd(qpair, tr);
+}
+
+struct nvme_tracker *
+nvme_allocate_tracker(struct nvme_controller *ctrlr, boolean_t is_admin,
+ nvme_cb_fn_t cb_fn, void *cb_arg, uint32_t payload_size, void *payload)
+{
+ struct nvme_tracker *tr;
+ struct nvme_qpair *qpair;
+ uint32_t modulo, offset, num_prps;
+ boolean_t alloc_prp_list = FALSE;
+
+ if (is_admin) {
+ qpair = &ctrlr->adminq;
+ } else {
+ if (ctrlr->per_cpu_io_queues)
+ qpair = &ctrlr->ioq[curcpu];
+ else
+ qpair = &ctrlr->ioq[0];
+ }
+
+ num_prps = payload_size / PAGE_SIZE;
+ modulo = payload_size % PAGE_SIZE;
+ offset = (uint32_t)((uintptr_t)payload % PAGE_SIZE);
+
+ if (modulo || offset)
+ num_prps += 1 + (modulo + offset - 1) / PAGE_SIZE;
+
+ if (num_prps > 2)
+ alloc_prp_list = TRUE;
+
+ tr = nvme_qpair_allocate_tracker(qpair, alloc_prp_list);
+
+ memset(&tr->cmd, 0, sizeof(tr->cmd));
+
+ tr->qpair = qpair;
+ tr->cb_fn = cb_fn;
+ tr->cb_arg = cb_arg;
+ tr->payload_size = payload_size;
+
+ return (tr);
+}
+
+static int
+nvme_attach(device_t dev)
+{
+ struct nvme_controller *ctrlr = DEVICE2SOFTC(dev);
+ int status;
+
+ status = nvme_ctrlr_construct(ctrlr, dev);
+
+ if (status != 0)
+ return (status);
+
+ /*
+ * Reset controller twice to ensure we do a transition from cc.en==1
+ * to cc.en==0. This is because we don't really know what status
+ * the controller was left in when boot handed off to OS.
+ */
+ status = nvme_ctrlr_reset(ctrlr);
+ if (status != 0)
+ return (status);
+
+ status = nvme_ctrlr_reset(ctrlr);
+ if (status != 0)
+ return (status);
+
+ ctrlr->config_hook.ich_func = nvme_ctrlr_start;
+ ctrlr->config_hook.ich_arg = ctrlr;
+
+ config_intrhook_establish(&ctrlr->config_hook);
+
+ return (0);
+}
+
+static int
+nvme_detach (device_t dev)
+{
+ struct nvme_controller *ctrlr = DEVICE2SOFTC(dev);
+ struct nvme_namespace *ns;
+ int i;
+
+ if (ctrlr->taskqueue) {
+ taskqueue_drain(ctrlr->taskqueue, &ctrlr->task);
+ taskqueue_free(ctrlr->taskqueue);
+ }
+
+ for (i = 0; i < NVME_MAX_NAMESPACES; i++) {
+ ns = &ctrlr->ns[i];
+ if (ns->cdev)
+ destroy_dev(ns->cdev);
+ }
+
+ if (ctrlr->cdev)
+ destroy_dev(ctrlr->cdev);
+
+ for (i = 0; i < ctrlr->num_io_queues; i++) {
+ nvme_io_qpair_destroy(&ctrlr->ioq[i]);
+ }
+
+ free(ctrlr->ioq, M_NVME);
+
+ nvme_admin_qpair_destroy(&ctrlr->adminq);
+
+ if (ctrlr->resource != NULL) {
+ bus_release_resource(dev, SYS_RES_MEMORY,
+ ctrlr->resource_id, ctrlr->resource);
+ }
+
+#ifdef CHATHAM2
+ if (ctrlr->chatham_resource != NULL) {
+ bus_release_resource(dev, SYS_RES_MEMORY,
+ ctrlr->chatham_resource_id, ctrlr->chatham_resource);
+ }
+#endif
+
+ if (ctrlr->tag)
+ bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
+
+ if (ctrlr->res)
+ bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
+ rman_get_rid(ctrlr->res), ctrlr->res);
+
+ if (ctrlr->msix_enabled)
+ pci_release_msi(dev);
+
+ return (0);
+}
+
+static void
+nvme_notify_consumer(struct nvme_consumer *consumer)
+{
+ device_t *devlist;
+ struct nvme_controller *ctrlr;
+ int dev, ns, devcount;
+
+ if (devclass_get_devices(nvme_devclass, &devlist, &devcount))
+ return;
+
+ for (dev = 0; dev < devcount; dev++) {
+ ctrlr = DEVICE2SOFTC(devlist[dev]);
+ for (ns = 0; ns < ctrlr->cdata.nn; ns++)
+ (*consumer->cb_fn)(consumer->cb_arg, &ctrlr->ns[ns]);
+ }
+
+ free(devlist, M_TEMP);
+}
+
+struct nvme_consumer *
+nvme_register_consumer(nvme_consumer_cb_fn_t cb_fn, void *cb_arg)
+{
+ int i;
+
+ /*
+ * TODO: add locking around consumer registration. Not an issue
+ * right now since we only have one nvme consumer - nvd(4).
+ */
+ for (i = 0; i < NVME_MAX_CONSUMERS; i++)
+ if (nvme_consumer[i].cb_fn == NULL) {
+ nvme_consumer[i].cb_fn = cb_fn;
+ nvme_consumer[i].cb_arg = cb_arg;
+
+ nvme_notify_consumer(&nvme_consumer[i]);
+ return (&nvme_consumer[i]);
+ }
+
+ printf("nvme(4): consumer not registered - no slots available\n");
+ return (NULL);
+}
+
+void
+nvme_unregister_consumer(struct nvme_consumer *consumer)
+{
+
+ consumer->cb_fn = NULL;
+ consumer->cb_arg = NULL;
+}
+
diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
new file mode 100644
index 0000000..c9a5686
--- /dev/null
+++ b/sys/dev/nvme/nvme.h
@@ -0,0 +1,738 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __NVME_H__
+#define __NVME_H__
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define NVME_IDENTIFY_CONTROLLER _IOR('n', 0, struct nvme_controller_data)
+#define NVME_IDENTIFY_NAMESPACE _IOR('n', 1, struct nvme_namespace_data)
+#define NVME_IO_TEST _IOWR('n', 2, struct nvme_io_test)
+#define NVME_BIO_TEST _IOWR('n', 4, struct nvme_io_test)
+
+/*
+ * Use to mark a command to apply to all namespaces, or to retrieve global
+ * log pages.
+ */
+#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
+
+union cap_lo_register {
+ uint32_t raw;
+ struct {
+ /** maximum queue entries supported */
+ uint32_t mqes : 16;
+
+ /** contiguous queues required */
+ uint32_t cqr : 1;
+
+ /** arbitration mechanism supported */
+ uint32_t ams : 2;
+
+ uint32_t reserved1 : 5;
+
+ /** timeout */
+ uint32_t to : 8;
+ } bits __packed;
+} __packed;
+
+union cap_hi_register {
+ uint32_t raw;
+ struct {
+ /** doorbell stride */
+ uint32_t dstrd : 4;
+
+ uint32_t reserved3 : 1;
+
+ /** command sets supported */
+ uint32_t css_nvm : 1;
+
+ uint32_t css_reserved : 3;
+ uint32_t reserved2 : 7;
+
+ /** memory page size minimum */
+ uint32_t mpsmin : 4;
+
+ /** memory page size maximum */
+ uint32_t mpsmax : 4;
+
+ uint32_t reserved1 : 8;
+ } bits __packed;
+} __packed;
+
+union cc_register {
+ uint32_t raw;
+ struct {
+ /** enable */
+ uint32_t en : 1;
+
+ uint32_t reserved1 : 3;
+
+ /** i/o command set selected */
+ uint32_t css : 3;
+
+ /** memory page size */
+ uint32_t mps : 4;
+
+ /** arbitration mechanism selected */
+ uint32_t ams : 3;
+
+ /** shutdown notification */
+ uint32_t shn : 2;
+
+ /** i/o submission queue entry size */
+ uint32_t iosqes : 4;
+
+ /** i/o completion queue entry size */
+ uint32_t iocqes : 4;
+
+ uint32_t reserved2 : 8;
+ } bits __packed;
+} __packed;
+
+enum shn_value {
+ NVME_SHN_NORMAL = 0x1,
+ NVME_SHN_ABRUPT = 0x2,
+};
+
+union csts_register {
+ uint32_t raw;
+ struct {
+ /** ready */
+ uint32_t rdy : 1;
+
+ /** controller fatal status */
+ uint32_t cfs : 1;
+
+ /** shutdown status */
+ uint32_t shst : 2;
+
+ uint32_t reserved1 : 28;
+ } bits __packed;
+} __packed;
+
+enum shst_value {
+ NVME_SHST_NORMAL = 0x0,
+ NVME_SHST_OCCURRING = 0x1,
+ NVME_SHST_COMPLETE = 0x2,
+};
+
+union aqa_register {
+ uint32_t raw;
+ struct {
+ /** admin submission queue size */
+ uint32_t asqs : 12;
+
+ uint32_t reserved1 : 4;
+
+ /** admin completion queue size */
+ uint32_t acqs : 12;
+
+ uint32_t reserved2 : 4;
+ } bits __packed;
+} __packed;
+
+struct nvme_registers
+{
+ /** controller capabilities */
+ union cap_lo_register cap_lo;
+ union cap_hi_register cap_hi;
+
+ uint32_t vs; /* version */
+ uint32_t intms; /* interrupt mask set */
+ uint32_t intmc; /* interrupt mask clear */
+
+ /** controller configuration */
+ union cc_register cc;
+
+ uint32_t reserved1;
+ uint32_t csts; /* controller status */
+ uint32_t reserved2;
+
+ /** admin queue attributes */
+ union aqa_register aqa;
+
+ uint64_t asq; /* admin submission queue base addr */
+ uint64_t acq; /* admin completion queue base addr */
+ uint32_t reserved3[0x3f2];
+
+ struct {
+ uint32_t sq_tdbl; /* submission queue tail doorbell */
+ uint32_t cq_hdbl; /* completion queue head doorbell */
+ } doorbell[1] __packed;
+} __packed;
+
+struct nvme_command
+{
+ /* dword 0 */
+ uint16_t opc : 8; /* opcode */
+ uint16_t fuse : 2; /* fused operation */
+ uint16_t rsvd1 : 6;
+ uint16_t cid; /* command identifier */
+
+ /* dword 1 */
+ uint32_t nsid; /* namespace identifier */
+
+ /* dword 2-3 */
+ uint32_t rsvd2;
+ uint32_t rsvd3;
+
+ /* dword 4-5 */
+ uint64_t mptr; /* metadata pointer */
+
+ /* dword 6-7 */
+ uint64_t prp1; /* prp entry 1 */
+
+ /* dword 8-9 */
+ uint64_t prp2; /* prp entry 2 */
+
+ /* dword 10-15 */
+ uint32_t cdw10; /* command-specific */
+ uint32_t cdw11; /* command-specific */
+ uint32_t cdw12; /* command-specific */
+ uint32_t cdw13; /* command-specific */
+ uint32_t cdw14; /* command-specific */
+ uint32_t cdw15; /* command-specific */
+} __packed;
+
+struct nvme_completion {
+
+ /* dword 0 */
+ uint32_t cdw0; /* command-specific */
+
+ /* dword 1 */
+ uint32_t rsvd1;
+
+ /* dword 2 */
+ uint16_t sqhd; /* submission queue head pointer */
+ uint16_t sqid; /* submission queue identifier */
+
+ /* dword 3 */
+ uint16_t cid; /* command identifier */
+ uint16_t p : 1; /* phase tag */
+ uint16_t sf_sc : 8; /* status field - status code */
+ uint16_t sf_sct : 3; /* status field - status code type */
+ uint16_t rsvd2 : 2;
+ uint16_t sf_m : 1; /* status field - more */
+ uint16_t sf_dnr : 1; /* status field - do not retry */
+} __packed;
+
+struct nvme_dsm_range {
+
+ uint32_t attributes;
+ uint32_t length;
+ uint64_t starting_lba;
+} __packed;
+
+/* status code types */
+enum nvme_status_code_type {
+ NVME_SCT_GENERIC = 0x0,
+ NVME_SCT_COMMAND_SPECIFIC = 0x1,
+ NVME_SCT_MEDIA_ERROR = 0x2,
+ /* 0x3-0x6 - reserved */
+ NVME_SCT_VENDOR_SPECIFIC = 0x7,
+};
+
+/* generic command status codes */
+enum nvme_generic_command_status_code {
+ NVME_SC_SUCCESS = 0x00,
+ NVME_SC_INVALID_OPCODE = 0x01,
+ NVME_SC_INVALID_FIELD = 0x02,
+ NVME_SC_COMMAND_ID_CONFLICT = 0x03,
+ NVME_SC_DATA_TRANSFER_ERROR = 0x04,
+ NVME_SC_ABORTED_POWER_LOSS = 0x05,
+ NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
+ NVME_SC_ABORTED_BY_REQUEST = 0x07,
+ NVME_SC_ABORTED_SQ_DELETION = 0x08,
+ NVME_SC_ABORTED_FAILED_FUSED = 0x09,
+ NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
+ NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
+
+ NVME_SC_LBA_OUT_OF_RANGE = 0x80,
+ NVME_SC_CAPACITY_EXCEEDED = 0x81,
+ NVME_SC_NAMESPACE_NOT_READY = 0x82,
+};
+
+/* command specific status codes */
+enum nvme_command_specific_status_code {
+ NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
+ NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
+ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
+ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
+ /* 0x04 - reserved */
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
+ NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
+ NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
+ NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
+ NVME_SC_INVALID_LOG_PAGE = 0x09,
+ NVME_SC_INVALID_FORMAT = 0x0a,
+ NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
+
+ NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
+ NVME_SC_INVALID_PROTECTION_INFO = 0x81,
+ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
+};
+
+/* media error status codes */
+enum nvme_media_error_status_code {
+ NVME_SC_WRITE_FAULTS = 0x80,
+ NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
+ NVME_SC_GUARD_CHECK_ERROR = 0x82,
+ NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
+ NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
+ NVME_SC_COMPARE_FAILURE = 0x85,
+ NVME_SC_ACCESS_DENIED = 0x86,
+};
+
+/* admin opcodes */
+enum nvme_admin_opcode {
+ NVME_OPC_DELETE_IO_SQ = 0x00,
+ NVME_OPC_CREATE_IO_SQ = 0x01,
+ NVME_OPC_GET_LOG_PAGE = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_DELETE_IO_CQ = 0x04,
+ NVME_OPC_CREATE_IO_CQ = 0x05,
+ NVME_OPC_IDENTIFY = 0x06,
+ /* 0x07 - reserved */
+ NVME_OPC_ABORT = 0x08,
+ NVME_OPC_SET_FEATURES = 0x09,
+ NVME_OPC_GET_FEATURES = 0x0a,
+ /* 0x0b - reserved */
+ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
+ /* 0x0d-0x0f - reserved */
+ NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
+ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
+
+ NVME_OPC_FORMAT_NVM = 0x80,
+ NVME_OPC_SECURITY_SEND = 0x81,
+ NVME_OPC_SECURITY_RECEIVE = 0x82,
+};
+
+/* nvme nvm opcodes */
+enum nvme_nvm_opcode {
+ NVME_OPC_FLUSH = 0x00,
+ NVME_OPC_WRITE = 0x01,
+ NVME_OPC_READ = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
+ NVME_OPC_COMPARE = 0x05,
+ /* 0x06-0x07 - reserved */
+ NVME_OPC_DATASET_MANAGEMENT = 0x09,
+};
+
+enum nvme_feature {
+ /* 0x00 - reserved */
+ NVME_FEAT_ARBITRATION = 0x01,
+ NVME_FEAT_POWER_MANAGEMENT = 0x02,
+ NVME_FEAT_LBA_RANGE_TYPE = 0x03,
+ NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
+ NVME_FEAT_ERROR_RECOVERY = 0x05,
+ NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
+ NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
+ NVME_FEAT_INTERRUPT_COALESCING = 0x08,
+ NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
+ NVME_FEAT_WRITE_ATOMICITY = 0x0A,
+ NVME_FEAT_ASYNCHRONOUS_EVENT_CONFIGURATION = 0x0B,
+ /* 0x0C-0x7F - reserved */
+ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
+ /* 0x81-0xBF - command set specific (reserved) */
+ /* 0xC0-0xFF - vendor specific */
+};
+
+enum nvme_dsm_attribute {
+ NVME_DSM_ATTR_INTEGRAL_READ = 0x1,
+ NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2,
+ NVME_DSM_ATTR_DEALLOCATE = 0x4,
+};
+
+struct nvme_controller_data {
+
+ /* bytes 0-255: controller capabilities and features */
+
+ /** pci vendor id */
+ uint16_t vid;
+
+ /** pci subsystem vendor id */
+ uint16_t ssvid;
+
+ /** serial number */
+ int8_t sn[20];
+
+ /** model number */
+ int8_t mn[40];
+
+ /** firmware revision */
+ uint8_t fr[8];
+
+ /** recommended arbitration burst */
+ uint8_t rab;
+
+ /** ieee oui identifier */
+ uint8_t ieee[3];
+
+ /** multi-interface capabilities */
+ uint8_t mic;
+
+ /** maximum data transfer size */
+ uint8_t mdts;
+
+ uint8_t reserved1[178];
+
+ /* bytes 256-511: admin command set attributes */
+
+ /** optional admin command support */
+ struct {
+ /* supports security send/receive commands */
+ uint16_t security : 1;
+
+ /* supports format nvm command */
+ uint16_t format : 1;
+
+ /* supports firmware activate/download commands */
+ uint16_t firmware : 1;
+
+ uint16_t oacs_rsvd : 13;
+ } __packed oacs;
+
+ /** abort command limit */
+ uint8_t acl;
+
+ /** asynchronous event request limit */
+ uint8_t aerl;
+
+ /** firmware updates */
+ struct {
+ /* first slot is read-only */
+ uint8_t slot1_ro : 1;
+
+ /* number of firmware slots */
+ uint8_t num_slots : 3;
+
+ uint8_t frmw_rsvd : 4;
+ } __packed frmw;
+
+ /** log page attributes */
+ struct {
+ /* per namespace smart/health log page */
+ uint8_t ns_smart : 1;
+
+ uint8_t lpa_rsvd : 7;
+ } __packed lpa;
+
+ /** error log page entries */
+ uint8_t elpe;
+
+ /** number of power states supported */
+ uint8_t npss;
+
+ /** admin vendor specific command configuration */
+ struct {
+ /* admin vendor specific commands use spec format */
+ uint8_t spec_format : 1;
+
+ uint8_t avscc_rsvd : 7;
+ } __packed avscc;
+
+ uint8_t reserved2[247];
+
+ /* bytes 512-703: nvm command set attributes */
+
+ /** submission queue entry size */
+ struct {
+ uint8_t min : 4;
+ uint8_t max : 4;
+ } __packed sqes;
+
+ /** completion queue entry size */
+ struct {
+ uint8_t min : 4;
+ uint8_t max : 4;
+ } __packed cqes;
+
+ uint8_t reserved3[2];
+
+ /** number of namespaces */
+ uint32_t nn;
+
+ /** optional nvm command support */
+ struct {
+ uint16_t compare : 1;
+ uint16_t write_unc : 1;
+ uint16_t dsm: 1;
+ uint16_t reserved: 13;
+ } __packed oncs;
+
+ /** fused operation support */
+ uint16_t fuses;
+
+ /** format nvm attributes */
+ uint8_t fna;
+
+ /** volatile write cache */
+ struct {
+ uint8_t present : 1;
+ uint8_t reserved : 7;
+ } __packed vwc;
+
+ /* TODO: flesh out remaining nvm command set attributes */
+ uint8_t reserved4[178];
+
+ /* bytes 704-2047: i/o command set attributes */
+ uint8_t reserved5[1344];
+
+ /* bytes 2048-3071: power state descriptors */
+ uint8_t reserved6[1024];
+
+ /* bytes 3072-4095: vendor specific */
+ uint8_t reserved7[1024];
+} __packed;
+
+struct nvme_namespace_data {
+
+ /** namespace size */
+ uint64_t nsze;
+
+ /** namespace capacity */
+ uint64_t ncap;
+
+ /** namespace utilization */
+ uint64_t nuse;
+
+ /** namespace features */
+ struct {
+ /** thin provisioning */
+ uint8_t thin_prov : 1;
+ uint8_t reserved1 : 7;
+ } __packed nsfeat;
+
+ /** number of lba formats */
+ uint8_t nlbaf;
+
+ /** formatted lba size */
+ struct {
+ uint8_t format : 4;
+ uint8_t extended : 1;
+ uint8_t reserved2 : 3;
+ } __packed flbas;
+
+ /** metadata capabilities */
+ struct {
+ /* metadata can be transferred as part of data prp list */
+ uint8_t extended : 1;
+
+ /* metadata can be transferred with separate metadata pointer */
+ uint8_t pointer : 1;
+
+ uint8_t reserved3 : 6;
+ } __packed mc;
+
+ /** end-to-end data protection capabilities */
+ struct {
+ /* protection information type 1 */
+ uint8_t pit1 : 1;
+
+ /* protection information type 2 */
+ uint8_t pit2 : 1;
+
+ /* protection information type 3 */
+ uint8_t pit3 : 1;
+
+ /* first eight bytes of metadata */
+ uint8_t md_start : 1;
+
+ /* last eight bytes of metadata */
+ uint8_t md_end : 1;
+ } __packed dpc;
+
+ /** end-to-end data protection type settings */
+ struct {
+ /* protection information type */
+ uint8_t pit : 3;
+
+ /* 1 == protection info transferred at start of metadata */
+ /* 0 == protection info transferred at end of metadata */
+ uint8_t md_start : 1;
+
+ uint8_t reserved4 : 4;
+ } __packed dps;
+
+ uint8_t reserved5[98];
+
+ /** lba format support */
+ struct {
+ /** metadata size */
+ uint32_t ms : 16;
+
+ /** lba data size */
+ uint32_t lbads : 8;
+
+ /** relative performance */
+ uint32_t rp : 2;
+
+ uint32_t reserved6 : 6;
+ } __packed lbaf[16];
+
+ uint8_t reserved6[192];
+
+ uint8_t vendor_specific[3712];
+};
+
+enum nvme_log_page {
+
+ /* 0x00 - reserved */
+ NVME_LOG_ERROR = 0x01,
+ NVME_LOG_HEALTH_INFORMATION = 0x02,
+ NVME_LOG_FIRMWARE_SLOT = 0x03,
+ /* 0x04-0x7F - reserved */
+ /* 0x80-0xBF - I/O command set specific */
+ /* 0xC0-0xFF - vendor specific */
+};
+
+union nvme_critical_warning_state {
+
+ uint8_t raw;
+
+ struct {
+ uint8_t available_spare : 1;
+ uint8_t temperature : 1;
+ uint8_t device_reliability : 1;
+ uint8_t read_only : 1;
+ uint8_t volatile_memory_backup : 1;
+ uint8_t reserved : 3;
+ } __packed bits;
+} __packed;
+
+struct nvme_health_information_page {
+
+ union nvme_critical_warning_state critical_warning;
+
+ uint16_t temperature;
+ uint8_t available_spare;
+ uint8_t available_spare_threshold;
+ uint8_t percentage_used;
+
+ uint8_t reserved[26];
+
+ /*
+ * Note that the following are 128-bit values, but are
+ * defined as an array of 2 64-bit values.
+ */
+ /* Data Units Read is always in 512-byte units. */
+ uint64_t data_units_read[2];
+ /* Data Units Written is always in 512-byte units. */
+ uint64_t data_units_written[2];
+ /* For NVM command set, this includes Compare commands. */
+ uint64_t host_read_commands[2];
+ uint64_t host_write_commands[2];
+ /* Controller Busy Time is reported in minutes. */
+ uint64_t controller_busy_time[2];
+ uint64_t power_cycles[2];
+ uint64_t power_on_hours[2];
+ uint64_t unsafe_shutdowns[2];
+ uint64_t media_errors[2];
+ uint64_t num_error_info_log_entries[2];
+
+ uint8_t reserved2[320];
+} __packed;
+
+#define NVME_TEST_MAX_THREADS 128
+
+struct nvme_io_test {
+
+ enum nvme_nvm_opcode opc;
+ uint32_t size;
+ uint32_t time; /* in seconds */
+ uint32_t num_threads;
+ uint32_t flags;
+ uint32_t io_completed[NVME_TEST_MAX_THREADS];
+};
+
+enum nvme_io_test_flags {
+
+ /*
+ * Specifies whether dev_refthread/dev_relthread should be
+ * called during NVME_BIO_TEST. Ignored for other test
+ * types.
+ */
+ NVME_TEST_FLAG_REFTHREAD = 0x1,
+};
+
+#ifdef _KERNEL
+
+struct bio;
+
+/* TODO: reassess this QD variable - it's a workaround for a Chatham2 issue */
+#define NVME_QD (200)
+
+struct nvme_namespace;
+struct nvme_consumer;
+
+typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
+typedef void (*nvme_consumer_cb_fn_t)(void *, struct nvme_namespace *);
+
+enum nvme_namespace_flags {
+ NVME_NS_DEALLOCATE_SUPPORTED = 0x1,
+ NVME_NS_FLUSH_SUPPORTED = 0x2,
+};
+
+/* NVM I/O functions */
+void nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload,
+ uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload,
+ uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
+ uint8_t num_ranges, nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+
+/* Registration functions */
+struct nvme_consumer * nvme_register_consumer(nvme_consumer_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_unregister_consumer(struct nvme_consumer *consumer);
+
+/* Namespace helper functions */
+uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
+const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
+const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
+
+int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
+ nvme_cb_fn_t cb_fn);
+
+#endif /* _KERNEL */
+
+#endif /* __NVME_H__ */
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
new file mode 100644
index 0000000..52317b4
--- /dev/null
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -0,0 +1,787 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/ioccom.h>
+#include <sys/smp.h>
+
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+
+#include "nvme_private.h"
+
+static void
+nvme_ctrlr_cb(void *arg, const struct nvme_completion *status)
+{
+ struct nvme_completion *cpl = arg;
+ struct mtx *mtx;
+
+ /*
+ * Copy status into the argument passed by the caller, so that
+ * the caller can check the status to determine if the
+ * request passed or failed.
+ */
+ memcpy(cpl, status, sizeof(*cpl));
+ mtx = mtx_pool_find(mtxpool_sleep, cpl);
+ mtx_lock(mtx);
+ wakeup(cpl);
+ mtx_unlock(mtx);
+}
+
+static int
+nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
+{
+
+ /* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
+ if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
+ ctrlr->resource_id = PCIR_BAR(2);
+ else
+ ctrlr->resource_id = PCIR_BAR(0);
+
+ ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
+ &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);
+
+ if(ctrlr->resource == NULL) {
+ device_printf(ctrlr->dev, "unable to allocate pci resource\n");
+ return (ENOMEM);
+ }
+
+ ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
+ ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
+ ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;
+
+ return (0);
+}
+
+#ifdef CHATHAM2
+static int
+nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
+{
+
+ ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
+ ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
+ SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
+ RF_ACTIVE);
+
+ if(ctrlr->chatham_resource == NULL) {
+ device_printf(ctrlr->dev, "unable to alloc pci resource\n");
+ return (ENOMEM);
+ }
+
+ ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
+ ctrlr->chatham_bus_handle =
+ rman_get_bushandle(ctrlr->chatham_resource);
+
+ return (0);
+}
+
+static void
+nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
+{
+ uint64_t reg1, reg2, reg3;
+ uint64_t temp1, temp2;
+ uint32_t temp3;
+ uint32_t use_flash_timings = 0;
+
+ DELAY(10000);
+
+ temp3 = chatham_read_4(ctrlr, 0x8080);
+
+ device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);
+
+ ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
+ ctrlr->chatham_size = ctrlr->chatham_lbas * 512;
+
+ device_printf(ctrlr->dev, "Chatham size: %lld\n",
+ (long long)ctrlr->chatham_size);
+
+ reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;
+
+ TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
+ if (use_flash_timings) {
+ device_printf(ctrlr->dev, "Chatham: using flash timings\n");
+ temp1 = 0x00001b58000007d0LL;
+ temp2 = 0x000000cb00000131LL;
+ } else {
+ device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
+ temp1 = temp2 = 0x0LL;
+ }
+
+ chatham_write_8(ctrlr, 0x8000, reg1);
+ chatham_write_8(ctrlr, 0x8008, reg2);
+ chatham_write_8(ctrlr, 0x8010, reg3);
+
+ chatham_write_8(ctrlr, 0x8020, temp1);
+ temp3 = chatham_read_4(ctrlr, 0x8020);
+
+ chatham_write_8(ctrlr, 0x8028, temp2);
+ temp3 = chatham_read_4(ctrlr, 0x8028);
+
+ chatham_write_8(ctrlr, 0x8030, temp1);
+ chatham_write_8(ctrlr, 0x8038, temp2);
+ chatham_write_8(ctrlr, 0x8040, temp1);
+ chatham_write_8(ctrlr, 0x8048, temp2);
+ chatham_write_8(ctrlr, 0x8050, temp1);
+ chatham_write_8(ctrlr, 0x8058, temp2);
+
+ DELAY(10000);
+}
+
+static void
+nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
+{
+ struct nvme_controller_data *cdata;
+
+ cdata = &ctrlr->cdata;
+
+ cdata->vid = 0x8086;
+ cdata->ssvid = 0x2011;
+
+ /*
+ * Chatham2 puts garbage data in these fields when we
+ * invoke IDENTIFY_CONTROLLER, so we need to re-zero
+ * the fields before calling bcopy().
+ */
+ memset(cdata->sn, 0, sizeof(cdata->sn));
+ memcpy(cdata->sn, "2012", strlen("2012"));
+ memset(cdata->mn, 0, sizeof(cdata->mn));
+ memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
+ memset(cdata->fr, 0, sizeof(cdata->fr));
+ memcpy(cdata->fr, "0", strlen("0"));
+ cdata->rab = 8;
+ cdata->aerl = 3;
+ cdata->lpa.ns_smart = 1;
+ cdata->sqes.min = 6;
+ cdata->sqes.max = 6;
+ cdata->cqes.min = 4;
+ cdata->cqes.max = 4;
+ cdata->nn = 1;
+
+ /* Chatham2 doesn't support DSM command */
+ cdata->oncs.dsm = 0;
+
+ cdata->vwc.present = 1;
+}
+#endif /* CHATHAM2 */
+
+static void
+nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
+{
+ struct nvme_qpair *qpair;
+ uint32_t num_entries;
+
+ qpair = &ctrlr->adminq;
+
+ num_entries = NVME_ADMIN_ENTRIES;
+ TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
+ /*
+ * If admin_entries was overridden to an invalid value, revert it
+ * back to our default value.
+ */
+ if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
+ num_entries > NVME_MAX_ADMIN_ENTRIES) {
+ printf("nvme: invalid hw.nvme.admin_entries=%d specified\n",
+ num_entries);
+ num_entries = NVME_ADMIN_ENTRIES;
+ }
+
+ /*
+ * The admin queue's max xfer size is treated differently than the
+ * max I/O xfer size. 16KB is sufficient here - maybe even less?
+ */
+ nvme_qpair_construct(qpair, 0, 0, num_entries, 16*1024, ctrlr);
+}
+
+static int
+nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
+{
+ struct nvme_qpair *qpair;
+ union cap_lo_register cap_lo;
+ int i, num_entries;
+
+ num_entries = NVME_IO_ENTRIES;
+ TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
+
+ num_entries = max(num_entries, NVME_MIN_IO_ENTRIES);
+
+ /*
+ * NVMe spec sets a hard limit of 64K max entries, but
+ * devices may specify a smaller limit, so we need to check
+ * the MQES field in the capabilities register.
+ */
+ cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
+ num_entries = min(num_entries, cap_lo.bits.mqes+1);
+
+ ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
+ TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
+ /*
+ * Check that tunable doesn't specify a size greater than what our
+ * driver supports, and is an even PAGE_SIZE multiple.
+ */
+ if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
+ ctrlr->max_xfer_size % PAGE_SIZE)
+ ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
+
+ ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
+ M_NVME, M_ZERO | M_NOWAIT);
+
+ if (ctrlr->ioq == NULL)
+ return (ENOMEM);
+
+ for (i = 0; i < ctrlr->num_io_queues; i++) {
+ qpair = &ctrlr->ioq[i];
+
+ /*
+ * Admin queue has ID=0. IO queues start at ID=1 -
+ * hence the 'i+1' here.
+ *
+ * For I/O queues, use the controller-wide max_xfer_size
+ * calculated in nvme_attach().
+ */
+ nvme_qpair_construct(qpair,
+ i+1, /* qpair ID */
+ ctrlr->msix_enabled ? i+1 : 0, /* vector */
+ num_entries,
+ ctrlr->max_xfer_size,
+ ctrlr);
+
+ if (ctrlr->per_cpu_io_queues)
+ bus_bind_intr(ctrlr->dev, qpair->res, i);
+ }
+
+ return (0);
+}
+
+static int
+nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
+{
+ int ms_waited;
+ union cc_register cc;
+ union csts_register csts;
+
+ cc.raw = nvme_mmio_read_4(ctrlr, cc);
+ csts.raw = nvme_mmio_read_4(ctrlr, csts);
+
+ if (!cc.bits.en) {
+ device_printf(ctrlr->dev, "%s called with cc.en = 0\n",
+ __func__);
+ return (ENXIO);
+ }
+
+ ms_waited = 0;
+
+ while (!csts.bits.rdy) {
+ DELAY(1000);
+ if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
+ device_printf(ctrlr->dev, "controller did not become "
+ "ready within %d ms\n", ctrlr->ready_timeout_in_ms);
+ return (ENXIO);
+ }
+ csts.raw = nvme_mmio_read_4(ctrlr, csts);
+ }
+
+ return (0);
+}
+
+static void
+nvme_ctrlr_disable(struct nvme_controller *ctrlr)
+{
+ union cc_register cc;
+ union csts_register csts;
+
+ cc.raw = nvme_mmio_read_4(ctrlr, cc);
+ csts.raw = nvme_mmio_read_4(ctrlr, csts);
+
+ if (cc.bits.en == 1 && csts.bits.rdy == 0)
+ nvme_ctrlr_wait_for_ready(ctrlr);
+
+ cc.bits.en = 0;
+ nvme_mmio_write_4(ctrlr, cc, cc.raw);
+ DELAY(5000);
+}
+
+static int
+nvme_ctrlr_enable(struct nvme_controller *ctrlr)
+{
+ union cc_register cc;
+ union csts_register csts;
+ union aqa_register aqa;
+
+ cc.raw = nvme_mmio_read_4(ctrlr, cc);
+ csts.raw = nvme_mmio_read_4(ctrlr, csts);
+
+ if (cc.bits.en == 1) {
+ if (csts.bits.rdy == 1)
+ return (0);
+ else
+ return (nvme_ctrlr_wait_for_ready(ctrlr));
+ }
+
+ nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
+ DELAY(5000);
+ nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
+ DELAY(5000);
+
+ aqa.raw = 0;
+ /* acqs and asqs are 0-based. */
+ aqa.bits.acqs = ctrlr->adminq.num_entries-1;
+ aqa.bits.asqs = ctrlr->adminq.num_entries-1;
+ nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
+ DELAY(5000);
+
+ cc.bits.en = 1;
+ cc.bits.css = 0;
+ cc.bits.ams = 0;
+ cc.bits.shn = 0;
+ cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
+ cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
+
+ /* This evaluates to 0, which is according to spec. */
+ cc.bits.mps = (PAGE_SIZE >> 13);
+
+ nvme_mmio_write_4(ctrlr, cc, cc.raw);
+ DELAY(5000);
+
+ return (nvme_ctrlr_wait_for_ready(ctrlr));
+}
+
+int
+nvme_ctrlr_reset(struct nvme_controller *ctrlr)
+{
+
+ nvme_ctrlr_disable(ctrlr);
+ return (nvme_ctrlr_enable(ctrlr));
+}
+
+static void
+nvme_async_event_cb(void *arg, const struct nvme_completion *status)
+{
+ struct nvme_controller *ctrlr = arg;
+
+ printf("Asynchronous event occurred.\n");
+
+ /* TODO: decode async event type based on status */
+ /* TODO: check status for any error bits */
+
+ /*
+ * Repost an asynchronous event request so that it can be
+ * used again by the controller.
+ */
+ nvme_ctrlr_cmd_asynchronous_event_request(ctrlr, nvme_async_event_cb,
+ ctrlr);
+}
+
+static int
+nvme_ctrlr_identify(struct nvme_controller *ctrlr)
+{
+ struct mtx *mtx;
+ struct nvme_completion cpl;
+ int status;
+
+ mtx = mtx_pool_find(mtxpool_sleep, &cpl);
+
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
+ nvme_ctrlr_cb, &cpl);
+ status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
+ mtx_unlock(mtx);
+ if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
+ printf("nvme_identify_controller failed!\n");
+ return (ENXIO);
+ }
+
+#ifdef CHATHAM2
+ if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
+ nvme_chatham_populate_cdata(ctrlr);
+#endif
+
+ return (0);
+}
+
+static int
+nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
+{
+ struct mtx *mtx;
+ struct nvme_completion cpl;
+ int cq_allocated, sq_allocated, status;
+
+ mtx = mtx_pool_find(mtxpool_sleep, &cpl);
+
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
+ nvme_ctrlr_cb, &cpl);
+ status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
+ mtx_unlock(mtx);
+ if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
+ printf("nvme_set_num_queues failed!\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Data in cdw0 is 0-based.
+ * Lower 16-bits indicate number of submission queues allocated.
+ * Upper 16-bits indicate number of completion queues allocated.
+ */
+ sq_allocated = (cpl.cdw0 & 0xFFFF) + 1;
+ cq_allocated = (cpl.cdw0 >> 16) + 1;
+
+ /*
+ * Check that the controller was able to allocate the number of
+ * queues we requested. If not, revert to one IO queue.
+ */
+ if (sq_allocated < ctrlr->num_io_queues ||
+ cq_allocated < ctrlr->num_io_queues) {
+ ctrlr->num_io_queues = 1;
+ ctrlr->per_cpu_io_queues = 0;
+
+ /* TODO: destroy extra queues that were created
+ * previously but now found to be not needed.
+ */
+ }
+
+ return (0);
+}
+
+static int
+nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
+{
+ struct mtx *mtx;
+ struct nvme_qpair *qpair;
+ struct nvme_completion cpl;
+ int i, status;
+
+ mtx = mtx_pool_find(mtxpool_sleep, &cpl);
+
+ for (i = 0; i < ctrlr->num_io_queues; i++) {
+ qpair = &ctrlr->ioq[i];
+
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
+ nvme_ctrlr_cb, &cpl);
+ status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
+ mtx_unlock(mtx);
+ if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
+ printf("nvme_create_io_cq failed!\n");
+ return (ENXIO);
+ }
+
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
+ nvme_ctrlr_cb, &cpl);
+ status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
+ mtx_unlock(mtx);
+ if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
+ printf("nvme_create_io_sq failed!\n");
+ return (ENXIO);
+ }
+ }
+
+ return (0);
+}
+
+static int
+nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
+{
+ struct nvme_namespace *ns;
+ int i, status;
+
+ for (i = 0; i < ctrlr->cdata.nn; i++) {
+ ns = &ctrlr->ns[i];
+ status = nvme_ns_construct(ns, i+1, ctrlr);
+ if (status != 0)
+ return (status);
+ }
+
+ return (0);
+}
+
+static void
+nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
+{
+ union nvme_critical_warning_state state;
+ uint8_t num_async_events;
+
+ state.raw = 0xFF;
+ state.bits.reserved = 0;
+ nvme_ctrlr_cmd_set_asynchronous_event_config(ctrlr, state, NULL, NULL);
+
+ /* aerl is a zero-based value, so we need to add 1 here. */
+ num_async_events = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
+
+ /*
+ * Disable this code for now, since Chatham doesn't support
+ * AERs so I have no good way to test them.
+ */
+#if 0
+ for (int i = 0; i < num_async_events; i++)
+ nvme_ctrlr_cmd_asynchronous_event_request(ctrlr,
+ nvme_async_event_cb, ctrlr);
+#endif
+}
+
+static void
+nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
+{
+
+ ctrlr->int_coal_time = 0;
+ TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
+ &ctrlr->int_coal_time);
+
+ ctrlr->int_coal_threshold = 0;
+ TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
+ &ctrlr->int_coal_threshold);
+
+ nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
+ ctrlr->int_coal_threshold, NULL, NULL);
+}
+
+void
+nvme_ctrlr_start(void *ctrlr_arg)
+{
+ struct nvme_controller *ctrlr = ctrlr_arg;
+
+ if (nvme_ctrlr_identify(ctrlr) != 0)
+ goto err;
+
+ if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
+ goto err;
+
+ if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
+ goto err;
+
+ if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
+ goto err;
+
+ nvme_ctrlr_configure_aer(ctrlr);
+ nvme_ctrlr_configure_int_coalescing(ctrlr);
+
+ ctrlr->is_started = TRUE;
+
+err:
+
+ /*
+ * Initialize sysctls, even if controller failed to start, to
+ * assist with debugging admin queue pair.
+ */
+ nvme_sysctl_initialize_ctrlr(ctrlr);
+ config_intrhook_disestablish(&ctrlr->config_hook);
+}
+
+static void
+nvme_ctrlr_intx_task(void *arg, int pending)
+{
+ struct nvme_controller *ctrlr = arg;
+
+ nvme_qpair_process_completions(&ctrlr->adminq);
+
+ if (ctrlr->ioq[0].cpl)
+ nvme_qpair_process_completions(&ctrlr->ioq[0]);
+
+ nvme_mmio_write_4(ctrlr, intmc, 1);
+}
+
+static void
+nvme_ctrlr_intx_handler(void *arg)
+{
+ struct nvme_controller *ctrlr = arg;
+
+ nvme_mmio_write_4(ctrlr, intms, 1);
+ taskqueue_enqueue_fast(ctrlr->taskqueue, &ctrlr->task);
+}
+
+static int
+nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
+{
+
+ ctrlr->num_io_queues = 1;
+ ctrlr->per_cpu_io_queues = 0;
+ ctrlr->rid = 0;
+ ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
+ &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);
+
+ if (ctrlr->res == NULL) {
+ device_printf(ctrlr->dev, "unable to allocate shared IRQ\n");
+ return (ENOMEM);
+ }
+
+ bus_setup_intr(ctrlr->dev, ctrlr->res,
+ INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
+ ctrlr, &ctrlr->tag);
+
+ if (ctrlr->tag == NULL) {
+ device_printf(ctrlr->dev,
+ "unable to setup legacy interrupt handler\n");
+ return (ENOMEM);
+ }
+
+ TASK_INIT(&ctrlr->task, 0, nvme_ctrlr_intx_task, ctrlr);
+ ctrlr->taskqueue = taskqueue_create_fast("nvme_taskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &ctrlr->taskqueue);
+ taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_NET,
+ "%s intx taskq", device_get_nameunit(ctrlr->dev));
+
+ return (0);
+}
+
+static int
+nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvme_controller *ctrlr;
+ struct nvme_completion cpl;
+ struct mtx *mtx;
+
+ ctrlr = cdev->si_drv1;
+
+ switch (cmd) {
+ case NVME_IDENTIFY_CONTROLLER:
+#ifdef CHATHAM2
+ /*
+ * Don't refresh data on Chatham, since Chatham returns
+ * garbage on IDENTIFY anyway.
+ */
+ if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
+ memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
+ break;
+ }
+#endif
+ /* Refresh data before returning to user. */
+ mtx = mtx_pool_find(mtxpool_sleep, &cpl);
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
+ nvme_ctrlr_cb, &cpl);
+ msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
+ mtx_unlock(mtx);
+ if (cpl.sf_sc || cpl.sf_sct)
+ return (ENXIO);
+ memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+static struct cdevsw nvme_ctrlr_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = 0,
+ .d_ioctl = nvme_ctrlr_ioctl
+};
+
+int
+nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
+{
+ union cap_lo_register cap_lo;
+ union cap_hi_register cap_hi;
+ int num_vectors, per_cpu_io_queues, status = 0;
+
+ ctrlr->dev = dev;
+ ctrlr->is_started = FALSE;
+
+ status = nvme_ctrlr_allocate_bar(ctrlr);
+
+ if (status != 0)
+ return (status);
+
+#ifdef CHATHAM2
+ if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
+ status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
+ if (status != 0)
+ return (status);
+ nvme_ctrlr_setup_chatham(ctrlr);
+ }
+#endif
+
+ /*
+ * Software emulators may set the doorbell stride to something
+ * other than zero, but this driver is not set up to handle that.
+ */
+ cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
+ if (cap_hi.bits.dstrd != 0)
+ return (ENXIO);
+
+ /* Get ready timeout value from controller, in units of 500ms. */
+ cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
+ ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;
+
+ per_cpu_io_queues = 1;
+ TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
+ ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;
+
+ if (ctrlr->per_cpu_io_queues)
+ ctrlr->num_io_queues = mp_ncpus;
+ else
+ ctrlr->num_io_queues = 1;
+
+ ctrlr->force_intx = 0;
+ TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);
+
+ ctrlr->msix_enabled = 1;
+
+ if (ctrlr->force_intx) {
+ ctrlr->msix_enabled = 0;
+ goto intx;
+ }
+
+ /* One vector per IO queue, plus one vector for admin queue. */
+ num_vectors = ctrlr->num_io_queues + 1;
+
+ if (pci_msix_count(dev) < num_vectors) {
+ ctrlr->msix_enabled = 0;
+ goto intx;
+ }
+
+ if (pci_alloc_msix(dev, &num_vectors) != 0)
+ ctrlr->msix_enabled = 0;
+
+intx:
+
+ if (!ctrlr->msix_enabled)
+ nvme_ctrlr_configure_intx(ctrlr);
+
+ nvme_ctrlr_construct_admin_qpair(ctrlr);
+
+ status = nvme_ctrlr_construct_io_qpairs(ctrlr);
+
+ if (status != 0)
+ return (status);
+
+ ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "nvme%d", device_get_unit(dev));
+
+ if (ctrlr->cdev == NULL)
+ return (ENXIO);
+
+ ctrlr->cdev->si_drv1 = (void *)ctrlr;
+
+ return (0);
+}
diff --git a/sys/dev/nvme/nvme_ctrlr_cmd.c b/sys/dev/nvme/nvme_ctrlr_cmd.c
new file mode 100644
index 0000000..c145975
--- /dev/null
+++ b/sys/dev/nvme/nvme_ctrlr_cmd.c
@@ -0,0 +1,312 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "nvme_private.h"
+
+void
+nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload,
+ nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg,
+ sizeof(struct nvme_controller_data), payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_IDENTIFY;
+
+ /*
+ * TODO: create an identify command data structure, which
+ * includes this CNS bit in cdw10.
+ */
+ cmd->cdw10 = 1;
+
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ tr->payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+}
+
+void
+nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint16_t nsid,
+ void *payload, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg,
+ sizeof(struct nvme_namespace_data), payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_IDENTIFY;
+
+ /*
+ * TODO: create an identify command data structure
+ */
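+ /* CNS is left at 0 in cdw10, which selects the namespace data structure for nsid. */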
+ cmd->nsid = nsid;
+
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ tr->payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+}
+
+void
+nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que, uint16_t vector, nvme_cb_fn_t cb_fn,
+ void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg, 0, NULL);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_CREATE_IO_CQ;
+
+ /*
+ * TODO: create a create io completion queue command data
+ * structure.
+ */
+ cmd->cdw10 = ((io_que->num_entries-1) << 16) | io_que->id;
+ /* 0x3 = interrupts enabled | physically contiguous */
+ cmd->cdw11 = (vector << 16) | 0x3;
+ cmd->prp1 = io_que->cpl_bus_addr;
+
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg, 0, NULL);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_CREATE_IO_SQ;
+
+ /*
+ * TODO: create a create io submission queue command data
+ * structure.
+ */
+ cmd->cdw10 = ((io_que->num_entries-1) << 16) | io_que->id;
+ /* 0x1 = physically contiguous */
+ cmd->cdw11 = (io_que->id << 16) | 0x1;
+ cmd->prp1 = io_que->cmd_bus_addr;
+
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg, 0, NULL);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_DELETE_IO_CQ;
+
+ /*
+ * TODO: create a delete io completion queue command data
+ * structure.
+ */
+ cmd->cdw10 = io_que->id;
+
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg, 0, NULL);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_DELETE_IO_SQ;
+
+ /*
+ * TODO: create a delete io submission queue command data
+ * structure.
+ */
+ cmd->cdw10 = io_que->id;
+
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature,
+ uint32_t cdw11, void *payload, uint32_t payload_size,
+ nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg,
+ payload_size, payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_SET_FEATURES;
+ cmd->cdw10 = feature;
+ cmd->cdw11 = cdw11;
+
+ if (payload_size > 0) {
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+ } else
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature,
+ uint32_t cdw11, void *payload, uint32_t payload_size,
+ nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg,
+ payload_size, payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_GET_FEATURES;
+ cmd->cdw10 = feature;
+ cmd->cdw11 = cdw11;
+
+ if (payload_size > 0) {
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+ } else
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_set_num_queues(struct nvme_controller *ctrlr,
+ uint32_t num_queues, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ uint32_t cdw11;
+
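+ /*
+ * For the Number of Queues feature, cdw11 carries the 0-based count of
+ * requested completion queues in bits 31:16 and submission queues in
+ * bits 15:0.
+ */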
+ cdw11 = ((num_queues - 1) << 16) | (num_queues - 1);
+ nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_NUMBER_OF_QUEUES, cdw11,
+ NULL, 0, cb_fn, cb_arg);
+}
+
+void
+nvme_ctrlr_cmd_set_asynchronous_event_config(struct nvme_controller *ctrlr,
+ union nvme_critical_warning_state state, nvme_cb_fn_t cb_fn,
+ void *cb_arg)
+{
+ uint32_t cdw11;
+
+ cdw11 = state.raw;
+ nvme_ctrlr_cmd_set_feature(ctrlr,
+ NVME_FEAT_ASYNCHRONOUS_EVENT_CONFIGURATION, cdw11, NULL, 0, cb_fn,
+ cb_arg);
+}
+
+void
+nvme_ctrlr_cmd_set_interrupt_coalescing(struct nvme_controller *ctrlr,
+ uint32_t microseconds, uint32_t threshold, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ uint32_t cdw11;
+
+ if ((microseconds/100) >= 0x100) {
+ KASSERT(FALSE, ("intr coal time > 255*100 microseconds\n"));
+ printf("invalid coal time %d, disabling\n", microseconds);
+ microseconds = 0;
+ threshold = 0;
+ }
+
+ if (threshold >= 0x100) {
+ KASSERT(FALSE, ("intr threshold > 255\n"));
+ printf("invalid threshold %d, disabling\n", threshold);
+ threshold = 0;
+ microseconds = 0;
+ }
+
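+ /* Aggregation time (100 microsecond units) in bits 15:8, threshold in bits 7:0. */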
+ cdw11 = ((microseconds/100) << 8) | threshold;
+ nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_INTERRUPT_COALESCING, cdw11,
+ NULL, 0, cb_fn, cb_arg);
+}
+
+void
+nvme_ctrlr_cmd_asynchronous_event_request(struct nvme_controller *ctrlr,
+ nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg, 0, NULL);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_ASYNC_EVENT_REQUEST;
+
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
+
+void
+nvme_ctrlr_cmd_get_health_information_page(struct nvme_controller *ctrlr,
+ uint32_t nsid, struct nvme_health_information_page *payload,
+ nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ctrlr, TRUE, cb_fn, cb_arg,
+ sizeof(*payload), payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_GET_LOG_PAGE;
+ cmd->nsid = nsid;
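+ /*
+ * cdw10 carries the 0-based number of dwords to transfer in bits 27:16
+ * and the log page identifier in bits 7:0.
+ */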
+ cmd->cdw10 = ((sizeof(*payload)/sizeof(uint32_t)) - 1) << 16;
+ cmd->cdw10 |= NVME_LOG_HEALTH_INFORMATION;
+
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ sizeof(*payload), nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+}
diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c
new file mode 100644
index 0000000..4ee9a7e
--- /dev/null
+++ b/sys/dev/nvme/nvme_ns.c
@@ -0,0 +1,364 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/ioccom.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+
+#include <dev/pci/pcivar.h>
+
+#include "nvme_private.h"
+
+static void
+nvme_ns_cb(void *arg, const struct nvme_completion *status)
+{
+ struct nvme_completion *cpl = arg;
+ struct mtx *mtx;
+
+ /*
+ * Copy status into the argument passed by the caller, so that
+ * the caller can check the status to determine if the
+ * request passed or failed.
+ */
+ memcpy(cpl, status, sizeof(*cpl));
+ mtx = mtx_pool_find(mtxpool_sleep, cpl);
+ mtx_lock(mtx);
+ wakeup(cpl);
+ mtx_unlock(mtx);
+}
+
+static int
+nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ struct nvme_namespace *ns;
+ struct nvme_controller *ctrlr;
+ struct nvme_completion cpl;
+ struct mtx *mtx;
+
+ ns = cdev->si_drv1;
+ ctrlr = ns->ctrlr;
+
+ switch (cmd) {
+ case NVME_IDENTIFY_NAMESPACE:
+#ifdef CHATHAM2
+ /*
+ * Don't refresh data on Chatham, since Chatham returns
+ * garbage on IDENTIFY anyway.
+ */
+ if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
+ memcpy(arg, &ns->data, sizeof(ns->data));
+ break;
+ }
+#endif
+ /* Refresh data before returning to user. */
+ mtx = mtx_pool_find(mtxpool_sleep, &cpl);
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_identify_namespace(ctrlr, ns->id, &ns->data,
+ nvme_ns_cb, &cpl);
+ msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
+ mtx_unlock(mtx);
+ if (cpl.sf_sc || cpl.sf_sct)
+ return (ENXIO);
+ memcpy(arg, &ns->data, sizeof(ns->data));
+ break;
+ case NVME_IO_TEST:
+ case NVME_BIO_TEST:
+ nvme_ns_test(ns, cmd, arg);
+ break;
+ case DIOCGMEDIASIZE:
+ *(off_t *)arg = (off_t)nvme_ns_get_size(ns);
+ break;
+ case DIOCGSECTORSIZE:
+ *(u_int *)arg = nvme_ns_get_sector_size(ns);
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+static int
+nvme_ns_open(struct cdev *dev __unused, int flags, int fmt __unused,
+ struct thread *td)
+{
+ int error = 0;
+
+ if (flags & FWRITE)
+ error = securelevel_gt(td->td_ucred, 0);
+
+ return (error);
+}
+
+static int
+nvme_ns_close(struct cdev *dev __unused, int flags, int fmt __unused,
+ struct thread *td)
+{
+
+ return (0);
+}
+
+static void
+nvme_ns_strategy_done(void *arg, const struct nvme_completion *status)
+{
+ struct bio *bp = arg;
+
+ /*
+ * TODO: add more extensive translation of NVMe status codes
+ * to different bio error codes (i.e. EIO, EINVAL, etc.)
+ */
+ if (status->sf_sc || status->sf_sct) {
+ bp->bio_error = EIO;
+ bp->bio_flags |= BIO_ERROR;
+ bp->bio_resid = bp->bio_bcount;
+ } else
+ bp->bio_resid = 0;
+
+ biodone(bp);
+}
+
+static void
+nvme_ns_strategy(struct bio *bp)
+{
+ struct nvme_namespace *ns;
+ int err;
+
+ ns = bp->bio_dev->si_drv1;
+ err = nvme_ns_bio_process(ns, bp, nvme_ns_strategy_done);
+
+ if (err) {
+ bp->bio_error = EIO;
+ bp->bio_flags |= BIO_ERROR;
+ bp->bio_resid = bp->bio_bcount;
+ biodone(bp);
+ }
+}
+
+static struct cdevsw nvme_ns_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_DISK,
+ .d_open = nvme_ns_open,
+ .d_close = nvme_ns_close,
+ .d_read = nvme_ns_physio,
+ .d_write = nvme_ns_physio,
+ .d_strategy = nvme_ns_strategy,
+ .d_ioctl = nvme_ns_ioctl
+};
+
+uint32_t
+nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns)
+{
+ return ns->ctrlr->max_xfer_size;
+}
+
+uint32_t
+nvme_ns_get_sector_size(struct nvme_namespace *ns)
+{
+ return (1 << ns->data.lbaf[0].lbads);
+}
+
+uint64_t
+nvme_ns_get_num_sectors(struct nvme_namespace *ns)
+{
+ return (ns->data.nsze);
+}
+
+uint64_t
+nvme_ns_get_size(struct nvme_namespace *ns)
+{
+ return (nvme_ns_get_num_sectors(ns) * nvme_ns_get_sector_size(ns));
+}
+
+uint32_t
+nvme_ns_get_flags(struct nvme_namespace *ns)
+{
+ return (ns->flags);
+}
+
+const char *
+nvme_ns_get_serial_number(struct nvme_namespace *ns)
+{
+ return ((const char *)ns->ctrlr->cdata.sn);
+}
+
+const char *
+nvme_ns_get_model_number(struct nvme_namespace *ns)
+{
+ return ((const char *)ns->ctrlr->cdata.mn);
+}
+
+static void
+nvme_ns_bio_done(void *arg, const struct nvme_completion *status)
+{
+ struct bio *bp = arg;
+ nvme_cb_fn_t bp_cb_fn;
+
+ bp_cb_fn = bp->bio_driver1;
+
+ if (bp->bio_driver2)
+ free(bp->bio_driver2, M_NVME);
+
+ bp_cb_fn(bp, status);
+}
+
+int
+nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
+ nvme_cb_fn_t cb_fn)
+{
+ struct nvme_dsm_range *dsm_range;
+
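+ /* Stash the caller's callback in bio_driver1; nvme_ns_bio_done retrieves and calls it. */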
+ bp->bio_driver1 = cb_fn;
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ nvme_ns_cmd_read(ns, bp->bio_data,
+ bp->bio_offset/nvme_ns_get_sector_size(ns),
+ bp->bio_bcount/nvme_ns_get_sector_size(ns),
+ nvme_ns_bio_done, bp);
+ break;
+ case BIO_WRITE:
+ nvme_ns_cmd_write(ns, bp->bio_data,
+ bp->bio_offset/nvme_ns_get_sector_size(ns),
+ bp->bio_bcount/nvme_ns_get_sector_size(ns),
+ nvme_ns_bio_done, bp);
+ break;
+ case BIO_FLUSH:
+ nvme_ns_cmd_flush(ns, nvme_ns_bio_done, bp);
+ break;
+ case BIO_DELETE:
+ /*
+ * Note: Chatham2 doesn't support DSM, so this code
+ * can't be fully tested yet.
+ */
+ dsm_range =
+ malloc(sizeof(struct nvme_dsm_range), M_NVME,
+ M_ZERO | M_NOWAIT);
+ dsm_range->length =
+ bp->bio_bcount/nvme_ns_get_sector_size(ns);
+ dsm_range->starting_lba =
+ bp->bio_offset/nvme_ns_get_sector_size(ns);
+ bp->bio_driver2 = dsm_range;
+ nvme_ns_cmd_deallocate(ns, dsm_range, 1, nvme_ns_bio_done, bp);
+ break;
+ default:
+ return (EIO);
+ }
+
+ return (0);
+}
+
+#ifdef CHATHAM2
+static void
+nvme_ns_populate_chatham_data(struct nvme_namespace *ns)
+{
+ struct nvme_controller *ctrlr;
+ struct nvme_namespace_data *nsdata;
+
+ ctrlr = ns->ctrlr;
+ nsdata = &ns->data;
+
+ nsdata->nsze = ctrlr->chatham_lbas;
+ nsdata->ncap = ctrlr->chatham_lbas;
+ nsdata->nuse = ctrlr->chatham_lbas;
+
+ /* Chatham2 doesn't support thin provisioning. */
+ nsdata->nsfeat.thin_prov = 0;
+
+ /* Set LBA size to 512 bytes. */
+ nsdata->lbaf[0].lbads = 9;
+}
+#endif /* CHATHAM2 */
+
+int
+nvme_ns_construct(struct nvme_namespace *ns, uint16_t id,
+ struct nvme_controller *ctrlr)
+{
+ struct nvme_completion cpl;
+ struct mtx *mtx;
+ int status;
+
+ ns->ctrlr = ctrlr;
+ ns->id = id;
+
+#ifdef CHATHAM2
+ if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
+ nvme_ns_populate_chatham_data(ns);
+ else {
+#endif
+ mtx = mtx_pool_find(mtxpool_sleep, &cpl);
+
+ mtx_lock(mtx);
+ nvme_ctrlr_cmd_identify_namespace(ctrlr, id, &ns->data,
+ nvme_ns_cb, &cpl);
+ status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
+ mtx_unlock(mtx);
+ if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
+ printf("nvme_identify_namespace failed!\n");
+ return (ENXIO);
+ }
+#ifdef CHATHAM2
+ }
+#endif
+
+ if (ctrlr->cdata.oncs.dsm && ns->data.nsfeat.thin_prov)
+ ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
+
+ if (ctrlr->cdata.vwc.present)
+ ns->flags |= NVME_NS_FLUSH_SUPPORTED;
+
+/*
+ * MAKEDEV_ETERNAL was added in r210923, for cdevs that will never
+ * be destroyed. This avoids refcounting on the cdev object.
+ * That should be OK here, as long as we're not supporting PCIe
+ * surprise removal or namespace deletion.
+ */
+#ifdef MAKEDEV_ETERNAL_KLD
+ ns->cdev = make_dev_credf(MAKEDEV_ETERNAL_KLD, &nvme_ns_cdevsw, 0,
+ NULL, UID_ROOT, GID_WHEEL, 0600, "nvme%dns%d",
+ device_get_unit(ctrlr->dev), ns->id);
+#else
+ ns->cdev = make_dev_credf(0, &nvme_ns_cdevsw, 0,
+ NULL, UID_ROOT, GID_WHEEL, 0600, "nvme%dns%d",
+ device_get_unit(ctrlr->dev), ns->id);
+#endif
+
+ if (ns->cdev) {
+ ns->cdev->si_drv1 = ns;
+ }
+
+ return (0);
+}
diff --git a/sys/dev/nvme/nvme_ns_cmd.c b/sys/dev/nvme/nvme_ns_cmd.c
new file mode 100644
index 0000000..23d6f96
--- /dev/null
+++ b/sys/dev/nvme/nvme_ns_cmd.c
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "nvme_private.h"
+
+void
+nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba,
+ uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ns->ctrlr, FALSE, cb_fn, cb_arg,
+ lba_count*512, payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_READ;
+ cmd->nsid = ns->id;
+
+ /* TODO: create a read command data structure */
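+ /* cdw10/cdw11 hold the 64-bit starting LBA; cdw12 bits 15:0 are the 0-based block count. */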
+ *(uint64_t *)&cmd->cdw10 = lba;
+ cmd->cdw12 = lba_count-1;
+
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ tr->payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+}
+
+void
+nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba,
+ uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ns->ctrlr, FALSE, cb_fn, cb_arg,
+ lba_count*512, payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_WRITE;
+ cmd->nsid = ns->id;
+
+ /* TODO: create a write command data structure */
+ *(uint64_t *)&cmd->cdw10 = lba;
+ cmd->cdw12 = lba_count-1;
+
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ tr->payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+}
+
+void
+nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
+ uint8_t num_ranges, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err;
+
+ tr = nvme_allocate_tracker(ns->ctrlr, FALSE, cb_fn, cb_arg,
+ num_ranges * sizeof(struct nvme_dsm_range), payload);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd->nsid = ns->id;
+
+ /* TODO: create a dataset management command data structure */
+ cmd->cdw10 = num_ranges - 1; /* NR is a 0-based value */
+ cmd->cdw11 = NVME_DSM_ATTR_DEALLOCATE;
+
+ err = bus_dmamap_load(tr->qpair->dma_tag, tr->dma_map, payload,
+ tr->payload_size, nvme_payload_map, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load returned non-zero!\n"));
+}
+
+void
+nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+
+ tr = nvme_allocate_tracker(ns->ctrlr, FALSE, cb_fn, cb_arg, 0, NULL);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_FLUSH;
+ cmd->nsid = ns->id;
+
+ nvme_qpair_submit_cmd(tr->qpair, tr);
+}
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
new file mode 100644
index 0000000..667238b
--- /dev/null
+++ b/sys/dev/nvme/nvme_private.h
@@ -0,0 +1,369 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __NVME_PRIVATE_H__
+#define __NVME_PRIVATE_H__
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/rman.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+
+#include <machine/bus.h>
+
+#include "nvme.h"
+
+#define DEVICE2SOFTC(dev) ((struct nvme_controller *) device_get_softc(dev))
+
+MALLOC_DECLARE(M_NVME);
+
+#define CHATHAM2
+
+#ifdef CHATHAM2
+#define CHATHAM_PCI_ID 0x20118086
+#define CHATHAM_CONTROL_BAR 0
+#endif
+
+#define IDT_PCI_ID 0x80d0111d
+
+#define NVME_MAX_PRP_LIST_ENTRIES (128)
+
+/*
+ * For commands requiring more than 2 PRP entries, one PRP will be
+ * embedded in the command (prp1), and the rest of the PRP entries
+ * will be in a list pointed to by the command (prp2). This means
+ * that real max number of PRP entries we support is 128+1, which
+ * results in a max xfer size of 128*PAGE_SIZE.
+ */
+#define NVME_MAX_XFER_SIZE (NVME_MAX_PRP_LIST_ENTRIES * PAGE_SIZE)
+
+#define NVME_ADMIN_ENTRIES (128)
+/* min and max are defined in admin queue attributes section of spec */
+#define NVME_MIN_ADMIN_ENTRIES (2)
+#define NVME_MAX_ADMIN_ENTRIES (4096)
+
+#define NVME_IO_ENTRIES (1024)
+/* min is a reasonable value picked for the nvme(4) driver */
+#define NVME_MIN_IO_ENTRIES (128)
+/*
+ * NVME_MAX_IO_ENTRIES is not defined, since it is specified in CC.MQES
+ * for each controller.
+ */
+
+#define NVME_INT_COAL_TIME (0) /* disabled */
+#define NVME_INT_COAL_THRESHOLD (0) /* 0-based */
+
+#define NVME_MAX_NAMESPACES (16)
+#define NVME_MAX_CONSUMERS (2)
+#define NVME_MAX_ASYNC_EVENTS (4)
+
+#define NVME_TIMEOUT_IN_SEC (30)
+
+struct nvme_prp_list {
+ uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
+ SLIST_ENTRY(nvme_prp_list) slist;
+ bus_addr_t bus_addr;
+ bus_dmamap_t dma_map;
+};
+
+struct nvme_tracker {
+
+ SLIST_ENTRY(nvme_tracker) slist;
+ struct nvme_qpair *qpair;
+ struct nvme_command cmd;
+ struct callout timer;
+ bus_dmamap_t dma_map;
+ nvme_cb_fn_t cb_fn;
+ void *cb_arg;
+ uint32_t payload_size;
+ struct nvme_prp_list *prp_list;
+ uint16_t cid;
+};
+
+struct nvme_qpair {
+
+ struct nvme_controller *ctrlr;
+ uint32_t id;
+ uint32_t phase;
+
+ uint16_t vector;
+ int rid;
+ struct resource *res;
+ void *tag;
+
+ uint32_t max_xfer_size;
+ uint32_t num_entries;
+ uint32_t sq_tdbl_off;
+ uint32_t cq_hdbl_off;
+
+ uint32_t sq_head;
+ uint32_t sq_tail;
+ uint32_t cq_head;
+
+ int64_t num_cmds;
+
+ struct mtx lock;
+
+ struct nvme_command *cmd;
+ struct nvme_completion *cpl;
+
+ bus_dma_tag_t dma_tag;
+
+ bus_dmamap_t cmd_dma_map;
+ uint64_t cmd_bus_addr;
+
+ bus_dmamap_t cpl_dma_map;
+ uint64_t cpl_bus_addr;
+
+ uint32_t num_tr;
+ uint32_t num_prp_list;
+
+ SLIST_HEAD(, nvme_tracker) free_tr;
+
+ struct nvme_tracker **act_tr;
+
+ SLIST_HEAD(, nvme_prp_list) free_prp_list;
+};
+
+struct nvme_namespace {
+
+ struct nvme_controller *ctrlr;
+ struct nvme_namespace_data data;
+ uint16_t id;
+ uint16_t flags;
+ struct cdev *cdev;
+};
+
+/*
+ * One of these per allocated PCI device.
+ */
+struct nvme_controller {
+
+ device_t dev;
+
+ uint32_t ready_timeout_in_ms;
+
+ bus_space_tag_t bus_tag;
+ bus_space_handle_t bus_handle;
+ int resource_id;
+ struct resource *resource;
+
+#ifdef CHATHAM2
+ bus_space_tag_t chatham_bus_tag;
+ bus_space_handle_t chatham_bus_handle;
+ int chatham_resource_id;
+ struct resource *chatham_resource;
+#endif
+
+ uint32_t msix_enabled;
+ uint32_t force_intx;
+
+ uint32_t num_io_queues;
+ boolean_t per_cpu_io_queues;
+
+ /* Fields for tracking progress during controller initialization. */
+ struct intr_config_hook config_hook;
+ uint32_t ns_identified;
+ uint32_t queues_created;
+
+ /* For shared legacy interrupt. */
+ int rid;
+ struct resource *res;
+ void *tag;
+ struct task task;
+ struct taskqueue *taskqueue;
+
+ bus_dma_tag_t hw_desc_tag;
+ bus_dmamap_t hw_desc_map;
+
+ /** maximum i/o size in bytes */
+ uint32_t max_xfer_size;
+
+ /** interrupt coalescing time period (in microseconds) */
+ uint32_t int_coal_time;
+
+ /** interrupt coalescing threshold */
+ uint32_t int_coal_threshold;
+
+ struct nvme_qpair adminq;
+ struct nvme_qpair *ioq;
+
+ struct nvme_registers *regs;
+
+ struct nvme_controller_data cdata;
+ struct nvme_namespace ns[NVME_MAX_NAMESPACES];
+
+ struct cdev *cdev;
+
+ boolean_t is_started;
+
+#ifdef CHATHAM2
+ uint64_t chatham_size;
+ uint64_t chatham_lbas;
+#endif
+};
+
+#define nvme_mmio_offsetof(reg) \
+ offsetof(struct nvme_registers, reg)
+
+#define nvme_mmio_read_4(sc, reg) \
+ bus_space_read_4((sc)->bus_tag, (sc)->bus_handle, \
+ nvme_mmio_offsetof(reg))
+
+#define nvme_mmio_write_4(sc, reg, val) \
+ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \
+ nvme_mmio_offsetof(reg), val)
+
+#define nvme_mmio_write_8(sc, reg, val) \
+ do { \
+ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \
+ nvme_mmio_offsetof(reg), val & 0xFFFFFFFF); \
+ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \
+ nvme_mmio_offsetof(reg)+4, \
+ (val & 0xFFFFFFFF00000000UL) >> 32); \
+ } while (0)
+
+#ifdef CHATHAM2
+#define chatham_read_4(softc, reg) \
+ bus_space_read_4((softc)->chatham_bus_tag, \
+ (softc)->chatham_bus_handle, reg)
+
+#define chatham_write_8(sc, reg, val) \
+ do { \
+ bus_space_write_4((sc)->chatham_bus_tag, \
+ (sc)->chatham_bus_handle, reg, val & 0xffffffff); \
+ bus_space_write_4((sc)->chatham_bus_tag, \
+ (sc)->chatham_bus_handle, reg+4, \
+ (val & 0xFFFFFFFF00000000UL) >> 32); \
+ } while (0)
+
+#endif /* CHATHAM2 */
+
+#if __FreeBSD_version < 800054
+#define wmb() __asm volatile("sfence" ::: "memory")
+#define mb() __asm volatile("mfence" ::: "memory")
+#endif
+
+void nvme_ns_test(struct nvme_namespace *ns, u_long cmd, caddr_t arg);
+
+void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
+ uint8_t feature, uint32_t cdw11,
+ void *payload, uint32_t payload_size,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr,
+ uint8_t feature, uint32_t cdw11,
+ void *payload, uint32_t payload_size,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr,
+ void *payload,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr,
+ uint16_t nsid, void *payload,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_set_interrupt_coalescing(struct nvme_controller *ctrlr,
+ uint32_t microseconds,
+ uint32_t threshold,
+ nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_ctrlr_cmd_get_health_information_page(struct nvme_controller *ctrlr,
+ uint32_t nsid,
+ struct nvme_health_information_page *payload,
+ nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que, uint16_t vector,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr,
+ struct nvme_qpair *io_que,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_set_num_queues(struct nvme_controller *ctrlr,
+ uint32_t num_queues, nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+void nvme_ctrlr_cmd_set_asynchronous_event_config(struct nvme_controller *ctrlr,
+ union nvme_critical_warning_state state,
+ nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_asynchronous_event_request(struct nvme_controller *ctrlr,
+ nvme_cb_fn_t cb_fn,
+ void *cb_arg);
+
+struct nvme_tracker * nvme_allocate_tracker(struct nvme_controller *ctrlr,
+ boolean_t is_admin,
+ nvme_cb_fn_t cb_fn, void *cb_arg,
+ uint32_t payload_size,
+ void *payload);
+void nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg,
+ int error);
+
+int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev);
+int nvme_ctrlr_reset(struct nvme_controller *ctrlr);
+/* ctrlr defined as void * to allow use with config_intrhook. */
+void nvme_ctrlr_start(void *ctrlr_arg);
+
+void nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
+ uint16_t vector, uint32_t num_entries,
+ uint32_t max_xfer_size,
+ struct nvme_controller *ctrlr);
+void nvme_qpair_submit_cmd(struct nvme_qpair *qpair,
+ struct nvme_tracker *tr);
+void nvme_qpair_process_completions(struct nvme_qpair *qpair);
+struct nvme_tracker * nvme_qpair_allocate_tracker(struct nvme_qpair *qpair,
+ boolean_t alloc_prp_list);
+
+void nvme_admin_qpair_destroy(struct nvme_qpair *qpair);
+
+void nvme_io_qpair_destroy(struct nvme_qpair *qpair);
+
+int nvme_ns_construct(struct nvme_namespace *ns, uint16_t id,
+ struct nvme_controller *ctrlr);
+
+int nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag);
+
+void nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr);
+
+void nvme_dump_command(struct nvme_command *cmd);
+void nvme_dump_completion(struct nvme_completion *cpl);
+
+static __inline void
+nvme_single_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
+{
+ uint64_t *bus_addr = (uint64_t *)arg;
+
+ *bus_addr = seg[0].ds_addr;
+}
+
+#endif /* __NVME_PRIVATE_H__ */
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
new file mode 100644
index 0000000..a49a702
--- /dev/null
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -0,0 +1,422 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+
+#include "nvme_private.h"
+
+static boolean_t
+nvme_completion_check_retry(const struct nvme_completion *cpl)
+{
+ /*
+ * TODO: the spec is not clear on how commands that are aborted due
+ *  to TLER will be marked, so for now NAMESPACE_NOT_READY seems to be
+ *  the only case where we should look at the DNR bit.
+ */
+ switch (cpl->sf_sct) {
+ case NVME_SCT_GENERIC:
+ switch (cpl->sf_sc) {
+ case NVME_SC_NAMESPACE_NOT_READY:
+ if (cpl->sf_dnr)
+ return (0);
+ else
+ return (1);
+ case NVME_SC_INVALID_OPCODE:
+ case NVME_SC_INVALID_FIELD:
+ case NVME_SC_COMMAND_ID_CONFLICT:
+ case NVME_SC_DATA_TRANSFER_ERROR:
+ case NVME_SC_ABORTED_POWER_LOSS:
+ case NVME_SC_INTERNAL_DEVICE_ERROR:
+ case NVME_SC_ABORTED_BY_REQUEST:
+ case NVME_SC_ABORTED_SQ_DELETION:
+ case NVME_SC_ABORTED_FAILED_FUSED:
+ case NVME_SC_ABORTED_MISSING_FUSED:
+ case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
+ case NVME_SC_COMMAND_SEQUENCE_ERROR:
+ case NVME_SC_LBA_OUT_OF_RANGE:
+ case NVME_SC_CAPACITY_EXCEEDED:
+ default:
+ return (0);
+ }
+ case NVME_SCT_COMMAND_SPECIFIC:
+ case NVME_SCT_MEDIA_ERROR:
+ case NVME_SCT_VENDOR_SPECIFIC:
+ default:
+ return (0);
+ }
+}
+
+struct nvme_tracker *
+nvme_qpair_allocate_tracker(struct nvme_qpair *qpair, boolean_t alloc_prp_list)
+{
+ struct nvme_tracker *tr;
+ struct nvme_prp_list *prp_list;
+
+ mtx_lock(&qpair->lock);
+
+ tr = SLIST_FIRST(&qpair->free_tr);
+ if (tr == NULL) {
+ /* TODO: fail if malloc returns NULL */
+ tr = malloc(sizeof(struct nvme_tracker), M_NVME,
+ M_ZERO | M_NOWAIT);
+
+ bus_dmamap_create(qpair->dma_tag, 0, &tr->dma_map);
+ callout_init_mtx(&tr->timer, &qpair->lock, 0);
+ tr->cid = qpair->num_tr++;
+ } else
+ SLIST_REMOVE_HEAD(&qpair->free_tr, slist);
+
+ if (alloc_prp_list) {
+ prp_list = SLIST_FIRST(&qpair->free_prp_list);
+
+ if (prp_list == NULL) {
+ prp_list = malloc(sizeof(struct nvme_prp_list),
+ M_NVME, M_ZERO | M_NOWAIT);
+
+ bus_dmamap_create(qpair->dma_tag, 0, &prp_list->dma_map);
+
+ bus_dmamap_load(qpair->dma_tag, prp_list->dma_map,
+ prp_list->prp, sizeof(struct nvme_prp_list),
+ nvme_single_map, &prp_list->bus_addr, 0);
+
+ qpair->num_prp_list++;
+ } else {
+ SLIST_REMOVE_HEAD(&qpair->free_prp_list, slist);
+ }
+
+ tr->prp_list = prp_list;
+ }
+
+ return (tr);
+}
+
+void
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+ struct nvme_tracker *tr;
+ struct nvme_completion *cpl;
+ boolean_t retry, error;
+
+ while (1) {
+ cpl = &qpair->cpl[qpair->cq_head];
+
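+ /* An entry whose phase bit does not match ours has not been posted by hardware yet. */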
+ if (cpl->p != qpair->phase)
+ break;
+
+ tr = qpair->act_tr[cpl->cid];
+ KASSERT(tr,
+ ("completion queue has entries but no active trackers\n"));
+
+ error = cpl->sf_sc || cpl->sf_sct;
+ retry = error && nvme_completion_check_retry(cpl);
+
+ if (error) {
+ nvme_dump_completion(cpl);
+ nvme_dump_command(&tr->cmd);
+ }
+
+ qpair->act_tr[cpl->cid] = NULL;
+
+ KASSERT(cpl->cid == tr->cmd.cid,
+ ("cpl cid does not match cmd cid\n"));
+
+ if (tr->cb_fn && !retry)
+ tr->cb_fn(tr->cb_arg, cpl);
+
+ qpair->sq_head = cpl->sqhd;
+
+ mtx_lock(&qpair->lock);
+ callout_stop(&tr->timer);
+
+ if (retry)
+ /* nvme_qpair_submit_cmd() will release the lock. */
+ nvme_qpair_submit_cmd(qpair, tr);
+ else {
+ if (tr->prp_list) {
+ SLIST_INSERT_HEAD(&qpair->free_prp_list,
+ tr->prp_list, slist);
+ tr->prp_list = NULL;
+ }
+
+ if (tr->payload_size > 0)
+ bus_dmamap_unload(qpair->dma_tag, tr->dma_map);
+
+ SLIST_INSERT_HEAD(&qpair->free_tr, tr, slist);
+
+ mtx_unlock(&qpair->lock);
+ }
+
+ if (++qpair->cq_head == qpair->num_entries) {
+ qpair->cq_head = 0;
+ qpair->phase = !qpair->phase;
+ }
+
+ nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl,
+ qpair->cq_head);
+ }
+}
+
+static void
+nvme_qpair_msix_handler(void *arg)
+{
+ struct nvme_qpair *qpair = arg;
+
+ nvme_qpair_process_completions(qpair);
+}
+
+void
+nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id,
+ uint16_t vector, uint32_t num_entries, uint32_t max_xfer_size,
+ struct nvme_controller *ctrlr)
+{
+
+ qpair->id = id;
+ qpair->vector = vector;
+ qpair->num_entries = num_entries;
+ qpair->max_xfer_size = max_xfer_size;
+ qpair->ctrlr = ctrlr;
+
+ /*
+ * The first time through the completion queue, HW will set the phase
+ * bit on completions to 1, so set the expected phase to 1 here to
+ * know which entries have completed.  We'll toggle the expected phase
+ * each time the completion queue rolls over.
+ */
+ qpair->phase = 1;
+
+ if (ctrlr->msix_enabled) {
+
+ /*
+ * MSI-X vector resource IDs start at 1, so we add one to
+ * the queue's vector to get the corresponding rid to use.
+ */
+ qpair->rid = vector + 1;
+
+ qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
+ &qpair->rid, RF_ACTIVE);
+
+ bus_setup_intr(ctrlr->dev, qpair->res,
+ INTR_TYPE_MISC | INTR_MPSAFE, NULL,
+ nvme_qpair_msix_handler, qpair, &qpair->tag);
+ }
+
+ mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+
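+ /* Allow one DMA segment per PRP entry, plus one for an unaligned first page. */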
+ bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
+ sizeof(uint64_t), PAGE_SIZE, BUS_SPACE_MAXADDR,
+ BUS_SPACE_MAXADDR, NULL, NULL, qpair->max_xfer_size,
+ (qpair->max_xfer_size/PAGE_SIZE)+1, PAGE_SIZE, 0,
+ NULL, NULL, &qpair->dma_tag);
+
+ qpair->num_cmds = 0;
+ qpair->num_tr = 0;
+ qpair->num_prp_list = 0;
+ qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
+
+ /* TODO: error checking on contigmalloc, bus_dmamap_load calls */
+ qpair->cmd = contigmalloc(qpair->num_entries *
+ sizeof(struct nvme_command), M_NVME, M_ZERO | M_NOWAIT,
+ 0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0);
+ qpair->cpl = contigmalloc(qpair->num_entries *
+ sizeof(struct nvme_completion), M_NVME, M_ZERO | M_NOWAIT,
+ 0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0);
+
+ bus_dmamap_create(qpair->dma_tag, 0, &qpair->cmd_dma_map);
+ bus_dmamap_create(qpair->dma_tag, 0, &qpair->cpl_dma_map);
+
+ bus_dmamap_load(qpair->dma_tag, qpair->cmd_dma_map,
+ qpair->cmd, qpair->num_entries * sizeof(struct nvme_command),
+ nvme_single_map, &qpair->cmd_bus_addr, 0);
+ bus_dmamap_load(qpair->dma_tag, qpair->cpl_dma_map,
+ qpair->cpl, qpair->num_entries * sizeof(struct nvme_completion),
+ nvme_single_map, &qpair->cpl_bus_addr, 0);
+
+ qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[id].sq_tdbl);
+ qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[id].cq_hdbl);
+
+ SLIST_INIT(&qpair->free_tr);
+ SLIST_INIT(&qpair->free_prp_list);
+
+ qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * qpair->num_entries,
+ M_NVME, M_ZERO | M_NOWAIT);
+}
+
+static void
+nvme_qpair_destroy(struct nvme_qpair *qpair)
+{
+ struct nvme_tracker *tr;
+ struct nvme_prp_list *prp_list;
+
+ if (qpair->tag)
+ bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
+
+ if (qpair->res)
+ bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
+ rman_get_rid(qpair->res), qpair->res);
+
+ if (qpair->dma_tag)
+ bus_dma_tag_destroy(qpair->dma_tag);
+
+ if (qpair->act_tr)
+ free(qpair->act_tr, M_NVME);
+
+ while (!SLIST_EMPTY(&qpair->free_tr)) {
+ tr = SLIST_FIRST(&qpair->free_tr);
+ SLIST_REMOVE_HEAD(&qpair->free_tr, slist);
+ bus_dmamap_destroy(qpair->dma_tag, tr->dma_map);
+ free(tr, M_NVME);
+ }
+
+ while (!SLIST_EMPTY(&qpair->free_prp_list)) {
+ prp_list = SLIST_FIRST(&qpair->free_prp_list);
+ SLIST_REMOVE_HEAD(&qpair->free_prp_list, slist);
+ bus_dmamap_destroy(qpair->dma_tag, prp_list->dma_map);
+ free(prp_list, M_NVME);
+ }
+}
+
+void
+nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
+{
+
+ /*
+ * For NVMe, you don't send delete queue commands for the admin
+ * queue, so we just need to unload and free the cmd and cpl memory.
+ */
+ bus_dmamap_unload(qpair->dma_tag, qpair->cmd_dma_map);
+ bus_dmamap_destroy(qpair->dma_tag, qpair->cmd_dma_map);
+
+ contigfree(qpair->cmd,
+ qpair->num_entries * sizeof(struct nvme_command), M_NVME);
+
+ bus_dmamap_unload(qpair->dma_tag, qpair->cpl_dma_map);
+ bus_dmamap_destroy(qpair->dma_tag, qpair->cpl_dma_map);
+ contigfree(qpair->cpl,
+ qpair->num_entries * sizeof(struct nvme_completion), M_NVME);
+
+ nvme_qpair_destroy(qpair);
+}
+
+static void
+nvme_free_cmd_ring(void *arg, const struct nvme_completion *status)
+{
+ struct nvme_qpair *qpair;
+
+ qpair = (struct nvme_qpair *)arg;
+ bus_dmamap_unload(qpair->dma_tag, qpair->cmd_dma_map);
+ bus_dmamap_destroy(qpair->dma_tag, qpair->cmd_dma_map);
+ contigfree(qpair->cmd,
+ qpair->num_entries * sizeof(struct nvme_command), M_NVME);
+ qpair->cmd = NULL;
+}
+
+static void
+nvme_free_cpl_ring(void *arg, const struct nvme_completion *status)
+{
+ struct nvme_qpair *qpair;
+
+ qpair = (struct nvme_qpair *)arg;
+ bus_dmamap_unload(qpair->dma_tag, qpair->cpl_dma_map);
+ bus_dmamap_destroy(qpair->dma_tag, qpair->cpl_dma_map);
+ contigfree(qpair->cpl,
+ qpair->num_entries * sizeof(struct nvme_completion), M_NVME);
+ qpair->cpl = NULL;
+}
+
+void
+nvme_io_qpair_destroy(struct nvme_qpair *qpair)
+{
+ struct nvme_controller *ctrlr = qpair->ctrlr;
+
+ if (qpair->num_entries > 0) {
+
+ nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_free_cmd_ring,
+ qpair);
+ /* Spin until free_cmd_ring sets qpair->cmd to NULL. */
+ while (qpair->cmd)
+ DELAY(5);
+
+ nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_free_cpl_ring,
+ qpair);
+ /* Spin until free_cpl_ring sets qpair->cpl to NULL. */
+ while (qpair->cpl)
+ DELAY(5);
+
+ nvme_qpair_destroy(qpair);
+ }
+}
+
+static void
+nvme_timeout(void *arg)
+{
+ /*
+ * TODO: Add explicit abort operation here, once nvme(4) supports
+ * abort commands.
+ */
+}
+
+void
+nvme_qpair_submit_cmd(struct nvme_qpair *qpair, struct nvme_tracker *tr)
+{
+
+ tr->cmd.cid = tr->cid;
+ qpair->act_tr[tr->cid] = tr;
+
+ /*
+ * TODO: rather than spin until entries free up, put this tracker
+ * on a queue, and submit from the interrupt handler when
+ * entries free up.
+ */
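+ /* The queue is full when advancing sq_tail would catch up to sq_head. */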
+ if ((qpair->sq_tail+1) % qpair->num_entries == qpair->sq_head) {
+ do {
+ mtx_unlock(&qpair->lock);
+ DELAY(5);
+ mtx_lock(&qpair->lock);
+ } while ((qpair->sq_tail+1) % qpair->num_entries == qpair->sq_head);
+ }
+
+ callout_reset(&tr->timer, NVME_TIMEOUT_IN_SEC * hz, nvme_timeout, tr);
+
+ /* Copy the command from the tracker to the submission queue. */
+ memcpy(&qpair->cmd[qpair->sq_tail], &tr->cmd, sizeof(tr->cmd));
+
+ if (++qpair->sq_tail == qpair->num_entries)
+ qpair->sq_tail = 0;
+
+ wmb();
+ nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].sq_tdbl,
+ qpair->sq_tail);
+
+ qpair->num_cmds++;
+
+ mtx_unlock(&qpair->lock);
+}
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
new file mode 100644
index 0000000..b8fa209
--- /dev/null
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -0,0 +1,187 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/sysctl.h>
+
+#include "nvme_private.h"
+
+static void
+nvme_dump_queue(struct nvme_qpair *qpair)
+{
+ struct nvme_completion *cpl;
+ struct nvme_command *cmd;
+ int i;
+
+ printf("id:%04Xh phase:%d\n", qpair->id, qpair->phase);
+
+ printf("Completion queue:\n");
+ for (i = 0; i < qpair->num_entries; i++) {
+ cpl = &qpair->cpl[i];
+ printf("%05d: ", i);
+ nvme_dump_completion(cpl);
+ }
+
+ printf("Submission queue:\n");
+ for (i = 0; i < qpair->num_entries; i++) {
+ cmd = &qpair->cmd[i];
+ printf("%05d: ", i);
+ nvme_dump_command(cmd);
+ }
+}
+
+
+static int
+nvme_sysctl_dump_debug(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_qpair *qpair = arg1;
+ uint32_t val = 0;
+
+ int error = sysctl_handle_int(oidp, &val, 0, req);
+
+ if (error)
+ return (error);
+
+ if (val != 0)
+ nvme_dump_queue(qpair);
+
+ return (0);
+}
+
+static int
+nvme_sysctl_int_coal_time(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_controller *ctrlr = arg1;
+ uint32_t oldval = ctrlr->int_coal_time;
+ int error = sysctl_handle_int(oidp, &ctrlr->int_coal_time, 0,
+ req);
+
+ if (error)
+ return (error);
+
+ if (oldval != ctrlr->int_coal_time)
+ nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr,
+ ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL,
+ NULL);
+
+ return (0);
+}
+
+static int
+nvme_sysctl_int_coal_threshold(SYSCTL_HANDLER_ARGS)
+{
+ struct nvme_controller *ctrlr = arg1;
+ uint32_t oldval = ctrlr->int_coal_threshold;
+ int error = sysctl_handle_int(oidp, &ctrlr->int_coal_threshold, 0,
+ req);
+
+ if (error)
+ return (error);
+
+ if (oldval != ctrlr->int_coal_threshold)
+ nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr,
+ ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL,
+ NULL);
+
+ return (0);
+}
+
+static void
+nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
+ struct sysctl_ctx_list *ctrlr_ctx, struct sysctl_oid *que_tree)
+{
+ struct sysctl_oid_list *que_list = SYSCTL_CHILDREN(que_tree);
+
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "num_entries",
+ CTLFLAG_RD, &qpair->num_entries, 0,
+ "Number of entries in hardware queue");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "num_tr",
+ CTLFLAG_RD, &qpair->num_tr, 0,
+ "Number of trackers allocated");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "num_prp_list",
+ CTLFLAG_RD, &qpair->num_prp_list, 0,
+ "Number of PRP lists allocated");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "sq_head",
+ CTLFLAG_RD, &qpair->sq_head, 0,
+ "Current head of submission queue (as observed by driver)");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "sq_tail",
+ CTLFLAG_RD, &qpair->sq_tail, 0,
+ "Current tail of submission queue (as observed by driver)");
+ SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "cq_head",
+ CTLFLAG_RD, &qpair->cq_head, 0,
+ "Current head of completion queue (as observed by driver)");
+
+ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_cmds",
+ CTLFLAG_RD, &qpair->num_cmds, "Number of commands submitted");
+
+ SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
+ "dump_debug", CTLTYPE_UINT | CTLFLAG_RW, qpair, 0,
+ nvme_sysctl_dump_debug, "IU", "Dump debug data");
+}
+
+void
+nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
+{
+ struct sysctl_ctx_list *ctrlr_ctx;
+ struct sysctl_oid *ctrlr_tree, *que_tree;
+ struct sysctl_oid_list *ctrlr_list;
+#define QUEUE_NAME_LENGTH 16
+ char queue_name[QUEUE_NAME_LENGTH];
+ int i;
+
+ ctrlr_ctx = device_get_sysctl_ctx(ctrlr->dev);
+ ctrlr_tree = device_get_sysctl_tree(ctrlr->dev);
+ ctrlr_list = SYSCTL_CHILDREN(ctrlr_tree);
+
+ if (ctrlr->is_started) {
+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "int_coal_time", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0,
+ nvme_sysctl_int_coal_time, "IU",
+ "Interrupt coalescing timeout (in microseconds)");
+
+ SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ "int_coal_threshold", CTLTYPE_UINT | CTLFLAG_RW, ctrlr, 0,
+ nvme_sysctl_int_coal_threshold, "IU",
+ "Interrupt coalescing threshold");
+ }
+
+ que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO, "adminq",
+ CTLFLAG_RD, NULL, "Admin Queue");
+
+ nvme_sysctl_initialize_queue(&ctrlr->adminq, ctrlr_ctx, que_tree);
+
+ for (i = 0; i < ctrlr->num_io_queues; i++) {
+ snprintf(queue_name, QUEUE_NAME_LENGTH, "ioq%d", i);
+ que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
+ queue_name, CTLFLAG_RD, NULL, "IO Queue");
+ nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
+ que_tree);
+ }
+}
diff --git a/sys/dev/nvme/nvme_test.c b/sys/dev/nvme/nvme_test.c
new file mode 100644
index 0000000..4177227
--- /dev/null
+++ b/sys/dev/nvme/nvme_test.c
@@ -0,0 +1,305 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/kthread.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/unistd.h>
+
+#include <geom/geom.h>
+
+#include "nvme_private.h"
+
+struct nvme_io_test_thread {
+
+ uint32_t idx;
+ struct nvme_namespace *ns;
+ enum nvme_nvm_opcode opc;
+ struct timeval start;
+ void *buf;
+ uint32_t size;
+ uint32_t time;
+ uint32_t io_completed;
+};
+
+struct nvme_io_test_internal {
+
+ struct nvme_namespace *ns;
+ enum nvme_nvm_opcode opc;
+ struct timeval start;
+ uint32_t time;
+ uint32_t size;
+ uint32_t td_active;
+ uint32_t td_idx;
+ uint32_t flags;
+ uint32_t io_completed[NVME_TEST_MAX_THREADS];
+};
+
+static void
+nvme_ns_bio_test_cb(struct bio *bio)
+{
+ struct mtx *mtx;
+
+ mtx = mtx_pool_find(mtxpool_sleep, bio);
+ mtx_lock(mtx);
+ wakeup(bio);
+ mtx_unlock(mtx);
+}
+
+static void
+nvme_ns_bio_test(void *arg)
+{
+ struct nvme_io_test_internal *io_test = arg;
+ struct cdevsw *csw;
+ struct mtx *mtx;
+ struct bio *bio;
+ struct cdev *dev;
+ void *buf;
+ struct timeval t;
+ uint64_t offset;
+ uint32_t idx, io_completed = 0;
+#if __FreeBSD_version >= 900017
+ int ref;
+#endif
+
+ buf = malloc(io_test->size, M_NVME, M_NOWAIT);
+ idx = atomic_fetchadd_int(&io_test->td_idx, 1);
+ dev = io_test->ns->cdev;
+
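+ /* Give each test thread its own starting offset, spaced 2048 sectors apart. */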
+ offset = idx * 2048 * nvme_ns_get_sector_size(io_test->ns);
+
+ while (1) {
+
+ bio = g_alloc_bio();
+
+ memset(bio, 0, sizeof(*bio));
+ bio->bio_cmd = (io_test->opc == NVME_OPC_READ) ?
+ BIO_READ : BIO_WRITE;
+ bio->bio_done = nvme_ns_bio_test_cb;
+ bio->bio_dev = dev;
+ bio->bio_offset = offset;
+ bio->bio_data = buf;
+ bio->bio_bcount = io_test->size;
+
+ if (io_test->flags & NVME_TEST_FLAG_REFTHREAD) {
+#if __FreeBSD_version >= 900017
+ csw = dev_refthread(dev, &ref);
+#else
+ csw = dev_refthread(dev);
+#endif
+ } else
+ csw = dev->si_devsw;
+
+ mtx = mtx_pool_find(mtxpool_sleep, bio);
+ mtx_lock(mtx);
+ (*csw->d_strategy)(bio);
+ msleep(bio, mtx, PRIBIO, "biotestwait", 0);
+ mtx_unlock(mtx);
+
+ if (io_test->flags & NVME_TEST_FLAG_REFTHREAD) {
+#if __FreeBSD_version >= 900017
+ dev_relthread(dev, ref);
+#else
+ dev_relthread(dev);
+#endif
+ }
+
+ if ((bio->bio_flags & BIO_ERROR) || (bio->bio_resid > 0))
+ break;
+
+ g_destroy_bio(bio);
+
+ io_completed++;
+
+ getmicrouptime(&t);
+ timevalsub(&t, &io_test->start);
+
+ if (t.tv_sec >= io_test->time)
+ break;
+
+ offset += io_test->size;
+ if ((offset + io_test->size) > nvme_ns_get_size(io_test->ns))
+ offset = 0;
+ }
+
+ io_test->io_completed[idx] = io_completed;
+ wakeup_one(io_test);
+
+ free(buf, M_NVME);
+
+ atomic_subtract_int(&io_test->td_active, 1);
+ mb();
+
+#if __FreeBSD_version >= 800000
+ kthread_exit();
+#else
+ kthread_exit(0);
+#endif
+}
+
+static void
+nvme_ns_io_test_cb(void *arg, const struct nvme_completion *status)
+{
+ struct nvme_io_test_thread *tth = arg;
+ struct timeval t;
+
+ tth->io_completed++;
+
+ if (status->sf_sc || status->sf_sct) {
+ printf("%s: error occurred\n", __func__);
+ wakeup_one(tth);
+ return;
+ }
+
+ getmicrouptime(&t);
+ timevalsub(&t, &tth->start);
+
+ if (t.tv_sec >= tth->time) {
+ wakeup_one(tth);
+ return;
+ }
+
+ switch (tth->opc) {
+ case NVME_OPC_WRITE:
+ nvme_ns_cmd_write(tth->ns, tth->buf, tth->idx * 2048,
+ tth->size/nvme_ns_get_sector_size(tth->ns),
+ nvme_ns_io_test_cb, tth);
+ break;
+ case NVME_OPC_READ:
+ nvme_ns_cmd_read(tth->ns, tth->buf, tth->idx * 2048,
+ tth->size/nvme_ns_get_sector_size(tth->ns),
+ nvme_ns_io_test_cb, tth);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+nvme_ns_io_test(void *arg)
+{
+ struct nvme_io_test_internal *io_test = arg;
+ struct nvme_io_test_thread *tth;
+ struct nvme_completion cpl;
+ int error;
+
+ tth = malloc(sizeof(*tth), M_NVME, M_NOWAIT | M_ZERO);
+ tth->ns = io_test->ns;
+ tth->opc = io_test->opc;
+ memcpy(&tth->start, &io_test->start, sizeof(tth->start));
+ tth->buf = malloc(io_test->size, M_NVME, M_NOWAIT);
+ tth->size = io_test->size;
+ tth->time = io_test->time;
+ tth->idx = atomic_fetchadd_int(&io_test->td_idx, 1);
+
+ memset(&cpl, 0, sizeof(cpl));
+
+ nvme_ns_io_test_cb(tth, &cpl);
+
+ error = tsleep(tth, 0, "test_wait", tth->time*hz*2);
+
+ if (error)
+ printf("%s: error = %d\n", __func__, error);
+
+ io_test->io_completed[tth->idx] = tth->io_completed;
+ wakeup_one(io_test);
+
+ free(tth->buf, M_NVME);
+ free(tth, M_NVME);
+
+ atomic_subtract_int(&io_test->td_active, 1);
+ mb();
+
+#if __FreeBSD_version >= 800004
+ kthread_exit();
+#else
+ kthread_exit(0);
+#endif
+}
+
+void
+nvme_ns_test(struct nvme_namespace *ns, u_long cmd, caddr_t arg)
+{
+ struct nvme_io_test *io_test;
+ struct nvme_io_test_internal *io_test_internal;
+ void (*fn)(void *);
+ int i;
+
+ io_test = (struct nvme_io_test *)arg;
+
+ if ((io_test->opc != NVME_OPC_READ) &&
+ (io_test->opc != NVME_OPC_WRITE))
+ return;
+
+ if (io_test->size % nvme_ns_get_sector_size(ns))
+ return;
+
+ io_test_internal = malloc(sizeof(*io_test_internal), M_NVME,
+ M_NOWAIT | M_ZERO);
+ io_test_internal->opc = io_test->opc;
+ io_test_internal->ns = ns;
+ io_test_internal->td_active = io_test->num_threads;
+ io_test_internal->time = io_test->time;
+ io_test_internal->size = io_test->size;
+ io_test_internal->flags = io_test->flags;
+
+ if (cmd == NVME_IO_TEST)
+ fn = nvme_ns_io_test;
+ else
+ fn = nvme_ns_bio_test;
+
+ getmicrouptime(&io_test_internal->start);
+
+ for (i = 0; i < io_test->num_threads; i++)
+#if __FreeBSD_version >= 800004
+ kthread_add(fn, io_test_internal,
+ NULL, NULL, 0, 0, "nvme_io_test[%d]", i);
+#else
+ kthread_create(fn, io_test_internal,
+ NULL, 0, 0, "nvme_io_test[%d]", i);
+#endif
+
+ tsleep(io_test_internal, 0, "nvme_test", io_test->time * 2 * hz);
+
+ while (io_test_internal->td_active > 0)
+ DELAY(10);
+
+ memcpy(io_test->io_completed, io_test_internal->io_completed,
+ sizeof(io_test->io_completed));
+
+ free(io_test_internal, M_NVME);
+}
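
The perftest path in nvme_test.c is driven from userspace through the NVME_IO_TEST ioctl on a namespace device (the other supported command selects the bio/strategy-based test); nvmecontrol(8) is the intended front end. Below is a minimal hypothetical sketch of such a caller, based only on the fields referenced in nvme_ns_test() above. The device node name, the header include path, and the exact ioctl encoding are assumptions, not taken from this patch.

/*
 * Hypothetical sketch: driving the kernel perftest from userspace.
 * Device path and include path are assumptions; nvmecontrol(8) is
 * the supported consumer of this interface.
 */
#include <sys/param.h>
#include <sys/ioccom.h>
#include <sys/ioctl.h>

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include <dev/nvme/nvme.h>	/* struct nvme_io_test, NVME_IO_TEST, NVME_OPC_READ */

int
main(void)
{
	struct nvme_io_test io_test;
	uint32_t total = 0;
	int fd, i;

	memset(&io_test, 0, sizeof(io_test));
	io_test.opc = NVME_OPC_READ;	/* or NVME_OPC_WRITE */
	io_test.size = 4096;		/* must be a multiple of the sector size */
	io_test.time = 10;		/* run time in seconds */
	io_test.num_threads = 4;	/* one worker kthread per entry in io_completed[] */

	fd = open("/dev/nvme0ns1", O_RDWR);	/* assumed namespace device node */
	if (fd < 0 || ioctl(fd, NVME_IO_TEST, &io_test) < 0) {
		perror("nvme io test");
		return (1);
	}

	/* Per-thread completion counts are copied back into io_completed[]. */
	for (i = 0; i < io_test.num_threads; i++)
		total += io_test.io_completed[i];
	printf("%u I/Os completed (%u IO/s)\n", total, total / io_test.time);

	close(fd);
	return (0);
}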
diff --git a/sys/dev/nvme/nvme_uio.c b/sys/dev/nvme/nvme_uio.c
new file mode 100644
index 0000000..ad5fd2f
--- /dev/null
+++ b/sys/dev/nvme/nvme_uio.c
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (C) 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include "nvme_private.h"
+
+static void
+nvme_uio_done(void *arg, const struct nvme_completion *status)
+{
+ struct mtx *mtx;
+
+ /* TODO: update uio flags based on status */
+
+ mtx = mtx_pool_find(mtxpool_sleep, arg);
+ mtx_lock(mtx);
+ wakeup(arg);
+ mtx_unlock(mtx);
+}
+
+static struct nvme_tracker *
+nvme_allocate_tracker_uio(struct nvme_controller *ctrlr, struct uio *uio)
+{
+ struct nvme_tracker *tr;
+ struct nvme_qpair *qpair;
+
+ if (ctrlr->per_cpu_io_queues)
+ qpair = &ctrlr->ioq[curcpu];
+ else
+ qpair = &ctrlr->ioq[0];
+
+ /*
+ * For uio, always allocate a PRP list, rather than walking
+ * the iovecs.
+ */
+ tr = nvme_qpair_allocate_tracker(qpair, TRUE /* alloc_prp_list */);
+
+ memset(&tr->cmd, 0, sizeof(tr->cmd));
+
+ tr->qpair = qpair;
+ tr->cb_fn = nvme_uio_done;
+ tr->cb_arg = uio;
+
+ return (tr);
+}
+
+static void
+nvme_payload_map_uio(void *arg, bus_dma_segment_t *seg, int nseg,
+ bus_size_t mapsize, int error)
+{
+ nvme_payload_map(arg, seg, nseg, error);
+}
+
+static void
+nvme_read_uio(struct nvme_namespace *ns, struct uio *uio)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err, i;
+ uint64_t lba, iosize = 0;
+
+ tr = nvme_allocate_tracker_uio(ns->ctrlr, uio);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_READ;
+ cmd->nsid = ns->id;
+ lba = uio->uio_offset / nvme_ns_get_sector_size(ns);
+
+ *(uint64_t *)&cmd->cdw10 = lba;
+
+ for (i = 0; i < uio->uio_iovcnt; i++) {
+ iosize += uio->uio_iov[i].iov_len;
+ }
+
+ cmd->cdw12 = (iosize / nvme_ns_get_sector_size(ns))-1;
+
+ err = bus_dmamap_load_uio(tr->qpair->dma_tag, tr->dma_map, uio,
+ nvme_payload_map_uio, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load_uio returned non-zero!\n"));
+}
+
+static void
+nvme_write_uio(struct nvme_namespace *ns, struct uio *uio)
+{
+ struct nvme_tracker *tr;
+ struct nvme_command *cmd;
+ int err, i;
+ uint64_t lba, iosize = 0;
+
+ tr = nvme_allocate_tracker_uio(ns->ctrlr, uio);
+
+ cmd = &tr->cmd;
+ cmd->opc = NVME_OPC_WRITE;
+ cmd->nsid = ns->id;
+ lba = uio->uio_offset / nvme_ns_get_sector_size(ns);
+
+ *(uint64_t *)&cmd->cdw10 = lba;
+
+ for (i = 0; i < uio->uio_iovcnt; i++) {
+ iosize += uio->uio_iov[i].iov_len;
+ }
+
+ cmd->cdw12 = (iosize / nvme_ns_get_sector_size(ns))-1;
+
+ err = bus_dmamap_load_uio(tr->qpair->dma_tag, tr->dma_map, uio,
+ nvme_payload_map_uio, tr, 0);
+
+ KASSERT(err == 0, ("bus_dmamap_load_uio returned non-zero!\n"));
+}
+
+int
+nvme_ns_physio(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct nvme_namespace *ns;
+ struct mtx *mtx;
+#if __FreeBSD_version > 900017
+ int ref;
+#endif
+
+ PHOLD(curproc);
+
+ ns = dev->si_drv1;
+ mtx = mtx_pool_find(mtxpool_sleep, uio);
+
+#if __FreeBSD_version > 900017
+ dev_refthread(dev, &ref);
+#else
+ dev_refthread(dev);
+#endif
+
+ mtx_lock(mtx);
+ if (uio->uio_rw == UIO_READ)
+ nvme_read_uio(ns, uio);
+ else
+ nvme_write_uio(ns, uio);
+
+ msleep(uio, mtx, PRIBIO, "nvme_physio", 0);
+ mtx_unlock(mtx);
+
+#if __FreeBSD_version > 900017
+ dev_relthread(dev, ref);
+#else
+ dev_relthread(dev);
+#endif
+
+ uio->uio_resid = 0;
+
+ PRELE(curproc);
+ return (0);
+}
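
nvme_ns_physio() is presumably registered as the namespace cdev's read/write entry point (that wiring lives in nvme_ns.c, not in this hunk), so a plain read(2) or write(2) on the namespace node feeds a uio straight through bus_dmamap_load_uio() to the controller, bypassing GEOM and nvd(4). The sketch below shows how such a raw transfer might be exercised; the device node name is an assumption, and the offset and length should be whole multiples of the namespace sector size since cdw12 is derived from the total iovec length.

/*
 * Hypothetical sketch: exercising the physio path with a raw read on
 * a namespace character device (device node name assumed).
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	char *buf;
	ssize_t done;
	int fd;

	fd = open("/dev/nvme0ns1", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Offset and length sized in whole sectors (4 KB assumed here). */
	buf = malloc(131072);
	if (buf == NULL)
		return (1);
	done = pread(fd, buf, 131072, 0);
	printf("read %zd bytes via the physio path\n", done);

	free(buf);
	close(fd);
	return (0);
}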