Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/amd64/pmap.c  26
-rw-r--r--  sys/arm/arm/cpufunc.c  1
-rw-r--r--  sys/arm/arm/identcpu.c  2
-rw-r--r--  sys/arm/arm/pmap-v6-new.c  26
-rw-r--r--  sys/arm/arm/stdatomic.c  10
-rw-r--r--  sys/arm/broadcom/bcm2835/bcm2835_systimer.c  6
-rw-r--r--  sys/arm/conf/BEAGLEBONE  3
-rw-r--r--  sys/arm/include/armreg.h  1
-rw-r--r--  sys/arm/ti/am335x/am335x_dmtpps.c  549
-rw-r--r--  sys/arm/ti/am335x/files.am335x  1
-rw-r--r--  sys/arm/versatile/sp804.c  6
-rw-r--r--  sys/arm64/arm64/bus_machdep.c  32
-rw-r--r--  sys/arm64/arm64/bus_space_asm.S  164
-rw-r--r--  sys/arm64/arm64/exception.S  2
-rw-r--r--  sys/arm64/arm64/trap.c  39
-rw-r--r--  sys/boot/kshim/bsd_kernel.h  3
-rw-r--r--  sys/boot/uboot/fdt/uboot_fdt.c  7
-rw-r--r--  sys/cam/ctl/README.ctl.txt  10
-rw-r--r--  sys/cam/ctl/ctl.c  536
-rw-r--r--  sys/cam/ctl/ctl.h  2
-rw-r--r--  sys/cam/ctl/ctl_backend.c  1
-rw-r--r--  sys/cam/ctl/ctl_backend_block.c  68
-rw-r--r--  sys/cam/ctl/ctl_backend_ramdisk.c  1
-rw-r--r--  sys/cam/ctl/ctl_cmd_table.c  1
-rw-r--r--  sys/cam/ctl/ctl_error.c  1
-rw-r--r--  sys/cam/ctl/ctl_frontend.c  1
-rw-r--r--  sys/cam/ctl/ctl_frontend_cam_sim.c  1
-rw-r--r--  sys/cam/ctl/ctl_frontend_internal.c  1612
-rw-r--r--  sys/cam/ctl/ctl_frontend_internal.h  154
-rw-r--r--  sys/cam/ctl/ctl_frontend_ioctl.c  470
-rw-r--r--  sys/cam/ctl/ctl_frontend_iscsi.c  1
-rw-r--r--  sys/cam/ctl/ctl_ioctl.h  22
-rw-r--r--  sys/cam/ctl/ctl_private.h  25
-rw-r--r--  sys/cam/ctl/ctl_tpc.c  1
-rw-r--r--  sys/cam/ctl/ctl_tpc_local.c  1
-rw-r--r--  sys/cddl/compat/opensolaris/sys/nvpair.h  187
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c  54
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h  18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/Makefile.files  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  2125
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c  111
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  266
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c  44
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c  822
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c  28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c  18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c  216
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c  24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c  9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c  366
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c  4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h  8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h  54
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h  22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h  106
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  23
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h  41
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c  13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c  15
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c  1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c  4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c  141
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c  2
-rw-r--r--  sys/compat/cloudabi/cloudabi_proc.c  5
-rw-r--r--  sys/conf/Makefile.arm  4
-rw-r--r--  sys/conf/NOTES  7
-rw-r--r--  sys/conf/files  24
-rw-r--r--  sys/conf/files.amd64  2
-rw-r--r--  sys/conf/kern.post.mk  28
-rw-r--r--  sys/conf/kern.pre.mk  29
-rw-r--r--  sys/conf/options  12
-rw-r--r--  sys/contrib/libnv/nv_impl.h  38
-rw-r--r--  sys/contrib/libnv/nvlist.c  643
-rw-r--r--  sys/contrib/libnv/nvlist_impl.h  1
-rw-r--r--  sys/contrib/libnv/nvpair.c  972
-rw-r--r--  sys/contrib/libnv/nvpair_impl.h  20
-rw-r--r--  sys/dev/ata/ata-all.c  30
-rw-r--r--  sys/dev/ata/ata-all.h  9
-rw-r--r--  sys/dev/ath/if_ath.c  59
-rw-r--r--  sys/dev/ath/if_ath_keycache.c  6
-rw-r--r--  sys/dev/ath/if_ath_rx.c  2
-rw-r--r--  sys/dev/ath/if_ath_tdma.c  2
-rw-r--r--  sys/dev/ath/if_ath_tx.c  12
-rw-r--r--  sys/dev/bxe/ecore_hsi.h  6
-rw-r--r--  sys/dev/e1000/e1000_80003es2lan.c  2
-rw-r--r--  sys/dev/e1000/e1000_80003es2lan.h  2
-rw-r--r--  sys/dev/e1000/e1000_82540.c  2
-rw-r--r--  sys/dev/e1000/e1000_82541.c  2
-rw-r--r--  sys/dev/e1000/e1000_82541.h  2
-rw-r--r--  sys/dev/e1000/e1000_82542.c  2
-rw-r--r--  sys/dev/e1000/e1000_82543.c  2
-rw-r--r--  sys/dev/e1000/e1000_82543.h  2
-rw-r--r--  sys/dev/e1000/e1000_82571.c  2
-rw-r--r--  sys/dev/e1000/e1000_82571.h  2
-rw-r--r--  sys/dev/e1000/e1000_82575.c  2
-rw-r--r--  sys/dev/e1000/e1000_82575.h  2
-rw-r--r--  sys/dev/e1000/e1000_api.c  2
-rw-r--r--  sys/dev/e1000/e1000_api.h  2
-rw-r--r--  sys/dev/e1000/e1000_defines.h  2
-rw-r--r--  sys/dev/e1000/e1000_hw.h  2
-rw-r--r--  sys/dev/e1000/e1000_i210.c  2
-rw-r--r--  sys/dev/e1000/e1000_i210.h  2
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.c  2
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.h  2
-rw-r--r--  sys/dev/e1000/e1000_mac.c  2
-rw-r--r--  sys/dev/e1000/e1000_mac.h  2
-rw-r--r--  sys/dev/e1000/e1000_manage.c  2
-rw-r--r--  sys/dev/e1000/e1000_manage.h  2
-rw-r--r--  sys/dev/e1000/e1000_mbx.c  2
-rw-r--r--  sys/dev/e1000/e1000_mbx.h  2
-rw-r--r--  sys/dev/e1000/e1000_nvm.c  2
-rw-r--r--  sys/dev/e1000/e1000_nvm.h  2
-rw-r--r--  sys/dev/e1000/e1000_osdep.c  2
-rw-r--r--  sys/dev/e1000/e1000_osdep.h  2
-rw-r--r--  sys/dev/e1000/e1000_phy.c  2
-rw-r--r--  sys/dev/e1000/e1000_phy.h  2
-rw-r--r--  sys/dev/e1000/e1000_regs.h  2
-rw-r--r--  sys/dev/e1000/e1000_vf.c  2
-rw-r--r--  sys/dev/e1000/e1000_vf.h  2
-rw-r--r--  sys/dev/e1000/if_em.c  151
-rw-r--r--  sys/dev/e1000/if_em.h  2
-rw-r--r--  sys/dev/e1000/if_igb.c  2
-rw-r--r--  sys/dev/e1000/if_igb.h  2
-rw-r--r--  sys/dev/e1000/if_lem.c  8
-rw-r--r--  sys/dev/e1000/if_lem.h  5
-rw-r--r--  sys/dev/gpio/gpiobus.c  18
-rw-r--r--  sys/dev/gpio/gpioled.c  1
-rw-r--r--  sys/dev/md/md.c  36
-rw-r--r--  sys/dev/random/fortuna.c  45
-rw-r--r--  sys/dev/random/other_algorithm.c  209
-rw-r--r--  sys/dev/random/other_algorithm.h  62
-rw-r--r--  sys/dev/random/random_harvestq.c  73
-rw-r--r--  sys/dev/random/random_harvestq.h  2
-rw-r--r--  sys/dev/random/random_infra.c  128
-rw-r--r--  sys/dev/random/randomdev.c  181
-rw-r--r--  sys/dev/random/randomdev.h  38
-rw-r--r--  sys/dev/random/randomdev_none.c  72
-rw-r--r--  sys/dev/random/unit_test.c  32
-rw-r--r--  sys/dev/random/yarrow.c  63
-rw-r--r--  sys/dev/usb/controller/dwc_otg.c  492
-rw-r--r--  sys/dev/usb/controller/dwc_otg.h  5
-rw-r--r--  sys/dev/usb/controller/usb_controller.c  14
-rw-r--r--  sys/dev/usb/usb_bus.h  19
-rw-r--r--  sys/dev/usb/usb_device.c  2
-rw-r--r--  sys/dev/usb/usb_hub.c  4
-rw-r--r--  sys/dev/usb/usb_pf.c  6
-rw-r--r--  sys/dev/usb/usb_process.h  1
-rw-r--r--  sys/dev/usb/usb_transfer.c  63
-rw-r--r--  sys/dev/usb/usbdi.h  2
-rw-r--r--  sys/dev/vt/hw/efifb/efifb.c  26
-rw-r--r--  sys/dev/vt/hw/vga/vt_vga.c  10
-rw-r--r--  sys/dev/vt/hw/vga/vt_vga_reg.h  2
-rw-r--r--  sys/dev/vt/vt_core.c  5
-rw-r--r--  sys/dev/xen/netfront/netfront.c  60
-rw-r--r--  sys/fs/nfsserver/nfs_nfsdstate.c  7
-rw-r--r--  sys/kern/genassym.sh  2
-rw-r--r--  sys/kern/kern_exit.c  4
-rw-r--r--  sys/kern/kern_tc.c  18
-rw-r--r--  sys/modules/Makefile  6
-rw-r--r--  sys/modules/am335x_dmtpps/Makefile  8
-rw-r--r--  sys/modules/ctl/Makefile  2
-rw-r--r--  sys/modules/gpio/gpiobus/Makefile  5
-rw-r--r--  sys/modules/random_fortuna/Makefile  11
-rw-r--r--  sys/modules/random_other/Makefile  11
-rw-r--r--  sys/modules/random_yarrow/Makefile  11
-rw-r--r--  sys/net/ieee8023ad_lacp.c  2
-rw-r--r--  sys/net/ieee8023ad_lacp.h  1
-rw-r--r--  sys/net/if_lagg.c  19
-rw-r--r--  sys/net/if_lagg.h  1
-rw-r--r--  sys/netinet/if_ether.c  415
-rw-r--r--  sys/netinet/sctp_timer.c  2
-rw-r--r--  sys/ofed/drivers/infiniband/core/cma.c  50
-rw-r--r--  sys/powerpc/powerpc/trap.c  6
-rw-r--r--  sys/sys/ata.h  1
-rw-r--r--  sys/sys/nv.h  72
-rw-r--r--  sys/sys/random.h  31
-rw-r--r--  sys/sys/socketvar.h  2
-rw-r--r--  sys/sys/timeet.h  2
-rw-r--r--  sys/sys/timetc.h  2
-rw-r--r--  sys/teken/demo/teken_demo.c  2
-rw-r--r--  sys/teken/teken.c  26
-rw-r--r--  sys/vm/vm_pageout.c  5
-rw-r--r--  sys/x86/iommu/intel_idpgtbl.c  5
-rw-r--r--  sys/x86/x86/busdma_bounce.c  97
-rw-r--r--  sys/xen/gnttab.h  17
199 files changed, 8063 insertions, 5176 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 1e64fc8..41dea8b 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -390,6 +390,8 @@ static struct md_page *pv_table;
*/
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
+static vm_offset_t qframe = 0;
+static struct mtx qframe_mtx;
static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
@@ -1031,7 +1033,7 @@ pmap_init(void)
struct pmap_preinit_mapping *ppim;
vm_page_t mpte;
vm_size_t s;
- int i, pv_npg;
+ int error, i, pv_npg;
/*
* Initialize the vm page array entries for the kernel pmap's
@@ -1112,6 +1114,12 @@ pmap_init(void)
printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
ppim->pa, ppim->va, ppim->sz, ppim->mode);
}
+
+ mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
+ error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
+ (vmem_addr_t *)&qframe);
+ if (error != 0)
+ panic("qframe allocation failed");
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
@@ -7019,13 +7027,27 @@ pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
+ vm_paddr_t paddr;
- return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
+ paddr = VM_PAGE_TO_PHYS(m);
+ if (paddr < dmaplimit)
+ return (PHYS_TO_DMAP(paddr));
+ mtx_lock_spin(&qframe_mtx);
+ KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
+ pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
+ X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
+ return (qframe);
}
void
pmap_quick_remove_page(vm_offset_t addr)
{
+
+ if (addr != qframe)
+ return;
+ pte_store(vtopte(qframe), 0);
+ invlpg(qframe);
+ mtx_unlock_spin(&qframe_mtx);
}
#include "opt_ddb.h"
diff --git a/sys/arm/arm/cpufunc.c b/sys/arm/arm/cpufunc.c
index 0b589ed..fea0581 100644
--- a/sys/arm/arm/cpufunc.c
+++ b/sys/arm/arm/cpufunc.c
@@ -904,6 +904,7 @@ set_cpufuncs()
cputype == CPU_ID_CORTEXA9R1 ||
cputype == CPU_ID_CORTEXA9R2 ||
cputype == CPU_ID_CORTEXA9R3 ||
+ cputype == CPU_ID_CORTEXA9R4 ||
cputype == CPU_ID_CORTEXA12R0 ||
cputype == CPU_ID_CORTEXA15R0 ||
cputype == CPU_ID_CORTEXA15R1 ||
diff --git a/sys/arm/arm/identcpu.c b/sys/arm/arm/identcpu.c
index 75bf08c..be1393b1 100644
--- a/sys/arm/arm/identcpu.c
+++ b/sys/arm/arm/identcpu.c
@@ -185,6 +185,8 @@ const struct cpuidtab cpuids[] = {
generic_steppings },
{ CPU_ID_CORTEXA9R3, CPU_CLASS_CORTEXA, "Cortex A9-r3",
generic_steppings },
+ { CPU_ID_CORTEXA9R4, CPU_CLASS_CORTEXA, "Cortex A9-r4",
+ generic_steppings },
{ CPU_ID_CORTEXA12R0, CPU_CLASS_CORTEXA, "Cortex A12-r0",
generic_steppings },
{ CPU_ID_CORTEXA15R0, CPU_CLASS_CORTEXA, "Cortex A15-r0",
diff --git a/sys/arm/arm/pmap-v6-new.c b/sys/arm/arm/pmap-v6-new.c
index b18648f..864e05c 100644
--- a/sys/arm/arm/pmap-v6-new.c
+++ b/sys/arm/arm/pmap-v6-new.c
@@ -1166,10 +1166,9 @@ pmap_init_qpages(void)
pc = pcpu_find(i);
pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
if (pc->pc_qmap_addr == 0)
- panic("pmap_init_qpages: unable to allocate KVA");
+ panic("%s: unable to allocate KVA", __func__);
}
}
-
SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL);
/*
@@ -5728,18 +5727,17 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
- pt2_entry_t *pte;
- vm_offset_t qmap_addr;
+ pt2_entry_t *pte2p;
+ vm_offset_t qmap_addr;
critical_enter();
-
qmap_addr = PCPU_GET(qmap_addr);
- pte = pt2map_entry(qmap_addr);
+ pte2p = pt2map_entry(qmap_addr);
- KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
+ KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__));
- pte2_store(pte, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m),
- PTE2_AP_KRW, pmap_page_get_memattr(m)));
+ pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
+ pmap_page_get_memattr(m)));
tlb_flush_local(qmap_addr);
return (qmap_addr);
@@ -5748,16 +5746,16 @@ pmap_quick_enter_page(vm_page_t m)
void
pmap_quick_remove_page(vm_offset_t addr)
{
- pt2_entry_t *pte;
+ pt2_entry_t *pte2p;
vm_offset_t qmap_addr;
qmap_addr = PCPU_GET(qmap_addr);
- pte = pt2map_entry(qmap_addr);
+ pte2p = pt2map_entry(qmap_addr);
- KASSERT(addr == qmap_addr, ("pmap_quick_remove_page: invalid address"));
- KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
+ KASSERT(addr == qmap_addr, ("%s: invalid address", __func__));
+ KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__));
- pte2_clear(pte);
+ pte2_clear(pte2p);
critical_exit();
}
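
One caveat worth noting: both implementations back this KPI with a single mapping slot (a global qframe on amd64 for non-DMAP pages, a per-CPU qmap page here on ARM), so the calls cannot nest; a second enter would trip the busy-PTE KASSERTs or self-deadlock on the qframe spin mutex. Copying between two arbitrary pages therefore has to stage through a buffer rather than hold two quick mappings at once. A sketch under that assumption, with copy_page_staged() purely illustrative:

static void
copy_page_staged(vm_page_t src, vm_page_t dst, void *stage)
{
	vm_offset_t va;

	/* 'stage' must be a PAGE_SIZE buffer supplied by the caller. */
	va = pmap_quick_enter_page(src);
	memcpy(stage, (void *)va, PAGE_SIZE);
	pmap_quick_remove_page(va);

	va = pmap_quick_enter_page(dst);
	memcpy((void *)va, stage, PAGE_SIZE);
	pmap_quick_remove_page(va);
}
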
diff --git a/sys/arm/arm/stdatomic.c b/sys/arm/arm/stdatomic.c
index 211f26a..3c0b997 100644
--- a/sys/arm/arm/stdatomic.c
+++ b/sys/arm/arm/stdatomic.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/acle-compat.h>
+#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/sysarch.h>
@@ -67,19 +68,12 @@ do_sync(void)
__asm volatile ("" : : : "memory");
}
-#elif __ARM_ARCH >= 7
-static inline void
-do_sync(void)
-{
-
- __asm volatile ("dmb" : : : "memory");
-}
#elif __ARM_ARCH >= 6
static inline void
do_sync(void)
{
- __asm volatile ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
+ dmb();
}
#endif
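
The ARMv7 branch can be folded into the ARMv6 one because dmb() from <machine/atomic.h> already selects the right barrier for the architecture level: the dmb instruction on ARMv7, and the CP15 c7/c10/5 write (the same instruction the deleted inline asm issued by hand) on ARMv6. Roughly what the header provides, paraphrased here as an assumption rather than copied from the tree:

#if __ARM_ARCH >= 7
#define	dmb()	__asm __volatile("dmb" : : : "memory")
#elif __ARM_ARCH >= 6
#define	dmb()	__asm __volatile("mcr p15, 0, %0, c7, c10, 5" \
		    : : "r" (0) : "memory")
#endif
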
diff --git a/sys/arm/broadcom/bcm2835/bcm2835_systimer.c b/sys/arm/broadcom/bcm2835/bcm2835_systimer.c
index 93bf676..731c7d0 100644
--- a/sys/arm/broadcom/bcm2835/bcm2835_systimer.c
+++ b/sys/arm/broadcom/bcm2835/bcm2835_systimer.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#define BCM2835_NUM_TIMERS 4
#define DEFAULT_TIMER 3
+#define DEFAULT_TIMER_NAME "BCM2835-3"
#define DEFAULT_FREQUENCY 1000000
#define MIN_PERIOD 5LLU
@@ -101,7 +102,7 @@ static struct bcm_systimer_softc *bcm_systimer_sc = NULL;
static unsigned bcm_systimer_tc_get_timecount(struct timecounter *);
static struct timecounter bcm_systimer_tc = {
- .tc_name = "BCM2835 Timecounter",
+ .tc_name = DEFAULT_TIMER_NAME,
.tc_get_timecount = bcm_systimer_tc_get_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
@@ -238,8 +239,7 @@ bcm_systimer_attach(device_t dev)
sc->st[DEFAULT_TIMER].index = DEFAULT_TIMER;
sc->st[DEFAULT_TIMER].enabled = 0;
- sc->st[DEFAULT_TIMER].et.et_name = malloc(64, M_DEVBUF, M_NOWAIT | M_ZERO);
- sprintf(sc->st[DEFAULT_TIMER].et.et_name, "BCM2835 Event Timer %d", DEFAULT_TIMER);
+ sc->st[DEFAULT_TIMER].et.et_name = DEFAULT_TIMER_NAME;
sc->st[DEFAULT_TIMER].et.et_flags = ET_FLAGS_ONESHOT;
sc->st[DEFAULT_TIMER].et.et_quality = 1000;
sc->st[DEFAULT_TIMER].et.et_frequency = sc->sysclk_freq;
diff --git a/sys/arm/conf/BEAGLEBONE b/sys/arm/conf/BEAGLEBONE
index a0ca1b6..12b8290 100644
--- a/sys/arm/conf/BEAGLEBONE
+++ b/sys/arm/conf/BEAGLEBONE
@@ -26,7 +26,7 @@ ident BEAGLEBONE
include "std.armv6"
include "../ti/am335x/std.am335x"
-makeoptions MODULES_EXTRA="dtb/am335x"
+makeoptions MODULES_EXTRA="dtb/am335x am335x_dmtpps"
# DTrace support
options KDTRACE_HOOKS # Kernel DTrace hooks
@@ -77,6 +77,7 @@ device ti_i2c
device am335x_pmic # AM335x Power Management IC (TPC65217)
device am335x_rtc # RTC support (power management only)
+#device am335x_dmtpps		# Pulse Per Second capture driver
# Console and misc
device uart
diff --git a/sys/arm/include/armreg.h b/sys/arm/include/armreg.h
index 9358703..a300ddf 100644
--- a/sys/arm/include/armreg.h
+++ b/sys/arm/include/armreg.h
@@ -133,6 +133,7 @@
#define CPU_ID_CORTEXA9R1 0x411fc090
#define CPU_ID_CORTEXA9R2 0x412fc090
#define CPU_ID_CORTEXA9R3 0x413fc090
+#define CPU_ID_CORTEXA9R4 0x414fc090
#define CPU_ID_CORTEXA12R0 0x410fc0d0
#define CPU_ID_CORTEXA15R0 0x410fc0f0
#define CPU_ID_CORTEXA15R1 0x411fc0f0
diff --git a/sys/arm/ti/am335x/am335x_dmtpps.c b/sys/arm/ti/am335x/am335x_dmtpps.c
new file mode 100644
index 0000000..08b4104
--- /dev/null
+++ b/sys/arm/ti/am335x/am335x_dmtpps.c
@@ -0,0 +1,549 @@
+/*-
+ * Copyright (c) 2015 Ian Lepore <ian@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * AM335x PPS driver using DMTimer capture.
+ *
+ * Note that this PPS driver does not use an interrupt. Instead it uses the
+ * hardware's ability to latch the timer's count register in response to a
+ * signal on an IO pin. Each of timers 4-7 have an associated pin, and this
+ * code allows any one of those to be used.
+ *
+ * The timecounter routines in kern_tc.c call the pps poll routine periodically
+ * to see if a new counter value has been latched. When a new value has been
+ * latched, the only processing done in the poll routine is to capture the
+ * current set of timecounter timehands (done with pps_capture()) and the
+ * latched value from the timer. The remaining work (done by pps_event() while
+ * holding a mutex) is scheduled to be done later in a non-interrupt context.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/timepps.h>
+#include <sys/timetc.h>
+#include <machine/bus.h>
+
+#include <dev/ofw/openfirm.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+#include <arm/ti/ti_prcm.h>
+#include <arm/ti/ti_hwmods.h>
+#include <arm/ti/ti_pinmux.h>
+#include <arm/ti/am335x/am335x_scm_padconf.h>
+
+#include "am335x_dmtreg.h"
+
+#define PPS_CDEV_NAME "dmtpps"
+
+struct dmtpps_softc {
+ device_t dev;
+ int mem_rid;
+ struct resource * mem_res;
+ int tmr_num; /* N from hwmod str "timerN" */
+ char tmr_name[12]; /* "DMTimerN" */
+ uint32_t tclr; /* Cached TCLR register. */
+ struct timecounter tc;
+ int pps_curmode; /* Edge mode now set in hw. */
+ struct task pps_task; /* For pps_event handling. */
+ struct cdev * pps_cdev;
+ struct pps_state pps_state;
+ struct mtx pps_mtx;
+};
+
+static int dmtpps_tmr_num; /* Set by probe() */
+
+/* List of compatible strings for FDT tree */
+static struct ofw_compat_data compat_data[] = {
+ {"ti,am335x-timer", 1},
+ {"ti,am335x-timer-1ms", 1},
+ {NULL, 0},
+};
+
+/*
+ * A table relating pad names to the hardware timer number they can be mux'd to.
+ */
+struct padinfo {
+ char * ballname;
+ int tmr_num;
+};
+static struct padinfo dmtpps_padinfo[] = {
+ {"GPMC_ADVn_ALE", 4},
+ {"I2C0_SDA", 4},
+ {"MII1_TX_EN", 4},
+ {"XDMA_EVENT_INTR0", 4},
+ {"GPMC_BEn0_CLE", 5},
+ {"MDC", 5},
+ {"MMC0_DAT3", 5},
+ {"UART1_RTSn", 5},
+ {"GPMC_WEn", 6},
+ {"MDIO", 6},
+ {"MMC0_DAT2", 6},
+ {"UART1_CTSn", 6},
+ {"GPMC_OEn_REn", 7},
+ {"I2C0_SCL", 7},
+ {"UART0_CTSn", 7},
+ {"XDMA_EVENT_INTR1", 7},
+ {NULL, 0}
+};
+
+/*
+ * This is either brilliantly user-friendly, or utterly lame...
+ *
+ * The am335x chip is used on the popular Beaglebone boards. Those boards have
+ * pins for all four capture-capable timers available on the P8 header. Allow
+ * users to configure the input pin by giving the name of the header pin.
+ */
+struct nicknames {
+ const char * nick;
+ const char * name;
+};
+static struct nicknames dmtpps_pin_nicks[] = {
+ {"P8-7", "GPMC_ADVn_ALE"},
+ {"P8-9", "GPMC_BEn0_CLE"},
+ {"P8-10", "GPMC_WEn"},
+	{"P8-8",  "GPMC_OEn_REn"},
+ {NULL, NULL}
+};
+
+#define DMTIMER_READ4(sc, reg) bus_read_4((sc)->mem_res, (reg))
+#define DMTIMER_WRITE4(sc, reg, val) bus_write_4((sc)->mem_res, (reg), (val))
+
+/*
+ * Translate a short friendly case-insensitive name to its canonical name.
+ */
+static const char *
+dmtpps_translate_nickname(const char *nick)
+{
+ struct nicknames *nn;
+
+ for (nn = dmtpps_pin_nicks; nn->nick != NULL; nn++)
+ if (strcasecmp(nick, nn->nick) == 0)
+			return (nn->name);
+ return (nick);
+}
+
+/*
+ * See if our tunable is set to the name of an input pin.  If not, that's
+ * not an error; return 0.  If so, try to configure that pin as a timer
+ * capture input.  If that works, we have our timer unit number; if it
+ * fails, that IS an error, so return -1.
+ */
+static int
+dmtpps_find_tmr_num_by_tunable(void)
+{
+ struct padinfo *pi;
+ char iname[20];
+ char muxmode[12];
+ const char * ballname;
+ int err;
+
+ if (!TUNABLE_STR_FETCH("hw.am335x_dmtpps.input", iname, sizeof(iname)))
+ return (0);
+ ballname = dmtpps_translate_nickname(iname);
+ for (pi = dmtpps_padinfo; pi->ballname != NULL; pi++) {
+ if (strcmp(ballname, pi->ballname) != 0)
+ continue;
+ snprintf(muxmode, sizeof(muxmode), "timer%d", pi->tmr_num);
+ err = ti_pinmux_padconf_set(pi->ballname, muxmode,
+ PADCONF_INPUT);
+ if (err != 0) {
+ printf("am335x_dmtpps: unable to configure capture pin "
+ "for %s to input mode\n", muxmode);
+ return (-1);
+ } else if (bootverbose) {
+ printf("am335x_dmtpps: configured pin %s as input "
+ "for %s\n", iname, muxmode);
+ }
+ return (pi->tmr_num);
+ }
+
+	/* An invalid name in the tunable is an error. */
+ printf("am335x_dmtpps: unknown pin name '%s'\n", iname);
+ return (-1);
+}
+
+/*
+ * Ask the pinmux driver whether any pin has been configured as a TIMER4..TIMER7
+ * input pin. If so, return the timer number; if not, return 0.
+ */
+static int
+dmtpps_find_tmr_num_by_padconf(void)
+{
+ int err;
+ unsigned int padstate;
+ const char * padmux;
+ struct padinfo *pi;
+ char muxmode[12];
+
+ for (pi = dmtpps_padinfo; pi->ballname != NULL; pi++) {
+ err = ti_pinmux_padconf_get(pi->ballname, &padmux, &padstate);
+ snprintf(muxmode, sizeof(muxmode), "timer%d", pi->tmr_num);
+ if (err == 0 && (padstate & RXACTIVE) != 0 &&
+ strcmp(muxmode, padmux) == 0)
+ return (pi->tmr_num);
+ }
+ /* Nothing found, not an error. */
+ return (0);
+}
+
+/*
+ * Figure out which hardware timer number to use based on input pin
+ * configuration. This is done just once, the first time probe() runs.
+ */
+static int
+dmtpps_find_tmr_num(void)
+{
+ int tmr_num;
+
+ if ((tmr_num = dmtpps_find_tmr_num_by_tunable()) == 0)
+ tmr_num = dmtpps_find_tmr_num_by_padconf();
+
+ if (tmr_num <= 0) {
+ printf("am335x_dmtpps: PPS driver not enabled: unable to find "
+ "or configure a capture input pin\n");
+ tmr_num = -1; /* Must return non-zero to prevent re-probing. */
+ }
+ return (tmr_num);
+}
+
+static void
+dmtpps_set_hw_capture(struct dmtpps_softc *sc, bool force_off)
+{
+ int newmode;
+
+ if (force_off)
+ newmode = 0;
+ else
+ newmode = sc->pps_state.ppsparam.mode & PPS_CAPTUREASSERT;
+
+ if (newmode == sc->pps_curmode)
+ return;
+ sc->pps_curmode = newmode;
+
+ if (newmode == PPS_CAPTUREASSERT)
+ sc->tclr |= DMT_TCLR_CAPTRAN_LOHI;
+ else
+ sc->tclr &= ~DMT_TCLR_CAPTRAN_MASK;
+ DMTIMER_WRITE4(sc, DMT_TCLR, sc->tclr);
+}
+
+static unsigned
+dmtpps_get_timecount(struct timecounter *tc)
+{
+ struct dmtpps_softc *sc;
+
+ sc = tc->tc_priv;
+
+ return (DMTIMER_READ4(sc, DMT_TCRR));
+}
+
+static void
+dmtpps_poll(struct timecounter *tc)
+{
+ struct dmtpps_softc *sc;
+
+ sc = tc->tc_priv;
+
+ /*
+	 * If a new value has been latched, we've got a PPS event.  Capture the
+ * timecounter data, then override the capcount field (pps_capture()
+ * populates it from the current DMT_TCRR register) with the latched
+ * value from the TCAR1 register.
+ *
+ * There is no locking here, by design. pps_capture() writes into an
+ * area of struct pps_state which is read only by pps_event(). The
+ * synchronization of access to that area is temporal rather than
+ * interlock based... we write in this routine and trigger the task that
+ * will read the data, so no simultaneous access can occur.
+ *
+ * Note that we don't have the TCAR interrupt enabled, but the hardware
+ * still provides the status bits in the "RAW" status register even when
+ * they're masked from generating an irq. However, when clearing the
+ * TCAR status to re-arm the capture for the next second, we have to
+ * write to the IRQ status register, not the RAW register. Quirky.
+ */
+ if (DMTIMER_READ4(sc, DMT_IRQSTATUS_RAW) & DMT_IRQ_TCAR) {
+ pps_capture(&sc->pps_state);
+ sc->pps_state.capcount = DMTIMER_READ4(sc, DMT_TCAR1);
+ DMTIMER_WRITE4(sc, DMT_IRQSTATUS, DMT_IRQ_TCAR);
+ taskqueue_enqueue_fast(taskqueue_fast, &sc->pps_task);
+ }
+}
+
+static void
+dmtpps_event(void *arg, int pending)
+{
+ struct dmtpps_softc *sc;
+
+ sc = arg;
+
+	/*
+	 * This is the task function that gets enqueued by dmtpps_poll().  Once
+	 * the time has been captured by the timecounter polling code, which
+	 * runs in
+ * primary interrupt context, the remaining (more expensive) work to
+ * process the event is done later in a threaded context.
+ *
+ * Here there is an interlock that protects the event data in struct
+ * pps_state. That data can be accessed at any time from userland via
+ * ioctl() calls so we must ensure that there is no read access to
+ * partially updated data while pps_event() does its work.
+ */
+ mtx_lock(&sc->pps_mtx);
+ pps_event(&sc->pps_state, PPS_CAPTUREASSERT);
+ mtx_unlock(&sc->pps_mtx);
+}
+
+static int
+dmtpps_open(struct cdev *dev, int flags, int fmt,
+ struct thread *td)
+{
+ struct dmtpps_softc *sc;
+
+ sc = dev->si_drv1;
+
+ /*
+ * Begin polling for pps and enable capture in the hardware whenever the
+ * device is open. Doing this stuff again is harmless if this isn't the
+ * first open.
+ */
+ sc->tc.tc_poll_pps = dmtpps_poll;
+ dmtpps_set_hw_capture(sc, false);
+
+	return (0);
+}
+
+static int
+dmtpps_close(struct cdev *dev, int flags, int fmt,
+ struct thread *td)
+{
+ struct dmtpps_softc *sc;
+
+ sc = dev->si_drv1;
+
+ /*
+ * Stop polling and disable capture on last close. Use the force-off
+ * flag to override the configured mode and turn off the hardware.
+ */
+ sc->tc.tc_poll_pps = NULL;
+ dmtpps_set_hw_capture(sc, true);
+
+	return (0);
+}
+
+static int
+dmtpps_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
+ int flags, struct thread *td)
+{
+ struct dmtpps_softc *sc;
+ int err;
+
+ sc = dev->si_drv1;
+
+ /* Let the kernel do the heavy lifting for ioctl. */
+ mtx_lock(&sc->pps_mtx);
+ err = pps_ioctl(cmd, data, &sc->pps_state);
+ mtx_unlock(&sc->pps_mtx);
+ if (err != 0)
+ return (err);
+
+ /*
+ * The capture mode could have changed, set the hardware to whatever
+ * mode is now current. Effectively a no-op if nothing changed.
+ */
+ dmtpps_set_hw_capture(sc, false);
+
+ return (err);
+}
+
+static struct cdevsw dmtpps_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = dmtpps_open,
+ .d_close = dmtpps_close,
+ .d_ioctl = dmtpps_ioctl,
+ .d_name = PPS_CDEV_NAME,
+};
+
+static int
+dmtpps_probe(device_t dev)
+{
+ char strbuf[64];
+ int tmr_num;
+
+ if (!ofw_bus_status_okay(dev))
+ return (ENXIO);
+
+ if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
+ return (ENXIO);
+
+ /*
+ * If we haven't chosen which hardware timer to use yet, go do that now.
+ * We need to know that to decide whether to return success for this
+ * hardware timer instance or not.
+ */
+ if (dmtpps_tmr_num == 0)
+ dmtpps_tmr_num = dmtpps_find_tmr_num();
+
+ /*
+ * Figure out which hardware timer is being probed and see if it matches
+ * the configured timer number determined earlier.
+ */
+ tmr_num = ti_hwmods_get_unit(dev, "timer");
+ if (dmtpps_tmr_num != tmr_num)
+ return (ENXIO);
+
+ snprintf(strbuf, sizeof(strbuf), "AM335x PPS-Capture DMTimer%d",
+ tmr_num);
+ device_set_desc_copy(dev, strbuf);
+
+	return (BUS_PROBE_DEFAULT);
+}
+
+static int
+dmtpps_attach(device_t dev)
+{
+ struct dmtpps_softc *sc;
+ clk_ident_t timer_id;
+ int err, sysclk_freq;
+
+ sc = device_get_softc(dev);
+ sc->dev = dev;
+
+ /* Get the base clock frequency. */
+ err = ti_prcm_clk_get_source_freq(SYS_CLK, &sysclk_freq);
+
+ /* Enable clocks and power on the device. */
+ if ((timer_id = ti_hwmods_get_clock(dev)) == INVALID_CLK_IDENT)
+ return (ENXIO);
+ if ((err = ti_prcm_clk_set_source(timer_id, SYSCLK_CLK)) != 0)
+ return (err);
+ if ((err = ti_prcm_clk_enable(timer_id)) != 0)
+ return (err);
+
+ /* Request the memory resources. */
+ sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
+ &sc->mem_rid, RF_ACTIVE);
+ if (sc->mem_res == NULL) {
+ return (ENXIO);
+ }
+
+ /* Figure out which hardware timer this is and set the name string. */
+ sc->tmr_num = ti_hwmods_get_unit(dev, "timer");
+ snprintf(sc->tmr_name, sizeof(sc->tmr_name), "DMTimer%d", sc->tmr_num);
+
+ /* Set up timecounter hardware, start it. */
+ DMTIMER_WRITE4(sc, DMT_TSICR, DMT_TSICR_RESET);
+ while (DMTIMER_READ4(sc, DMT_TIOCP_CFG) & DMT_TIOCP_RESET)
+ continue;
+
+ sc->tclr |= DMT_TCLR_START | DMT_TCLR_AUTOLOAD;
+ DMTIMER_WRITE4(sc, DMT_TLDR, 0);
+ DMTIMER_WRITE4(sc, DMT_TCRR, 0);
+ DMTIMER_WRITE4(sc, DMT_TCLR, sc->tclr);
+
+ /* Register the timecounter. */
+ sc->tc.tc_name = sc->tmr_name;
+ sc->tc.tc_get_timecount = dmtpps_get_timecount;
+ sc->tc.tc_counter_mask = ~0u;
+ sc->tc.tc_frequency = sysclk_freq;
+ sc->tc.tc_quality = 1000;
+ sc->tc.tc_priv = sc;
+
+ tc_init(&sc->tc);
+
+ /*
+ * Indicate our PPS capabilities. Have the kernel init its part of the
+ * pps_state struct and add its capabilities.
+ *
+ * While the hardware has a mode to capture each edge, it's not clear we
+ * can use it that way, because there's only a single interrupt/status
+ * bit to say something was captured, but not which edge it was. For
+ * now, just say we can only capture assert events (the positive-going
+ * edge of the pulse).
+ */
+ mtx_init(&sc->pps_mtx, "dmtpps", NULL, MTX_DEF);
+ sc->pps_state.ppscap = PPS_CAPTUREASSERT;
+ sc->pps_state.driver_abi = PPS_ABI_VERSION;
+ sc->pps_state.driver_mtx = &sc->pps_mtx;
+ pps_init_abi(&sc->pps_state);
+
+ /*
+ * Init the task that does deferred pps_event() processing after
+ * the polling routine has captured a pps pulse time.
+ */
+ TASK_INIT(&sc->pps_task, 0, dmtpps_event, sc);
+
+ /* Create the PPS cdev. */
+ sc->pps_cdev = make_dev(&dmtpps_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ PPS_CDEV_NAME);
+ sc->pps_cdev->si_drv1 = sc;
+
+ if (bootverbose)
+ device_printf(sc->dev, "Using %s for PPS device /dev/%s\n",
+ sc->tmr_name, PPS_CDEV_NAME);
+
+ return (0);
+}
+
+static int
+dmtpps_detach(device_t dev)
+{
+
+ /*
+ * There is no way to remove a timecounter once it has been registered,
+ * even if it's not in use, so we can never detach. If we were
+ * dynamically loaded as a module this will prevent unloading.
+ */
+ return (EBUSY);
+}
+
+static device_method_t dmtpps_methods[] = {
+ DEVMETHOD(device_probe, dmtpps_probe),
+ DEVMETHOD(device_attach, dmtpps_attach),
+ DEVMETHOD(device_detach, dmtpps_detach),
+ { 0, 0 }
+};
+
+static driver_t dmtpps_driver = {
+ "am335x_dmtpps",
+ dmtpps_methods,
+ sizeof(struct dmtpps_softc),
+};
+
+static devclass_t dmtpps_devclass;
+
+DRIVER_MODULE(am335x_dmtpps, simplebus, dmtpps_driver, dmtpps_devclass, 0, 0);
+MODULE_DEPEND(am335x_dmtpps, am335x_prcm, 1, 1, 1);
+
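
Once the module is loaded (and, if necessary, a capture pin chosen via the hw.am335x_dmtpps.input tunable, e.g. a Beaglebone header pin name such as "P8-7"), the driver appears as /dev/dmtpps and speaks the standard RFC 2783 PPS API. A hypothetical userland consumer, not part of this commit:

#include <sys/timepps.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	pps_handle_t h;
	pps_info_t info;
	pps_params_t params;
	struct timespec timeout = { 3, 0 };
	int fd;

	if ((fd = open("/dev/dmtpps", O_RDWR)) == -1)
		err(1, "open");
	if (time_pps_create(fd, &h) == -1)
		err(1, "time_pps_create");
	if (time_pps_getparams(h, &params) == -1)
		err(1, "time_pps_getparams");
	params.mode |= PPS_CAPTUREASSERT;	/* all this driver offers */
	if (time_pps_setparams(h, &params) == -1)
		err(1, "time_pps_setparams");
	for (;;) {
		if (time_pps_fetch(h, PPS_TSFMT_TSPEC, &info, &timeout) == -1)
			err(1, "time_pps_fetch");
		printf("assert %lu at %jd.%09ld\n",
		    (unsigned long)info.assert_sequence,
		    (intmax_t)info.assert_timestamp.tv_sec,
		    info.assert_timestamp.tv_nsec);
	}
}
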
diff --git a/sys/arm/ti/am335x/files.am335x b/sys/arm/ti/am335x/files.am335x
index 7293fd0..d0193e8 100644
--- a/sys/arm/ti/am335x/files.am335x
+++ b/sys/arm/ti/am335x/files.am335x
@@ -3,6 +3,7 @@
arm/ti/aintc.c standard
arm/ti/am335x/am335x_dmtimer.c standard
+arm/ti/am335x/am335x_dmtpps.c optional am335x_dmtpps
arm/ti/am335x/am335x_gpio.c optional gpio
arm/ti/am335x/am335x_lcd.c optional sc | vt
arm/ti/am335x/am335x_lcd_syscons.c optional sc
diff --git a/sys/arm/versatile/sp804.c b/sys/arm/versatile/sp804.c
index a69c018..de05700 100644
--- a/sys/arm/versatile/sp804.c
+++ b/sys/arm/versatile/sp804.c
@@ -244,7 +244,7 @@ sp804_timer_attach(device_t dev)
* Timer 1, timecounter
*/
sc->tc.tc_frequency = sc->sysclk_freq;
- sc->tc.tc_name = "SP804 Time Counter";
+ sc->tc.tc_name = "SP804-1";
sc->tc.tc_get_timecount = sp804_timer_tc_get_timecount;
sc->tc.tc_poll_pps = NULL;
sc->tc.tc_counter_mask = ~0u;
@@ -263,9 +263,7 @@ sp804_timer_attach(device_t dev)
* Timer 2, event timer
*/
sc->et_enabled = 0;
- sc->et.et_name = malloc(64, M_DEVBUF, M_NOWAIT | M_ZERO);
- sprintf(sc->et.et_name, "SP804 Event Timer %d",
- device_get_unit(dev));
+ sc->et.et_name = "SP804-2";
sc->et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT;
sc->et.et_quality = 1000;
sc->et.et_frequency = sc->sysclk_freq / DEFAULT_DIVISOR;
diff --git a/sys/arm64/arm64/bus_machdep.c b/sys/arm64/arm64/bus_machdep.c
index 25a675e..f6df4a1 100644
--- a/sys/arm64/arm64/bus_machdep.c
+++ b/sys/arm64/arm64/bus_machdep.c
@@ -49,6 +49,15 @@ void generic_bs_rm_4(void *, bus_space_handle_t, bus_size_t, uint32_t *,
void generic_bs_rm_8(void *, bus_space_handle_t, bus_size_t, uint64_t *,
bus_size_t);
+void generic_bs_rr_1(void *, bus_space_handle_t, bus_size_t, uint8_t *,
+ bus_size_t);
+void generic_bs_rr_2(void *, bus_space_handle_t, bus_size_t, uint16_t *,
+ bus_size_t);
+void generic_bs_rr_4(void *, bus_space_handle_t, bus_size_t, uint32_t *,
+ bus_size_t);
+void generic_bs_rr_8(void *, bus_space_handle_t, bus_size_t, uint64_t *,
+ bus_size_t);
+
void generic_bs_w_1(void *, bus_space_handle_t, bus_size_t, uint8_t);
void generic_bs_w_2(void *, bus_space_handle_t, bus_size_t, uint16_t);
void generic_bs_w_4(void *, bus_space_handle_t, bus_size_t, uint32_t);
@@ -63,6 +72,15 @@ void generic_bs_wm_4(void *, bus_space_handle_t, bus_size_t, const uint32_t *,
void generic_bs_wm_8(void *, bus_space_handle_t, bus_size_t, const uint64_t *,
bus_size_t);
+void generic_bs_wr_1(void *, bus_space_handle_t, bus_size_t, const uint8_t *,
+ bus_size_t);
+void generic_bs_wr_2(void *, bus_space_handle_t, bus_size_t, const uint16_t *,
+ bus_size_t);
+void generic_bs_wr_4(void *, bus_space_handle_t, bus_size_t, const uint32_t *,
+ bus_size_t);
+void generic_bs_wr_8(void *, bus_space_handle_t, bus_size_t, const uint64_t *,
+ bus_size_t);
+
static int
generic_bs_map(void *t, bus_addr_t bpa, bus_size_t size, int flags,
bus_space_handle_t *bshp)
@@ -126,6 +144,12 @@ struct bus_space memmap_bus = {
.bs_rm_4 = generic_bs_rm_4,
.bs_rm_8 = generic_bs_rm_8,
+ /* read region */
+ .bs_rr_1 = generic_bs_rr_1,
+ .bs_rr_2 = generic_bs_rr_2,
+ .bs_rr_4 = generic_bs_rr_4,
+ .bs_rr_8 = generic_bs_rr_8,
+
/* write single */
.bs_w_1 = generic_bs_w_1,
.bs_w_2 = generic_bs_w_2,
@@ -139,10 +163,10 @@ struct bus_space memmap_bus = {
.bs_wm_8 = generic_bs_wm_8,
/* write region */
- .bs_wr_1 = NULL,
- .bs_wr_2 = NULL,
- .bs_wr_4 = NULL,
- .bs_wr_8 = NULL,
+ .bs_wr_1 = generic_bs_wr_1,
+ .bs_wr_2 = generic_bs_wr_2,
+ .bs_wr_4 = generic_bs_wr_4,
+ .bs_wr_8 = generic_bs_wr_8,
/* set multiple */
.bs_sm_1 = NULL,
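
With these handlers wired up, arm64 drivers can use the standard bus_space region KPI to move a block of adjacent device registers in a single call; each routine in the assembly below post-increments both the device offset and the kernel buffer. A hedged sketch of a caller, where the softc layout and the REG_WINDOW offset are assumptions for illustration:

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/bus.h>

#define	REG_WINDOW	0x100	/* hypothetical device register block */

struct foo_softc {
	struct resource	*res;	/* mapped SYS_RES_MEMORY resource */
};

static void
foo_read_window(struct foo_softc *sc, uint32_t *buf, bus_size_t count)
{
	/* Reads 'count' 32-bit words from successive device offsets. */
	bus_space_read_region_4(rman_get_bustag(sc->res),
	    rman_get_bushandle(sc->res), REG_WINDOW, buf, count);
}
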
diff --git a/sys/arm64/arm64/bus_space_asm.S b/sys/arm64/arm64/bus_space_asm.S
index 20d4128..d919bd5 100644
--- a/sys/arm64/arm64/bus_space_asm.S
+++ b/sys/arm64/arm64/bus_space_asm.S
@@ -133,6 +133,90 @@ ENTRY(generic_bs_rm_8)
2: ret
END(generic_bs_rm_8)
+ENTRY(generic_bs_rr_1)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldrb w1, [x0], #1
+ strb w1, [x3], #1
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_1)
+
+ENTRY(generic_bs_rr_2)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldrh w1, [x0], #2
+ strh w1, [x3], #2
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_2)
+
+ENTRY(generic_bs_rr_4)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldr w1, [x0], #4
+ str w1, [x3], #4
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_4)
+
+ENTRY(generic_bs_rr_8)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldr x1, [x0], #8
+ str x1, [x3], #8
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_8)
+
ENTRY(generic_bs_w_1)
strb w3, [x1, x2]
@@ -233,3 +317,83 @@ ENTRY(generic_bs_wm_8)
2: ret
END(generic_bs_wm_8)
+
+ENTRY(generic_bs_wr_1)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldrb w1, [x3], #1
+ strb w1, [x0], #1
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_1)
+
+ENTRY(generic_bs_wr_2)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldrh w1, [x3], #2
+ strh w1, [x0], #2
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_2)
+
+ENTRY(generic_bs_wr_4)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldr w1, [x3], #4
+ str w1, [x0], #4
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_4)
+
+ENTRY(generic_bs_wr_8)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldr x1, [x3], #8
+ str x1, [x0], #8
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_8)
diff --git a/sys/arm64/arm64/exception.S b/sys/arm64/arm64/exception.S
index 4f457da..b05941f 100644
--- a/sys/arm64/arm64/exception.S
+++ b/sys/arm64/arm64/exception.S
@@ -104,7 +104,7 @@ __FBSDID("$FreeBSD$");
/* Read the current thread flags */
1: ldr x1, [x18, #PC_CURTHREAD] /* Load curthread */
- ldr x2, [x1, #TD_FLAGS]! /* TODO: No need for the ! but clang fails without it */
+ ldr x2, [x1, #TD_FLAGS]
/* Check if we have either bits set */
mov x3, #((TDF_ASTPENDING|TDF_NEEDRESCHED) >> 8)
diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c
index 41e92a6..fa9aaa8 100644
--- a/sys/arm64/arm64/trap.c
+++ b/sys/arm64/arm64/trap.c
@@ -229,6 +229,21 @@ data_abort(struct trapframe *frame, uint64_t esr, int lower)
userret(td, frame);
}
+static void
+print_registers(struct trapframe *frame)
+{
+ u_int reg;
+
+ for (reg = 0; reg < 31; reg++) {
+ printf(" %sx%d: %16lx\n", (reg < 10) ? " " : "", reg,
+ frame->tf_x[reg]);
+ }
+ printf(" sp: %16lx\n", frame->tf_sp);
+ printf(" lr: %16lx\n", frame->tf_lr);
+ printf(" elr: %16lx\n", frame->tf_elr);
+ printf("spsr: %16lx\n", frame->tf_spsr);
+}
+
void
do_el1h_sync(struct trapframe *frame)
{
@@ -265,6 +280,7 @@ do_el1h_sync(struct trapframe *frame)
switch(exception) {
case EXCP_FP_SIMD:
case EXCP_TRAP_FP:
+ print_registers(frame);
panic("VFP exception in the kernel");
case EXCP_DATA_ABORT:
data_abort(frame, esr, 0);
@@ -286,11 +302,30 @@ do_el1h_sync(struct trapframe *frame)
#endif
break;
default:
+ print_registers(frame);
panic("Unknown kernel exception %x esr_el1 %lx\n", exception,
esr);
}
}
+/*
+ * We get EXCP_UNKNOWN from QEMU when executing zeroed memory. For now turn
+ * this into a SIGILL.
+ */
+static void
+el0_excp_unknown(struct trapframe *frame)
+{
+ struct thread *td;
+ uint64_t far;
+
+ td = curthread;
+ far = READ_SPECIALREG(far_el1);
+ printf("el0 EXCP_UNKNOWN exception\n");
+ print_registers(frame);
+ call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)far);
+ userret(td, frame);
+}
+
void
do_el0_sync(struct trapframe *frame)
{
@@ -332,7 +367,11 @@ do_el0_sync(struct trapframe *frame)
case EXCP_DATA_ABORT:
data_abort(frame, esr, 1);
break;
+ case EXCP_UNKNOWN:
+ el0_excp_unknown(frame);
+ break;
default:
+ print_registers(frame);
panic("Unknown userland exception %x esr_el1 %lx\n", exception,
esr);
}
diff --git a/sys/boot/kshim/bsd_kernel.h b/sys/boot/kshim/bsd_kernel.h
index 0e40fb0..aba8131 100644
--- a/sys/boot/kshim/bsd_kernel.h
+++ b/sys/boot/kshim/bsd_kernel.h
@@ -43,7 +43,8 @@
#define M_USBDEV 0
#define USB_PROC_MAX 3
#define USB_BUS_GIANT_PROC(bus) (usb_process + 2)
-#define USB_BUS_NON_GIANT_PROC(bus) (usb_process + 2)
+#define USB_BUS_NON_GIANT_BULK_PROC(bus) (usb_process + 2)
+#define USB_BUS_NON_GIANT_ISOC_PROC(bus) (usb_process + 2)
#define USB_BUS_EXPLORE_PROC(bus) (usb_process + 0)
#define USB_BUS_CONTROL_XFER_PROC(bus) (usb_process + 1)
#define SYSCTL_DECL(...)
diff --git a/sys/boot/uboot/fdt/uboot_fdt.c b/sys/boot/uboot/fdt/uboot_fdt.c
index 86f46e9..6b646f6 100644
--- a/sys/boot/uboot/fdt/uboot_fdt.c
+++ b/sys/boot/uboot/fdt/uboot_fdt.c
@@ -69,10 +69,11 @@ fdt_platform_load_dtb(void)
}
/*
- * If the U-boot environment contains a variable giving the name of a
- * file, use it if we can load and validate it.
+	 * Try to get the FDT filename from the loader environment first, then
+	 * fall back to the U-Boot environment.
*/
- s = ub_env_get("fdtfile");
+ s = getenv("fdt_file");
+ if (s == NULL)
+ s = ub_env_get("fdtfile");
if (s == NULL)
s = ub_env_get("fdt_file");
if (s != NULL && *s != '\0') {
diff --git a/sys/cam/ctl/README.ctl.txt b/sys/cam/ctl/README.ctl.txt
index a6de201..d4dc938 100644
--- a/sys/cam/ctl/README.ctl.txt
+++ b/sys/cam/ctl/README.ctl.txt
@@ -366,16 +366,6 @@ This is a CTL frontend port that is also a CAM SIM. The idea is that this
frontend allows for using CTL without any target-capable hardware. So any
LUNs you create in CTL are visible via this port.
-
-ctl_frontend_internal.c
-ctl_frontend_internal.h:
------------------------
-
-This is a frontend port written for Copan to do some system-specific tasks
-that required sending commands into CTL from inside the kernel. This isn't
-entirely relevant to FreeBSD in general, but can perhaps be repurposed or
-removed later.
-
ctl_ha.h:
--------
diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
index bdf5e6a..9141fc8 100644
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -72,7 +72,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
@@ -383,18 +382,7 @@ static int ctl_init(void);
void ctl_shutdown(void);
static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td);
static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td);
-static void ctl_ioctl_online(void *arg);
-static void ctl_ioctl_offline(void *arg);
-static int ctl_ioctl_lun_enable(void *arg, int lun_id);
-static int ctl_ioctl_lun_disable(void *arg, int lun_id);
-static int ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio);
static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio);
-static int ctl_ioctl_submit_wait(union ctl_io *io);
-static void ctl_ioctl_datamove(union ctl_io *io);
-static void ctl_ioctl_done(union ctl_io *io);
-static void ctl_ioctl_hard_startstop_callback(void *arg,
- struct cfi_metatask *metatask);
-static void ctl_ioctl_bbrread_callback(void *arg,struct cfi_metatask *metatask);
static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num,
struct ctl_ooa *ooa_hdr,
struct ctl_ooa_entry *kern_entries);
@@ -529,11 +517,6 @@ static moduledata_t ctl_moduledata = {
DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD);
MODULE_VERSION(ctl, 1);
-static struct ctl_frontend ioctl_frontend =
-{
- .name = "ioctl",
-};
-
#ifdef notyet
static void
ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc,
@@ -1064,7 +1047,6 @@ ctl_init(void)
{
struct ctl_softc *softc;
void *other_pool;
- struct ctl_port *port;
int i, error, retval;
//int isc_retval;
@@ -1189,32 +1171,6 @@ ctl_init(void)
return (error);
}
- /*
- * Initialize the ioctl front end.
- */
- ctl_frontend_register(&ioctl_frontend);
- port = &softc->ioctl_info.port;
- port->frontend = &ioctl_frontend;
- sprintf(softc->ioctl_info.port_name, "ioctl");
- port->port_type = CTL_PORT_IOCTL;
- port->num_requested_ctl_io = 100;
- port->port_name = softc->ioctl_info.port_name;
- port->port_online = ctl_ioctl_online;
- port->port_offline = ctl_ioctl_offline;
- port->onoff_arg = &softc->ioctl_info;
- port->lun_enable = ctl_ioctl_lun_enable;
- port->lun_disable = ctl_ioctl_lun_disable;
- port->targ_lun_arg = &softc->ioctl_info;
- port->fe_datamove = ctl_ioctl_datamove;
- port->fe_done = ctl_ioctl_done;
- port->max_targets = 15;
- port->max_target_id = 15;
-
- if (ctl_port_register(&softc->ioctl_info.port) != 0) {
- printf("ctl: ioctl front end registration failed, will "
- "continue anyway\n");
- }
-
SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "ha_state", CTLTYPE_INT | CTLFLAG_RWTUN,
softc, 0, ctl_ha_state_sysctl, "I", "HA state for this head");
@@ -1238,9 +1194,6 @@ ctl_shutdown(void)
softc = (struct ctl_softc *)control_softc;
- if (ctl_port_deregister(&softc->ioctl_info.port) != 0)
- printf("ctl: ioctl front end deregistration failed\n");
-
mtx_lock(&softc->ctl_lock);
/*
@@ -1253,8 +1206,6 @@ ctl_shutdown(void)
mtx_unlock(&softc->ctl_lock);
- ctl_frontend_deregister(&ioctl_frontend);
-
#if 0
ctl_shutdown_thread(softc->work_thread);
mtx_destroy(&softc->queue_lock);
@@ -1426,26 +1377,6 @@ ctl_port_list(struct ctl_port_entry *entries, int num_entries_alloced,
return (retval);
}
-static void
-ctl_ioctl_online(void *arg)
-{
- struct ctl_ioctl_info *ioctl_info;
-
- ioctl_info = (struct ctl_ioctl_info *)arg;
-
- ioctl_info->flags |= CTL_IOCTL_FLAG_ENABLED;
-}
-
-static void
-ctl_ioctl_offline(void *arg)
-{
- struct ctl_ioctl_info *ioctl_info;
-
- ioctl_info = (struct ctl_ioctl_info *)arg;
-
- ioctl_info->flags &= ~CTL_IOCTL_FLAG_ENABLED;
-}
-
/*
* Remove an initiator by port number and initiator ID.
* Returns 0 for success, -1 for failure.
@@ -1641,181 +1572,6 @@ ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf)
}
}
-static int
-ctl_ioctl_lun_enable(void *arg, int lun_id)
-{
- return (0);
-}
-
-static int
-ctl_ioctl_lun_disable(void *arg, int lun_id)
-{
- return (0);
-}
-
-/*
- * Data movement routine for the CTL ioctl frontend port.
- */
-static int
-ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio)
-{
- struct ctl_sg_entry *ext_sglist, *kern_sglist;
- struct ctl_sg_entry ext_entry, kern_entry;
- int ext_sglen, ext_sg_entries, kern_sg_entries;
- int ext_sg_start, ext_offset;
- int len_to_copy, len_copied;
- int kern_watermark, ext_watermark;
- int ext_sglist_malloced;
- int i, j;
-
- ext_sglist_malloced = 0;
- ext_sg_start = 0;
- ext_offset = 0;
-
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n"));
-
- /*
- * If this flag is set, fake the data transfer.
- */
- if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) {
- ctsio->ext_data_filled = ctsio->ext_data_len;
- goto bailout;
- }
-
- /*
- * To simplify things here, if we have a single buffer, stick it in
- * a S/G entry and just make it a single entry S/G list.
- */
- if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
- int len_seen;
-
- ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
-
- ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL,
- M_WAITOK);
- ext_sglist_malloced = 1;
- if (copyin(ctsio->ext_data_ptr, ext_sglist,
- ext_sglen) != 0) {
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/ 0);
- goto bailout;
- }
- ext_sg_entries = ctsio->ext_sg_entries;
- len_seen = 0;
- for (i = 0; i < ext_sg_entries; i++) {
- if ((len_seen + ext_sglist[i].len) >=
- ctsio->ext_data_filled) {
- ext_sg_start = i;
- ext_offset = ctsio->ext_data_filled - len_seen;
- break;
- }
- len_seen += ext_sglist[i].len;
- }
- } else {
- ext_sglist = &ext_entry;
- ext_sglist->addr = ctsio->ext_data_ptr;
- ext_sglist->len = ctsio->ext_data_len;
- ext_sg_entries = 1;
- ext_sg_start = 0;
- ext_offset = ctsio->ext_data_filled;
- }
-
- if (ctsio->kern_sg_entries > 0) {
- kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
- kern_sg_entries = ctsio->kern_sg_entries;
- } else {
- kern_sglist = &kern_entry;
- kern_sglist->addr = ctsio->kern_data_ptr;
- kern_sglist->len = ctsio->kern_data_len;
- kern_sg_entries = 1;
- }
-
-
- kern_watermark = 0;
- ext_watermark = ext_offset;
- len_copied = 0;
- for (i = ext_sg_start, j = 0;
- i < ext_sg_entries && j < kern_sg_entries;) {
- uint8_t *ext_ptr, *kern_ptr;
-
- len_to_copy = MIN(ext_sglist[i].len - ext_watermark,
- kern_sglist[j].len - kern_watermark);
-
- ext_ptr = (uint8_t *)ext_sglist[i].addr;
- ext_ptr = ext_ptr + ext_watermark;
- if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
- /*
- * XXX KDM fix this!
- */
- panic("need to implement bus address support");
-#if 0
- kern_ptr = bus_to_virt(kern_sglist[j].addr);
-#endif
- } else
- kern_ptr = (uint8_t *)kern_sglist[j].addr;
- kern_ptr = kern_ptr + kern_watermark;
-
- kern_watermark += len_to_copy;
- ext_watermark += len_to_copy;
-
- if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
- CTL_FLAG_DATA_IN) {
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
- "bytes to user\n", len_to_copy));
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
- "to %p\n", kern_ptr, ext_ptr));
- if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) {
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/ 0);
- goto bailout;
- }
- } else {
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
- "bytes from user\n", len_to_copy));
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
- "to %p\n", ext_ptr, kern_ptr));
- if (copyin(ext_ptr, kern_ptr, len_to_copy)!= 0){
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/0);
- goto bailout;
- }
- }
-
- len_copied += len_to_copy;
-
- if (ext_sglist[i].len == ext_watermark) {
- i++;
- ext_watermark = 0;
- }
-
- if (kern_sglist[j].len == kern_watermark) {
- j++;
- kern_watermark = 0;
- }
- }
-
- ctsio->ext_data_filled += len_copied;
-
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, "
- "kern_sg_entries: %d\n", ext_sg_entries,
- kern_sg_entries));
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, "
- "kern_data_len = %d\n", ctsio->ext_data_len,
- ctsio->kern_data_len));
-
-
- /* XXX KDM set residual?? */
-bailout:
-
- if (ext_sglist_malloced != 0)
- free(ext_sglist, M_CTL);
-
- return (CTL_RETVAL_COMPLETE);
-}
-
/*
* Serialize a command that went down the "wrong" side, and so was sent to
* this controller for execution. The logic is a little different than the
@@ -1982,149 +1738,6 @@ ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio)
return (retval);
}
-static int
-ctl_ioctl_submit_wait(union ctl_io *io)
-{
- struct ctl_fe_ioctl_params params;
- ctl_fe_ioctl_state last_state;
- int done, retval;
-
- retval = 0;
-
- bzero(&params, sizeof(params));
-
- mtx_init(&params.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF);
- cv_init(&params.sem, "ctlioccv");
- params.state = CTL_IOCTL_INPROG;
- last_state = params.state;
-
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = &params;
-
- CTL_DEBUG_PRINT(("ctl_ioctl_submit_wait\n"));
-
- /* This shouldn't happen */
- if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE)
- return (retval);
-
- done = 0;
-
- do {
- mtx_lock(&params.ioctl_mtx);
- /*
- * Check the state here, and don't sleep if the state has
- * already changed (i.e. wakeup has already occured, but we
- * weren't waiting yet).
- */
- if (params.state == last_state) {
- /* XXX KDM cv_wait_sig instead? */
- cv_wait(&params.sem, &params.ioctl_mtx);
- }
- last_state = params.state;
-
- switch (params.state) {
- case CTL_IOCTL_INPROG:
- /* Why did we wake up? */
- /* XXX KDM error here? */
- mtx_unlock(&params.ioctl_mtx);
- break;
- case CTL_IOCTL_DATAMOVE:
- CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n"));
-
- /*
- * change last_state back to INPROG to avoid
- * deadlock on subsequent data moves.
- */
- params.state = last_state = CTL_IOCTL_INPROG;
-
- mtx_unlock(&params.ioctl_mtx);
- ctl_ioctl_do_datamove(&io->scsiio);
- /*
- * Note that in some cases, most notably writes,
- * this will queue the I/O and call us back later.
- * In other cases, generally reads, this routine
- * will immediately call back and wake us up,
- * probably using our own context.
- */
- io->scsiio.be_move_done(io);
- break;
- case CTL_IOCTL_DONE:
- mtx_unlock(&params.ioctl_mtx);
- CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n"));
- done = 1;
- break;
- default:
- mtx_unlock(&params.ioctl_mtx);
- /* XXX KDM error here? */
- break;
- }
- } while (done == 0);
-
- mtx_destroy(&params.ioctl_mtx);
- cv_destroy(&params.sem);
-
- return (CTL_RETVAL_COMPLETE);
-}
-
-static void
-ctl_ioctl_datamove(union ctl_io *io)
-{
- struct ctl_fe_ioctl_params *params;
-
- params = (struct ctl_fe_ioctl_params *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- mtx_lock(&params->ioctl_mtx);
- params->state = CTL_IOCTL_DATAMOVE;
- cv_broadcast(&params->sem);
- mtx_unlock(&params->ioctl_mtx);
-}
-
-static void
-ctl_ioctl_done(union ctl_io *io)
-{
- struct ctl_fe_ioctl_params *params;
-
- params = (struct ctl_fe_ioctl_params *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- mtx_lock(&params->ioctl_mtx);
- params->state = CTL_IOCTL_DONE;
- cv_broadcast(&params->sem);
- mtx_unlock(&params->ioctl_mtx);
-}
-
-static void
-ctl_ioctl_hard_startstop_callback(void *arg, struct cfi_metatask *metatask)
-{
- struct ctl_fe_ioctl_startstop_info *sd_info;
-
- sd_info = (struct ctl_fe_ioctl_startstop_info *)arg;
-
- sd_info->hs_info.status = metatask->status;
- sd_info->hs_info.total_luns = metatask->taskinfo.startstop.total_luns;
- sd_info->hs_info.luns_complete =
- metatask->taskinfo.startstop.luns_complete;
- sd_info->hs_info.luns_failed = metatask->taskinfo.startstop.luns_failed;
-
- cv_broadcast(&sd_info->sem);
-}
-
-static void
-ctl_ioctl_bbrread_callback(void *arg, struct cfi_metatask *metatask)
-{
- struct ctl_fe_ioctl_bbrread_info *fe_bbr_info;
-
- fe_bbr_info = (struct ctl_fe_ioctl_bbrread_info *)arg;
-
- mtx_lock(fe_bbr_info->lock);
- fe_bbr_info->bbr_info->status = metatask->status;
- fe_bbr_info->bbr_info->bbr_status = metatask->taskinfo.bbrread.status;
- fe_bbr_info->wakeup_done = 1;
- mtx_unlock(fe_bbr_info->lock);
-
- cv_broadcast(&fe_bbr_info->sem);
-}
-
/*
* Returns 0 for success, errno for failure.
*/
@@ -2367,57 +1980,9 @@ ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
retval = 0;
switch (cmd) {
- case CTL_IO: {
- union ctl_io *io;
- void *pool_tmp;
-
- /*
- * If we haven't been "enabled", don't allow any SCSI I/O
- * to this FETD.
- */
- if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0) {
- retval = EPERM;
- break;
- }
-
- io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref);
-
- /*
- * Need to save the pool reference so it doesn't get
- * spammed by the user's ctl_io.
- */
- pool_tmp = io->io_hdr.pool;
- memcpy(io, (void *)addr, sizeof(*io));
- io->io_hdr.pool = pool_tmp;
-
- /*
- * No status yet, so make sure the status is set properly.
- */
- io->io_hdr.status = CTL_STATUS_NONE;
-
- /*
- * The user sets the initiator ID, target and LUN IDs.
- */
- io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port;
- io->io_hdr.flags |= CTL_FLAG_USER_REQ;
- if ((io->io_hdr.io_type == CTL_IO_SCSI)
- && (io->scsiio.tag_type != CTL_TAG_UNTAGGED))
- io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++;
-
- retval = ctl_ioctl_submit_wait(io);
-
- if (retval != 0) {
- ctl_free_io(io);
- break;
- }
-
- memcpy((void *)addr, io, sizeof(*io));
-
- /* return this to our pool */
- ctl_free_io(io);
-
+ case CTL_IO:
+ retval = ctl_ioctl_io(dev, cmd, addr, flag, td);
break;
- }
case CTL_ENABLE_PORT:
case CTL_DISABLE_PORT:
case CTL_SET_PORT_WWNS: {
@@ -2724,103 +2289,6 @@ ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
break;
}
- case CTL_HARD_START:
- case CTL_HARD_STOP: {
- struct ctl_fe_ioctl_startstop_info ss_info;
- struct cfi_metatask *metatask;
- struct mtx hs_mtx;
-
- mtx_init(&hs_mtx, "HS Mutex", NULL, MTX_DEF);
-
- cv_init(&ss_info.sem, "hard start/stop cv" );
-
- metatask = cfi_alloc_metatask(/*can_wait*/ 1);
- if (metatask == NULL) {
- retval = ENOMEM;
- mtx_destroy(&hs_mtx);
- break;
- }
-
- if (cmd == CTL_HARD_START)
- metatask->tasktype = CFI_TASK_STARTUP;
- else
- metatask->tasktype = CFI_TASK_SHUTDOWN;
-
- metatask->callback = ctl_ioctl_hard_startstop_callback;
- metatask->callback_arg = &ss_info;
-
- cfi_action(metatask);
-
- /* Wait for the callback */
- mtx_lock(&hs_mtx);
- cv_wait_sig(&ss_info.sem, &hs_mtx);
- mtx_unlock(&hs_mtx);
-
- /*
- * All information has been copied from the metatask by the
- * time cv_broadcast() is called, so we free the metatask here.
- */
- cfi_free_metatask(metatask);
-
- memcpy((void *)addr, &ss_info.hs_info, sizeof(ss_info.hs_info));
-
- mtx_destroy(&hs_mtx);
- break;
- }
- case CTL_BBRREAD: {
- struct ctl_bbrread_info *bbr_info;
- struct ctl_fe_ioctl_bbrread_info fe_bbr_info;
- struct mtx bbr_mtx;
- struct cfi_metatask *metatask;
-
- bbr_info = (struct ctl_bbrread_info *)addr;
-
- bzero(&fe_bbr_info, sizeof(fe_bbr_info));
-
- bzero(&bbr_mtx, sizeof(bbr_mtx));
- mtx_init(&bbr_mtx, "BBR Mutex", NULL, MTX_DEF);
-
- fe_bbr_info.bbr_info = bbr_info;
- fe_bbr_info.lock = &bbr_mtx;
-
- cv_init(&fe_bbr_info.sem, "BBR read cv");
- metatask = cfi_alloc_metatask(/*can_wait*/ 1);
-
- if (metatask == NULL) {
- mtx_destroy(&bbr_mtx);
- cv_destroy(&fe_bbr_info.sem);
- retval = ENOMEM;
- break;
- }
- metatask->tasktype = CFI_TASK_BBRREAD;
- metatask->callback = ctl_ioctl_bbrread_callback;
- metatask->callback_arg = &fe_bbr_info;
- metatask->taskinfo.bbrread.lun_num = bbr_info->lun_num;
- metatask->taskinfo.bbrread.lba = bbr_info->lba;
- metatask->taskinfo.bbrread.len = bbr_info->len;
-
- cfi_action(metatask);
-
- mtx_lock(&bbr_mtx);
- while (fe_bbr_info.wakeup_done == 0)
- cv_wait_sig(&fe_bbr_info.sem, &bbr_mtx);
- mtx_unlock(&bbr_mtx);
-
- bbr_info->status = metatask->status;
- bbr_info->bbr_status = metatask->taskinfo.bbrread.status;
- bbr_info->scsi_status = metatask->taskinfo.bbrread.scsi_status;
- memcpy(&bbr_info->sense_data,
- &metatask->taskinfo.bbrread.sense_data,
- MIN(sizeof(bbr_info->sense_data),
- sizeof(metatask->taskinfo.bbrread.sense_data)));
-
- cfi_free_metatask(metatask);
-
- mtx_destroy(&bbr_mtx);
- cv_destroy(&fe_bbr_info.sem);
-
- break;
- }
case CTL_DELAY_IO: {
struct ctl_io_delay_info *delay_info;
#ifdef CTL_IO_DELAY
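The CTL_IO handler deleted from ctl_ioctl() above moves into ctl_ioctl_io() in the new ctl_frontend_ioctl.c. Condensed from the removed lines, its core obligations are: refuse I/O until the FETD is enabled, preserve the io pool pointer across the user's copy of the ctl_io, clear any stale status, and stamp the nexus and tag before submitting. A sketch reconstructed from the removed handler (the helper name and condensed shape are illustrative, not the new file's exact text):

    static int
    ctl_ioctl_io_sketch(struct ctl_softc *softc, caddr_t addr)
    {
    	union ctl_io *io;
    	void *pool_tmp;
    	int retval;

    	/* No SCSI I/O until this FETD has been enabled. */
    	if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0)
    		return (EPERM);

    	io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref);

    	/*
    	 * Keep the pool reference: copying the user's ctl_io over the
    	 * whole union would otherwise clobber it.
    	 */
    	pool_tmp = io->io_hdr.pool;
    	memcpy(io, (void *)addr, sizeof(*io));
    	io->io_hdr.pool = pool_tmp;

    	/* The kernel, not the user, owns status, port, and tag. */
    	io->io_hdr.status = CTL_STATUS_NONE;
    	io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port;
    	io->io_hdr.flags |= CTL_FLAG_USER_REQ;
    	if (io->io_hdr.io_type == CTL_IO_SCSI &&
    	    io->scsiio.tag_type != CTL_TAG_UNTAGGED)
    		io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++;

    	retval = ctl_ioctl_submit_wait(io);
    	if (retval == 0)
    		memcpy((void *)addr, io, sizeof(*io));
    	ctl_free_io(io);
    	return (retval);
    }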
diff --git a/sys/cam/ctl/ctl.h b/sys/cam/ctl/ctl.h
index b1d9118..2826742 100644
--- a/sys/cam/ctl/ctl.h
+++ b/sys/cam/ctl/ctl.h
@@ -194,6 +194,8 @@ void ctl_portDB_changed(int portnum);
#ifdef notyet
void ctl_init_isc_msg(void);
#endif
+int ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
+ struct thread *td);
/*
* KPI to manipulate LUN/port options
diff --git a/sys/cam/ctl/ctl_backend.c b/sys/cam/ctl/ctl_backend.c
index cabecb7..ae5034b 100644
--- a/sys/cam/ctl/ctl_backend.c
+++ b/sys/cam/ctl/ctl_backend.c
@@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_private.h>
diff --git a/sys/cam/ctl/ctl_backend_block.c b/sys/cam/ctl/ctl_backend_block.c
index 5bb3121..65d0491 100644
--- a/sys/cam/ctl/ctl_backend_block.c
+++ b/sys/cam/ctl/ctl_backend_block.c
@@ -84,7 +84,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_error.h>
@@ -170,7 +169,6 @@ struct ctl_be_block_lun {
uint64_t size_blocks;
uint64_t size_bytes;
uint32_t blocksize;
- int blocksize_shift;
uint16_t pblockexp;
uint16_t pblockoff;
uint16_t ublockexp;
@@ -773,7 +771,7 @@ ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
DPRINTF("entered\n");
- off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
+ off = roff = ((off_t)lbalen->lba) * be_lun->blocksize;
vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
0, curthread->td_ucred, curthread);
@@ -791,10 +789,9 @@ ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
}
VOP_UNLOCK(be_lun->vn, 0);
- off >>= be_lun->blocksize_shift;
data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
scsi_u64to8b(lbalen->lba, data->descr[0].addr);
- scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
+ scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->blocksize - lbalen->lba),
data->descr[0].length);
data->descr[0].status = status;
@@ -816,14 +813,14 @@ ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
if (strcmp(attrname, "blocksused") == 0) {
error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
if (error == 0)
- val = vattr.va_bytes >> be_lun->blocksize_shift;
+ val = vattr.va_bytes / be_lun->blocksize;
}
if (strcmp(attrname, "blocksavail") == 0 &&
(be_lun->vn->v_iflag & VI_DOOMED) == 0) {
error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
if (error == 0)
- val = (statfs.f_bavail * statfs.f_bsize) >>
- be_lun->blocksize_shift;
+ val = statfs.f_bavail * statfs.f_bsize /
+ be_lun->blocksize;
}
VOP_UNLOCK(be_lun->vn, 0);
return (val);
@@ -934,7 +931,7 @@ ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
DPRINTF("entered\n");
- off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
+ off = roff = ((off_t)lbalen->lba) * be_lun->blocksize;
error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKHOLE,
(caddr_t)&off, FREAD, curthread);
if (error == 0 && off > roff)
@@ -950,10 +947,9 @@ ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
}
}
- off >>= be_lun->blocksize_shift;
data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
scsi_u64to8b(lbalen->lba, data->descr[0].addr);
- scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
+ scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->blocksize - lbalen->lba),
data->descr[0].length);
data->descr[0].status = status;
@@ -1866,7 +1862,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
struct cdevsw *devsw;
char *value;
int error, atomic, maxio, unmap;
- off_t ps, pss, po, pos, us, uss, uo, uos;
+ off_t ps, pss, po, pos, us, uss, uo, uos, tmp;
params = &be_lun->params;
@@ -1909,8 +1905,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
return (ENODEV);
}
- error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
- (caddr_t)&be_lun->blocksize, FREAD,
+ error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
curthread);
if (error) {
snprintf(req->error_str, sizeof(req->error_str),
@@ -1925,15 +1920,9 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
* the user is asking for is an even multiple of the underlying
* device's blocksize.
*/
- if ((params->blocksize_bytes != 0)
- && (params->blocksize_bytes > be_lun->blocksize)) {
- uint32_t bs_multiple, tmp_blocksize;
-
- bs_multiple = params->blocksize_bytes / be_lun->blocksize;
-
- tmp_blocksize = bs_multiple * be_lun->blocksize;
-
- if (tmp_blocksize == params->blocksize_bytes) {
+ if ((params->blocksize_bytes != 0) &&
+ (params->blocksize_bytes >= tmp)) {
+ if (params->blocksize_bytes % tmp == 0) {
be_lun->blocksize = params->blocksize_bytes;
} else {
snprintf(req->error_str, sizeof(req->error_str),
@@ -1944,17 +1933,16 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
return (EINVAL);
}
- } else if ((params->blocksize_bytes != 0)
- && (params->blocksize_bytes != be_lun->blocksize)) {
+ } else if (params->blocksize_bytes != 0) {
snprintf(req->error_str, sizeof(req->error_str),
"requested blocksize %u < backing device "
"blocksize %u", params->blocksize_bytes,
be_lun->blocksize);
return (EINVAL);
- }
+ } else
+ be_lun->blocksize = tmp;
- error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
- (caddr_t)&be_lun->size_bytes, FREAD,
+ error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&tmp, FREAD,
curthread);
if (error) {
snprintf(req->error_str, sizeof(req->error_str),
@@ -1965,7 +1953,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
}
if (params->lun_size_bytes != 0) {
- if (params->lun_size_bytes > be_lun->size_bytes) {
+ if (params->lun_size_bytes > tmp) {
snprintf(req->error_str, sizeof(req->error_str),
"requested LUN size %ju > backing device "
"size %ju",
@@ -1975,7 +1963,8 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
}
be_lun->size_bytes = params->lun_size_bytes;
- }
+ } else
+ be_lun->size_bytes = tmp;
error = devsw->d_ioctl(dev, DIOCGSTRIPESIZE,
(caddr_t)&ps, FREAD, curthread);
@@ -2160,14 +2149,8 @@ ctl_be_block_open(struct ctl_be_block_softc *softc,
}
VOP_UNLOCK(be_lun->vn, 0);
- if (error != 0) {
+ if (error != 0)
ctl_be_block_close(be_lun);
- return (error);
- }
-
- be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
- be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
-
return (0);
}
@@ -2224,10 +2207,14 @@ ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
goto bailout_error;
}
be_lun->dev_path = strdup(value, M_CTLBLK);
- be_lun->blocksize = 512;
- be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
+ be_lun->size_bytes = params->lun_size_bytes;
+ if (params->blocksize_bytes != 0)
+ be_lun->blocksize = params->blocksize_bytes;
+ else
+ be_lun->blocksize = 512;
retval = ctl_be_block_open(softc, be_lun, req);
+ be_lun->size_blocks = be_lun->size_bytes / be_lun->blocksize;
if (retval != 0) {
retval = 0;
req->status = CTL_LUN_WARNING;
@@ -2652,10 +2639,9 @@ ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
error = ctl_be_block_modify_file(be_lun, req);
else
error = EINVAL;
+ be_lun->size_blocks = be_lun->size_bytes / be_lun->blocksize;
if (error == 0 && be_lun->size_bytes != oldsize) {
- be_lun->size_blocks = be_lun->size_bytes >>
- be_lun->blocksize_shift;
/*
* The maximum LBA is the size - 1.
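The pattern throughout this file: lba << blocksize_shift and bytes >> blocksize_shift become multiplication and division by blocksize. The shift form only works when the blocksize is a power of two (blocksize_shift was derived as fls(blocksize) - 1); plain division gives the same answer for 512- and 4096-byte sectors and also handles sizes such as 520-byte protection-information sectors. A quick userland check of the equivalence, using fls() from FreeBSD's strings.h as in the removed code:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>	/* fls() */

    int
    main(void)
    {
    	uint64_t bytes = 1048576;
    	uint32_t bs = 4096;
    	int shift = fls(bs) - 1;	/* 12 for a 4096-byte sector */

    	/* Power-of-two blocksize: shift and division agree. */
    	assert((bytes >> shift) == (bytes / bs));	/* 256 blocks */

    	/* 520-byte sectors: only the division form is meaningful. */
    	bs = 520;
    	printf("%ju bytes = %ju blocks of %u\n",
    	    (uintmax_t)bytes, (uintmax_t)(bytes / bs), bs);
    	return (0);
    }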
diff --git a/sys/cam/ctl/ctl_backend_ramdisk.c b/sys/cam/ctl/ctl_backend_ramdisk.c
index ad90241..211738b 100644
--- a/sys/cam/ctl/ctl_backend_ramdisk.c
+++ b/sys/cam/ctl/ctl_backend_ramdisk.c
@@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_error.h>
diff --git a/sys/cam/ctl/ctl_cmd_table.c b/sys/cam/ctl/ctl_cmd_table.c
index 08ff88a..9a7d70e 100644
--- a/sys/cam/ctl/ctl_cmd_table.c
+++ b/sys/cam/ctl/ctl_cmd_table.c
@@ -52,7 +52,6 @@
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_private.h>
diff --git a/sys/cam/ctl/ctl_error.c b/sys/cam/ctl/ctl_error.c
index d4d7f79..4b41331 100644
--- a/sys/cam/ctl/ctl_error.c
+++ b/sys/cam/ctl/ctl_error.c
@@ -57,7 +57,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_error.h>
diff --git a/sys/cam/ctl/ctl_frontend.c b/sys/cam/ctl/ctl_frontend.c
index e22b9d4..34baf44 100644
--- a/sys/cam/ctl/ctl_frontend.c
+++ b/sys/cam/ctl/ctl_frontend.c
@@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_backend.h>
/* XXX KDM move defines from ctl_ioctl.h to somewhere else */
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cam/ctl/ctl_frontend_cam_sim.c b/sys/cam/ctl/ctl_frontend_cam_sim.c
index 3abc572..97b361a 100644
--- a/sys/cam/ctl/ctl_frontend_cam_sim.c
+++ b/sys/cam/ctl/ctl_frontend_cam_sim.c
@@ -64,7 +64,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_debug.h>
#define io_ptr spriv_ptr1
diff --git a/sys/cam/ctl/ctl_frontend_internal.c b/sys/cam/ctl/ctl_frontend_internal.c
deleted file mode 100644
index 4768292..0000000
--- a/sys/cam/ctl/ctl_frontend_internal.c
+++ /dev/null
@@ -1,1612 +0,0 @@
-/*-
- * Copyright (c) 2004, 2005 Silicon Graphics International Corp.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * substantially similar to the "NO WARRANTY" disclaimer below
- * ("Disclaimer") and any redistribution must be conditioned upon
- * including a substantially similar Disclaimer requirement for further
- * binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGES.
- *
- * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_frontend_internal.c#5 $
- */
-/*
- * CTL kernel internal frontend target driver. This allows kernel-level
- * clients to send commands into CTL.
- *
- * This has elements of a FETD (e.g. it has to set tag numbers, initiator,
- * port, target, and LUN) and elements of an initiator (LUN discovery and
- * probing, error recovery, command initiation). Even though this has some
- * initiator type elements, this is not intended to be a full fledged
- * initiator layer. It is only intended to send a limited number of
- * commands to a well known target layer.
- *
- * To be able to fulfill the role of a full initiator layer, it would need
- * a whole lot more functionality.
- *
- * Author: Ken Merry <ken@FreeBSD.org>
- *
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/types.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/queue.h>
-#include <sys/sbuf.h>
-#include <sys/sysctl.h>
-#include <vm/uma.h>
-#include <cam/scsi/scsi_all.h>
-#include <cam/scsi/scsi_da.h>
-#include <cam/ctl/ctl_io.h>
-#include <cam/ctl/ctl.h>
-#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
-#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_ioctl.h>
-#include <cam/ctl/ctl_util.h>
-#include <cam/ctl/ctl_ha.h>
-#include <cam/ctl/ctl_private.h>
-#include <cam/ctl/ctl_debug.h>
-#include <cam/ctl/ctl_scsi_all.h>
-#include <cam/ctl/ctl_error.h>
-
-/*
- * Task structure:
- * - overall metatask, different potential metatask types (e.g. forced
- * shutdown, gentle shutdown)
- * - forced shutdown metatask:
- * - states: report luns, pending, done?
- * - list of luns pending, with the relevant I/O for that lun attached.
- * This would allow moving ahead on LUNs with no errors, and going
- * into error recovery on LUNs with problems. Per-LUN states might
- * include inquiry, stop/offline, done.
- *
- * Use LUN enable for LUN list instead of getting it manually? We'd still
- * need inquiry data for each LUN.
- *
- * How to handle processor LUN w.r.t. found/stopped counts?
- */
-#ifdef oldapi
-typedef enum {
- CFI_TASK_NONE,
- CFI_TASK_SHUTDOWN,
- CFI_TASK_STARTUP
-} cfi_tasktype;
-
-struct cfi_task_startstop {
- int total_luns;
- int luns_complete;
- int luns_failed;
- cfi_cb_t callback;
- void *callback_arg;
- /* XXX KDM add more fields here */
-};
-
-union cfi_taskinfo {
- struct cfi_task_startstop startstop;
-};
-
-struct cfi_metatask {
- cfi_tasktype tasktype;
- cfi_mt_status status;
- union cfi_taskinfo taskinfo;
- void *cfi_context;
- STAILQ_ENTRY(cfi_metatask) links;
-};
-#endif
-
-typedef enum {
- CFI_ERR_RETRY = 0x000,
- CFI_ERR_FAIL = 0x001,
- CFI_ERR_LUN_RESET = 0x002,
- CFI_ERR_MASK = 0x0ff,
- CFI_ERR_NO_DECREMENT = 0x100
-} cfi_error_action;
-
-typedef enum {
- CFI_ERR_SOFT,
- CFI_ERR_HARD
-} cfi_error_policy;
-
-typedef enum {
- CFI_LUN_INQUIRY,
- CFI_LUN_READCAPACITY,
- CFI_LUN_READCAPACITY_16,
- CFI_LUN_READY
-} cfi_lun_state;
-
-struct cfi_lun {
- int lun_id;
- struct scsi_inquiry_data inq_data;
- uint64_t num_blocks;
- uint32_t blocksize;
- int blocksize_powerof2;
- uint32_t cur_tag_num;
- cfi_lun_state state;
- struct cfi_softc *softc;
- STAILQ_HEAD(, cfi_lun_io) io_list;
- STAILQ_ENTRY(cfi_lun) links;
-};
-
-struct cfi_lun_io {
- struct cfi_lun *lun;
- struct cfi_metatask *metatask;
- cfi_error_policy policy;
- void (*done_function)(union ctl_io *io);
- union ctl_io *ctl_io;
- struct cfi_lun_io *orig_lun_io;
- STAILQ_ENTRY(cfi_lun_io) links;
-};
-
-typedef enum {
- CFI_NONE = 0x00,
- CFI_ONLINE = 0x01,
-} cfi_flags;
-
-struct cfi_softc {
- struct ctl_port port;
- char fe_name[40];
- struct mtx lock;
- cfi_flags flags;
- STAILQ_HEAD(, cfi_lun) lun_list;
- STAILQ_HEAD(, cfi_metatask) metatask_list;
-};
-
-MALLOC_DEFINE(M_CTL_CFI, "ctlcfi", "CTL CFI");
-
-static uma_zone_t cfi_lun_zone;
-static uma_zone_t cfi_metatask_zone;
-
-static struct cfi_softc fetd_internal_softc;
-
-int cfi_init(void);
-void cfi_shutdown(void) __unused;
-static void cfi_online(void *arg);
-static void cfi_offline(void *arg);
-static int cfi_lun_enable(void *arg, int lun_id);
-static int cfi_lun_disable(void *arg, int lun_id);
-static void cfi_datamove(union ctl_io *io);
-static cfi_error_action cfi_checkcond_parse(union ctl_io *io,
- struct cfi_lun_io *lun_io);
-static cfi_error_action cfi_error_parse(union ctl_io *io,
- struct cfi_lun_io *lun_io);
-static void cfi_init_io(union ctl_io *io, struct cfi_lun *lun,
- struct cfi_metatask *metatask, cfi_error_policy policy,
- int retries, struct cfi_lun_io *orig_lun_io,
- void (*done_function)(union ctl_io *io));
-static void cfi_done(union ctl_io *io);
-static void cfi_lun_probe_done(union ctl_io *io);
-static void cfi_lun_probe(struct cfi_lun *lun, int have_lock);
-static void cfi_metatask_done(struct cfi_softc *softc,
- struct cfi_metatask *metatask);
-static void cfi_metatask_bbr_errorparse(struct cfi_metatask *metatask,
- union ctl_io *io);
-static void cfi_metatask_io_done(union ctl_io *io);
-static void cfi_err_recovery_done(union ctl_io *io);
-static void cfi_lun_io_done(union ctl_io *io);
-
-static struct ctl_frontend cfi_frontend =
-{
- .name = "kernel",
- .init = cfi_init,
- .shutdown = cfi_shutdown,
-};
-CTL_FRONTEND_DECLARE(ctlcfi, cfi_frontend);
-
-int
-cfi_init(void)
-{
- struct cfi_softc *softc;
- struct ctl_port *port;
- int retval;
-
- softc = &fetd_internal_softc;
-
- port = &softc->port;
-
- retval = 0;
-
- if (sizeof(struct cfi_lun_io) > CTL_PORT_PRIV_SIZE) {
- printf("%s: size of struct cfi_lun_io %zd > "
- "CTL_PORT_PRIV_SIZE %d\n", __func__,
- sizeof(struct cfi_lun_io),
- CTL_PORT_PRIV_SIZE);
- }
- memset(softc, 0, sizeof(*softc));
-
- mtx_init(&softc->lock, "CTL frontend mutex", NULL, MTX_DEF);
- STAILQ_INIT(&softc->lun_list);
- STAILQ_INIT(&softc->metatask_list);
- sprintf(softc->fe_name, "kernel");
- port->frontend = &cfi_frontend;
- port->port_type = CTL_PORT_INTERNAL;
- port->num_requested_ctl_io = 100;
- port->port_name = softc->fe_name;
- port->port_online = cfi_online;
- port->port_offline = cfi_offline;
- port->onoff_arg = softc;
- port->lun_enable = cfi_lun_enable;
- port->lun_disable = cfi_lun_disable;
- port->targ_lun_arg = softc;
- port->fe_datamove = cfi_datamove;
- port->fe_done = cfi_done;
- port->max_targets = 15;
- port->max_target_id = 15;
-
- if (ctl_port_register(port) != 0)
- {
- printf("%s: internal frontend registration failed\n", __func__);
- return (0);
- }
-
- cfi_lun_zone = uma_zcreate("cfi_lun", sizeof(struct cfi_lun),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- cfi_metatask_zone = uma_zcreate("cfi_metatask", sizeof(struct cfi_metatask),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-
- return (0);
-}
-
-void
-cfi_shutdown(void)
-{
- struct cfi_softc *softc;
-
- softc = &fetd_internal_softc;
-
- /*
- * XXX KDM need to clear out any I/O pending on each LUN.
- */
- if (ctl_port_deregister(&softc->port) != 0)
- printf("%s: ctl_frontend_deregister() failed\n", __func__);
-
- uma_zdestroy(cfi_lun_zone);
- uma_zdestroy(cfi_metatask_zone);
-}
-
-static void
-cfi_online(void *arg)
-{
- struct cfi_softc *softc;
- struct cfi_lun *lun;
-
- softc = (struct cfi_softc *)arg;
-
- softc->flags |= CFI_ONLINE;
-
- /*
- * Go through and kick off the probe for each lun. Should we check
- * the LUN flags here to determine whether or not to probe it?
- */
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links)
- cfi_lun_probe(lun, /*have_lock*/ 1);
- mtx_unlock(&softc->lock);
-}
-
-static void
-cfi_offline(void *arg)
-{
- struct cfi_softc *softc;
-
- softc = (struct cfi_softc *)arg;
-
- softc->flags &= ~CFI_ONLINE;
-}
-
-static int
-cfi_lun_enable(void *arg, int lun_id)
-{
- struct cfi_softc *softc;
- struct cfi_lun *lun;
- int found;
-
- softc = (struct cfi_softc *)arg;
-
- found = 0;
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->lun_id == lun_id) {
- found = 1;
- break;
- }
- }
- mtx_unlock(&softc->lock);
-
- /*
- * If we already have this target/LUN, there is no reason to add
- * it to our lists again.
- */
- if (found != 0)
- return (0);
-
- lun = uma_zalloc(cfi_lun_zone, M_NOWAIT | M_ZERO);
- if (lun == NULL) {
- printf("%s: unable to allocate LUN structure\n", __func__);
- return (1);
- }
-
- lun->lun_id = lun_id;
- lun->cur_tag_num = 0;
- lun->state = CFI_LUN_INQUIRY;
- lun->softc = softc;
- STAILQ_INIT(&lun->io_list);
-
- mtx_lock(&softc->lock);
- STAILQ_INSERT_TAIL(&softc->lun_list, lun, links);
- mtx_unlock(&softc->lock);
-
- cfi_lun_probe(lun, /*have_lock*/ 0);
-
- return (0);
-}
-
-static int
-cfi_lun_disable(void *arg, int lun_id)
-{
- struct cfi_softc *softc;
- struct cfi_lun *lun;
- int found;
-
- softc = (struct cfi_softc *)arg;
-
- found = 0;
-
- /*
- * XXX KDM need to do an invalidate and then a free when any
- * pending I/O has completed. Or do we? CTL won't free a LUN
- * while any I/O is pending. So we won't get this notification
- * unless any I/O we have pending on a LUN has completed.
- */
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->lun_id == lun_id) {
- found = 1;
- break;
- }
- }
- if (found != 0)
- STAILQ_REMOVE(&softc->lun_list, lun, cfi_lun, links);
-
- mtx_unlock(&softc->lock);
-
- if (found == 0) {
- printf("%s: can't find lun %d\n", __func__, lun_id);
- return (1);
- }
-
- uma_zfree(cfi_lun_zone, lun);
-
- return (0);
-}
-
-static void
-cfi_datamove(union ctl_io *io)
-{
- struct ctl_sg_entry *ext_sglist, *kern_sglist;
- struct ctl_sg_entry ext_entry, kern_entry;
- int ext_sglen, ext_sg_entries, kern_sg_entries;
- int ext_sg_start, ext_offset;
- int len_to_copy, len_copied;
- int kern_watermark, ext_watermark;
- int ext_sglist_malloced;
- struct ctl_scsiio *ctsio;
- int i, j;
-
- ext_sglist_malloced = 0;
- ext_sg_start = 0;
- ext_offset = 0;
- ext_sglist = NULL;
-
- CTL_DEBUG_PRINT(("%s\n", __func__));
-
- ctsio = &io->scsiio;
-
- /*
- * If this is the case, we're probably doing a BBR read and don't
- * actually need to transfer the data. This will effectively
- * bit-bucket the data.
- */
- if (ctsio->ext_data_ptr == NULL)
- goto bailout;
-
- /*
- * To simplify things here, if we have a single buffer, stick it in
- * a S/G entry and just make it a single entry S/G list.
- */
- if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
- int len_seen;
-
- ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
-
- ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL_CFI,
- M_WAITOK);
- ext_sglist_malloced = 1;
- if (memcpy(ext_sglist, ctsio->ext_data_ptr, ext_sglen) != 0) {
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/ 0);
- goto bailout;
- }
- ext_sg_entries = ctsio->ext_sg_entries;
- len_seen = 0;
- for (i = 0; i < ext_sg_entries; i++) {
- if ((len_seen + ext_sglist[i].len) >=
- ctsio->ext_data_filled) {
- ext_sg_start = i;
- ext_offset = ctsio->ext_data_filled - len_seen;
- break;
- }
- len_seen += ext_sglist[i].len;
- }
- } else {
- ext_sglist = &ext_entry;
- ext_sglist->addr = ctsio->ext_data_ptr;
- ext_sglist->len = ctsio->ext_data_len;
- ext_sg_entries = 1;
- ext_sg_start = 0;
- ext_offset = ctsio->ext_data_filled;
- }
-
- if (ctsio->kern_sg_entries > 0) {
- kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
- kern_sg_entries = ctsio->kern_sg_entries;
- } else {
- kern_sglist = &kern_entry;
- kern_sglist->addr = ctsio->kern_data_ptr;
- kern_sglist->len = ctsio->kern_data_len;
- kern_sg_entries = 1;
- }
-
-
- kern_watermark = 0;
- ext_watermark = ext_offset;
- len_copied = 0;
- for (i = ext_sg_start, j = 0;
- i < ext_sg_entries && j < kern_sg_entries;) {
- uint8_t *ext_ptr, *kern_ptr;
-
- len_to_copy = MIN(ext_sglist[i].len - ext_watermark,
- kern_sglist[j].len - kern_watermark);
-
- ext_ptr = (uint8_t *)ext_sglist[i].addr;
- ext_ptr = ext_ptr + ext_watermark;
- if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
- /*
- * XXX KDM fix this!
- */
- panic("need to implement bus address support");
-#if 0
- kern_ptr = bus_to_virt(kern_sglist[j].addr);
-#endif
- } else
- kern_ptr = (uint8_t *)kern_sglist[j].addr;
- kern_ptr = kern_ptr + kern_watermark;
-
- kern_watermark += len_to_copy;
- ext_watermark += len_to_copy;
-
- if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
- CTL_FLAG_DATA_IN) {
- CTL_DEBUG_PRINT(("%s: copying %d bytes to user\n",
- __func__, len_to_copy));
- CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__,
- kern_ptr, ext_ptr));
- memcpy(ext_ptr, kern_ptr, len_to_copy);
- } else {
- CTL_DEBUG_PRINT(("%s: copying %d bytes from user\n",
- __func__, len_to_copy));
- CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__,
- ext_ptr, kern_ptr));
- memcpy(kern_ptr, ext_ptr, len_to_copy);
- }
-
- len_copied += len_to_copy;
-
- if (ext_sglist[i].len == ext_watermark) {
- i++;
- ext_watermark = 0;
- }
-
- if (kern_sglist[j].len == kern_watermark) {
- j++;
- kern_watermark = 0;
- }
- }
-
- ctsio->ext_data_filled += len_copied;
-
- CTL_DEBUG_PRINT(("%s: ext_sg_entries: %d, kern_sg_entries: %d\n",
- __func__, ext_sg_entries, kern_sg_entries));
- CTL_DEBUG_PRINT(("%s: ext_data_len = %d, kern_data_len = %d\n",
- __func__, ctsio->ext_data_len, ctsio->kern_data_len));
-
-
- /* XXX KDM set residual?? */
-bailout:
-
- if (ext_sglist_malloced != 0)
- free(ext_sglist, M_CTL_CFI);
-
- io->scsiio.be_move_done(io);
-
- return;
-}
-
-/*
- * For any sort of check condition, busy, etc., we just retry. We do not
- * decrement the retry count for unit attention type errors. These are
- * normal, and we want to save the retry count for "real" errors. Otherwise,
- * we could end up with situations where a command will succeed in some
- * situations and fail in others, depending on whether a unit attention is
- * pending. Also, some of our error recovery actions, most notably the
- * LUN reset action, will cause a unit attention.
- *
- * We can add more detail here later if necessary.
- */
-static cfi_error_action
-cfi_checkcond_parse(union ctl_io *io, struct cfi_lun_io *lun_io)
-{
- cfi_error_action error_action;
- int error_code, sense_key, asc, ascq;
-
- /*
- * Default to retrying the command.
- */
- error_action = CFI_ERR_RETRY;
-
- scsi_extract_sense_len(&io->scsiio.sense_data,
- io->scsiio.sense_len,
- &error_code,
- &sense_key,
- &asc,
- &ascq,
- /*show_errors*/ 1);
-
- switch (error_code) {
- case SSD_DEFERRED_ERROR:
- case SSD_DESC_DEFERRED_ERROR:
- error_action |= CFI_ERR_NO_DECREMENT;
- break;
- case SSD_CURRENT_ERROR:
- case SSD_DESC_CURRENT_ERROR:
- default: {
- switch (sense_key) {
- case SSD_KEY_UNIT_ATTENTION:
- error_action |= CFI_ERR_NO_DECREMENT;
- break;
- case SSD_KEY_HARDWARE_ERROR:
- /*
- * This is our generic "something bad happened"
- * error code. It often isn't recoverable.
- */
- if ((asc == 0x44) && (ascq == 0x00))
- error_action = CFI_ERR_FAIL;
- break;
- case SSD_KEY_NOT_READY:
- /*
- * If the LUN is powered down, there likely isn't
- * much point in retrying right now.
- */
- if ((asc == 0x04) && (ascq == 0x02))
- error_action = CFI_ERR_FAIL;
- /*
- * If the LUN is offline, there probably isn't much
- * point in retrying, either.
- */
- if ((asc == 0x04) && (ascq == 0x03))
- error_action = CFI_ERR_FAIL;
- break;
- }
- }
- }
-
- return (error_action);
-}
-
-static cfi_error_action
-cfi_error_parse(union ctl_io *io, struct cfi_lun_io *lun_io)
-{
- cfi_error_action error_action;
-
- error_action = CFI_ERR_RETRY;
-
- switch (io->io_hdr.io_type) {
- case CTL_IO_SCSI:
- switch (io->io_hdr.status & CTL_STATUS_MASK) {
- case CTL_SCSI_ERROR:
- switch (io->scsiio.scsi_status) {
- case SCSI_STATUS_RESERV_CONFLICT:
- /*
- * For a reservation conflict, we'll usually
- * want the hard error recovery policy, so
- * we'll reset the LUN.
- */
- if (lun_io->policy == CFI_ERR_HARD)
- error_action =
- CFI_ERR_LUN_RESET;
- else
- error_action =
- CFI_ERR_RETRY;
- break;
- case SCSI_STATUS_CHECK_COND:
- default:
- error_action = cfi_checkcond_parse(io, lun_io);
- break;
- }
- break;
- default:
- error_action = CFI_ERR_RETRY;
- break;
- }
- break;
- case CTL_IO_TASK:
- /*
- * In theory task management commands shouldn't fail...
- */
- error_action = CFI_ERR_RETRY;
- break;
- default:
- printf("%s: invalid ctl_io type %d\n", __func__,
- io->io_hdr.io_type);
- panic("%s: invalid ctl_io type %d\n", __func__,
- io->io_hdr.io_type);
- break;
- }
-
- return (error_action);
-}
-
-static void
-cfi_init_io(union ctl_io *io, struct cfi_lun *lun,
- struct cfi_metatask *metatask, cfi_error_policy policy, int retries,
- struct cfi_lun_io *orig_lun_io,
- void (*done_function)(union ctl_io *io))
-{
- struct cfi_lun_io *lun_io;
-
- io->io_hdr.nexus.initid.id = 7;
- io->io_hdr.nexus.targ_port = lun->softc->port.targ_port;
- io->io_hdr.nexus.targ_target.id = 0;
- io->io_hdr.nexus.targ_lun = lun->lun_id;
- io->io_hdr.retries = retries;
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = lun_io;
- lun_io->lun = lun;
- lun_io->metatask = metatask;
- lun_io->ctl_io = io;
- lun_io->policy = policy;
- lun_io->orig_lun_io = orig_lun_io;
- lun_io->done_function = done_function;
- /*
- * We only set the tag number for SCSI I/Os. For task management
- * commands, the tag number is only really needed for aborts, so
- * the caller can set it if necessary.
- */
- switch (io->io_hdr.io_type) {
- case CTL_IO_SCSI:
- io->scsiio.tag_num = lun->cur_tag_num++;
- break;
- case CTL_IO_TASK:
- default:
- break;
- }
-}
-
-static void
-cfi_done(union ctl_io *io)
-{
- struct cfi_lun_io *lun_io;
- struct cfi_softc *softc;
- struct cfi_lun *lun;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- lun = lun_io->lun;
- softc = lun->softc;
-
- /*
- * Very minimal retry logic. We basically retry if we got an error
- * back, and the retry count is greater than 0. If we ever want
- * more sophisticated initiator type behavior, the CAM error
- * recovery code in ../common might be helpful.
- */
- if (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)
- && (io->io_hdr.retries > 0)) {
- ctl_io_status old_status;
- cfi_error_action error_action;
-
- error_action = cfi_error_parse(io, lun_io);
-
- switch (error_action & CFI_ERR_MASK) {
- case CFI_ERR_FAIL:
- goto done;
- break; /* NOTREACHED */
- case CFI_ERR_LUN_RESET: {
- union ctl_io *new_io;
- struct cfi_lun_io *new_lun_io;
-
- new_io = ctl_alloc_io(softc->port.ctl_pool_ref);
- ctl_zero_io(new_io);
-
- new_io->io_hdr.io_type = CTL_IO_TASK;
- new_io->taskio.task_action = CTL_TASK_LUN_RESET;
-
- cfi_init_io(new_io,
- /*lun*/ lun_io->lun,
- /*metatask*/ NULL,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 0,
- /*orig_lun_io*/lun_io,
- /*done_function*/ cfi_err_recovery_done);
-
-
- new_lun_io = (struct cfi_lun_io *)
- new_io->io_hdr.port_priv;
-
- mtx_lock(&lun->softc->lock);
- STAILQ_INSERT_TAIL(&lun->io_list, new_lun_io, links);
- mtx_unlock(&lun->softc->lock);
-
- io = new_io;
- break;
- }
- case CFI_ERR_RETRY:
- default:
- if ((error_action & CFI_ERR_NO_DECREMENT) == 0)
- io->io_hdr.retries--;
- break;
- }
-
- old_status = io->io_hdr.status;
- io->io_hdr.status = CTL_STATUS_NONE;
-#if 0
- io->io_hdr.flags &= ~CTL_FLAG_ALREADY_DONE;
-#endif
- io->io_hdr.flags &= ~CTL_FLAG_ABORT;
- io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC;
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- io->io_hdr.status = old_status;
- } else
- return;
- }
-done:
- lun_io->done_function(io);
-}
-
-static void
-cfi_lun_probe_done(union ctl_io *io)
-{
- struct cfi_lun *lun;
- struct cfi_lun_io *lun_io;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
- lun = lun_io->lun;
-
- switch (lun->state) {
- case CFI_LUN_INQUIRY: {
- if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) {
- /* print out something here?? */
- printf("%s: LUN %d probe failed because inquiry "
- "failed\n", __func__, lun->lun_id);
- ctl_io_error_print(io, NULL);
- } else {
-
- if (SID_TYPE(&lun->inq_data) != T_DIRECT) {
- char path_str[40];
-
- lun->state = CFI_LUN_READY;
- ctl_scsi_path_string(io, path_str,
- sizeof(path_str));
- printf("%s", path_str);
- scsi_print_inquiry(&lun->inq_data);
- } else {
- lun->state = CFI_LUN_READCAPACITY;
- cfi_lun_probe(lun, /*have_lock*/ 0);
- }
- }
- mtx_lock(&lun->softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- ctl_free_io(io);
- break;
- }
- case CFI_LUN_READCAPACITY:
- case CFI_LUN_READCAPACITY_16: {
- uint64_t maxlba;
- uint32_t blocksize;
-
- maxlba = 0;
- blocksize = 0;
-
- if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) {
- printf("%s: LUN %d probe failed because READ CAPACITY "
- "failed\n", __func__, lun->lun_id);
- ctl_io_error_print(io, NULL);
- } else {
-
- if (lun->state == CFI_LUN_READCAPACITY) {
- struct scsi_read_capacity_data *rdcap;
-
- rdcap = (struct scsi_read_capacity_data *)
- io->scsiio.ext_data_ptr;
-
- maxlba = scsi_4btoul(rdcap->addr);
- blocksize = scsi_4btoul(rdcap->length);
- if (blocksize == 0) {
- printf("%s: LUN %d has invalid "
- "blocksize 0, probe aborted\n",
- __func__, lun->lun_id);
- } else if (maxlba == 0xffffffff) {
- lun->state = CFI_LUN_READCAPACITY_16;
- cfi_lun_probe(lun, /*have_lock*/ 0);
- } else
- lun->state = CFI_LUN_READY;
- } else {
- struct scsi_read_capacity_data_long *rdcap_long;
-
- rdcap_long = (struct
- scsi_read_capacity_data_long *)
- io->scsiio.ext_data_ptr;
- maxlba = scsi_8btou64(rdcap_long->addr);
- blocksize = scsi_4btoul(rdcap_long->length);
-
- if (blocksize == 0) {
- printf("%s: LUN %d has invalid "
- "blocksize 0, probe aborted\n",
- __func__, lun->lun_id);
- } else
- lun->state = CFI_LUN_READY;
- }
- }
-
- if (lun->state == CFI_LUN_READY) {
- char path_str[40];
-
- lun->num_blocks = maxlba + 1;
- lun->blocksize = blocksize;
-
- /*
- * If this is true, the blocksize is a power of 2.
- * We already checked for 0 above.
- */
- if (((blocksize - 1) & blocksize) == 0) {
- int i;
-
- for (i = 0; i < 32; i++) {
- if ((blocksize & (1 << i)) != 0) {
- lun->blocksize_powerof2 = i;
- break;
- }
- }
- }
- ctl_scsi_path_string(io, path_str,sizeof(path_str));
- printf("%s", path_str);
- scsi_print_inquiry(&lun->inq_data);
- printf("%s %ju blocks, blocksize %d\n", path_str,
- (uintmax_t)maxlba + 1, blocksize);
- }
- mtx_lock(&lun->softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- free(io->scsiio.ext_data_ptr, M_CTL_CFI);
- ctl_free_io(io);
- break;
- }
- case CFI_LUN_READY:
- default:
- mtx_lock(&lun->softc->lock);
- /* How did we get here?? */
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- ctl_free_io(io);
- break;
- }
-}
-
-static void
-cfi_lun_probe(struct cfi_lun *lun, int have_lock)
-{
-
- if (have_lock == 0)
- mtx_lock(&lun->softc->lock);
- if ((lun->softc->flags & CFI_ONLINE) == 0) {
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
- return;
- }
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
-
- switch (lun->state) {
- case CFI_LUN_INQUIRY: {
- struct cfi_lun_io *lun_io;
- union ctl_io *io;
-
- io = ctl_alloc_io(lun->softc->port.ctl_pool_ref);
- ctl_scsi_inquiry(io,
- /*data_ptr*/(uint8_t *)&lun->inq_data,
- /*data_len*/ sizeof(lun->inq_data),
- /*byte2*/ 0,
- /*page_code*/ 0,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
-
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ NULL,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 5,
- /*orig_lun_io*/ NULL,
- /*done_function*/
- cfi_lun_probe_done);
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
-
- if (have_lock == 0)
- mtx_lock(&lun->softc->lock);
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io,
- cfi_lun_io, links);
- ctl_free_io(io);
- }
- break;
- }
- case CFI_LUN_READCAPACITY:
- case CFI_LUN_READCAPACITY_16: {
- struct cfi_lun_io *lun_io;
- uint8_t *dataptr;
- union ctl_io *io;
-
- io = ctl_alloc_io(lun->softc->port.ctl_pool_ref);
-
- dataptr = malloc(sizeof(struct scsi_read_capacity_data_long),
- M_CTL_CFI, M_NOWAIT);
- if (dataptr == NULL) {
- printf("%s: unable to allocate SCSI read capacity "
- "buffer for lun %d\n", __func__, lun->lun_id);
- return;
- }
- if (lun->state == CFI_LUN_READCAPACITY) {
- ctl_scsi_read_capacity(io,
- /*data_ptr*/ dataptr,
- /*data_len*/
- sizeof(struct scsi_read_capacity_data_long),
- /*addr*/ 0,
- /*reladr*/ 0,
- /*pmi*/ 0,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
- } else {
- ctl_scsi_read_capacity_16(io,
- /*data_ptr*/ dataptr,
- /*data_len*/
- sizeof(struct scsi_read_capacity_data_long),
- /*addr*/ 0,
- /*reladr*/ 0,
- /*pmi*/ 0,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
- }
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ NULL,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 7,
- /*orig_lun_io*/ NULL,
- /*done_function*/ cfi_lun_probe_done);
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
-
- if (have_lock == 0)
- mtx_lock(&lun->softc->lock);
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io,
- cfi_lun_io, links);
- free(dataptr, M_CTL_CFI);
- ctl_free_io(io);
- }
- break;
- }
- case CFI_LUN_READY:
- default:
- /* Why were we called? */
- break;
- }
-}
-
-static void
-cfi_metatask_done(struct cfi_softc *softc, struct cfi_metatask *metatask)
-{
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&softc->metatask_list, metatask, cfi_metatask, links);
- mtx_unlock(&softc->lock);
-
- /*
- * Return status to the caller. Caller allocated storage, and is
- * responsible for calling cfi_free_metatask to release it once
- * they've seen the status.
- */
- metatask->callback(metatask->callback_arg, metatask);
-}
-
-static void
-cfi_metatask_bbr_errorparse(struct cfi_metatask *metatask, union ctl_io *io)
-{
- int error_code, sense_key, asc, ascq;
-
- if (metatask->tasktype != CFI_TASK_BBRREAD)
- return;
-
- if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) {
- metatask->status = CFI_MT_SUCCESS;
- metatask->taskinfo.bbrread.status = CFI_BBR_SUCCESS;
- return;
- }
-
- if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SCSI_ERROR) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_ERROR;
- return;
- }
-
- metatask->taskinfo.bbrread.scsi_status = io->scsiio.scsi_status;
- memcpy(&metatask->taskinfo.bbrread.sense_data, &io->scsiio.sense_data,
- MIN(sizeof(metatask->taskinfo.bbrread.sense_data),
- sizeof(io->scsiio.sense_data)));
-
- if (io->scsiio.scsi_status == SCSI_STATUS_RESERV_CONFLICT) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_RESERV_CONFLICT;
- return;
- }
-
- if (io->scsiio.scsi_status != SCSI_STATUS_CHECK_COND) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR;
- return;
- }
-
- scsi_extract_sense_len(&io->scsiio.sense_data,
- io->scsiio.sense_len,
- &error_code,
- &sense_key,
- &asc,
- &ascq,
- /*show_errors*/ 1);
-
- switch (error_code) {
- case SSD_DEFERRED_ERROR:
- case SSD_DESC_DEFERRED_ERROR:
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR;
- break;
- case SSD_CURRENT_ERROR:
- case SSD_DESC_CURRENT_ERROR:
- default: {
- struct scsi_sense_data *sense;
-
- sense = &io->scsiio.sense_data;
-
- if ((asc == 0x04) && (ascq == 0x02)) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_LUN_STOPPED;
- } else if ((asc == 0x04) && (ascq == 0x03)) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_LUN_OFFLINE_CTL;
- } else if ((asc == 0x44) && (ascq == 0x00)) {
-#ifdef NEEDTOPORT
- if (sense->sense_key_spec[0] & SSD_SCS_VALID) {
- uint16_t retry_count;
-
- retry_count = sense->sense_key_spec[1] << 8 |
- sense->sense_key_spec[2];
- if (((retry_count & 0xf000) == CSC_RAIDCORE)
- && ((retry_count & 0x0f00) == CSC_SHELF_SW)
- && ((retry_count & 0xff) ==
- RC_STS_DEVICE_OFFLINE)) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_LUN_OFFLINE_RC;
- } else {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_SCSI_ERROR;
- }
- } else {
-#endif /* NEEDTOPORT */
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_SCSI_ERROR;
-#ifdef NEEDTOPORT
- }
-#endif
- } else {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR;
- }
- break;
- }
- }
-}
-
-static void
-cfi_metatask_io_done(union ctl_io *io)
-{
- struct cfi_lun_io *lun_io;
- struct cfi_metatask *metatask;
- struct cfi_softc *softc;
- struct cfi_lun *lun;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- lun = lun_io->lun;
- softc = lun->softc;
-
- metatask = lun_io->metatask;
-
- switch (metatask->tasktype) {
- case CFI_TASK_STARTUP:
- case CFI_TASK_SHUTDOWN: {
- int failed, done, is_start;
-
- failed = 0;
- done = 0;
- if (metatask->tasktype == CFI_TASK_STARTUP)
- is_start = 1;
- else
- is_start = 0;
-
- mtx_lock(&softc->lock);
- if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)
- metatask->taskinfo.startstop.luns_complete++;
- else {
- metatask->taskinfo.startstop.luns_failed++;
- failed = 1;
- }
- if ((metatask->taskinfo.startstop.luns_complete +
- metatask->taskinfo.startstop.luns_failed) >=
- metatask->taskinfo.startstop.total_luns)
- done = 1;
-
- mtx_unlock(&softc->lock);
-
- if (failed != 0) {
- printf("%s: LUN %d %s request failed\n", __func__,
- lun_io->lun->lun_id, (is_start == 1) ? "start" :
- "stop");
- ctl_io_error_print(io, &lun_io->lun->inq_data);
- }
- if (done != 0) {
- if (metatask->taskinfo.startstop.luns_failed > 0)
- metatask->status = CFI_MT_ERROR;
- else
- metatask->status = CFI_MT_SUCCESS;
- cfi_metatask_done(softc, metatask);
- }
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&softc->lock);
-
- ctl_free_io(io);
- break;
- }
- case CFI_TASK_BBRREAD: {
- /*
- * Translate the SCSI error into an enumeration.
- */
- cfi_metatask_bbr_errorparse(metatask, io);
-
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&softc->lock);
-
- ctl_free_io(io);
-
- cfi_metatask_done(softc, metatask);
- break;
- }
- default:
- /*
- * This shouldn't happen.
- */
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&softc->lock);
-
- ctl_free_io(io);
- break;
- }
-}
-
-static void
-cfi_err_recovery_done(union ctl_io *io)
-{
- struct cfi_lun_io *lun_io, *orig_lun_io;
- struct cfi_lun *lun;
- union ctl_io *orig_io;
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
- orig_lun_io = lun_io->orig_lun_io;
- orig_io = orig_lun_io->ctl_io;
- lun = lun_io->lun;
-
- if (io->io_hdr.status != CTL_SUCCESS) {
- printf("%s: error recovery action failed. Original "
- "error:\n", __func__);
-
- ctl_io_error_print(orig_lun_io->ctl_io, &lun->inq_data);
-
- printf("%s: error from error recovery action:\n", __func__);
-
- ctl_io_error_print(io, &lun->inq_data);
-
- printf("%s: trying original command again...\n", __func__);
- }
-
- mtx_lock(&lun->softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- ctl_free_io(io);
-
- orig_io->io_hdr.retries--;
- orig_io->io_hdr.status = CTL_STATUS_NONE;
-
- if (ctl_queue(orig_io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n", __func__);
- STAILQ_REMOVE(&lun->io_list, orig_lun_io,
- cfi_lun_io, links);
- ctl_free_io(orig_io);
- }
-}
-
-static void
-cfi_lun_io_done(union ctl_io *io)
-{
- struct cfi_lun *lun;
- struct cfi_lun_io *lun_io;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
- lun = lun_io->lun;
-
- if (lun_io->metatask == NULL) {
- printf("%s: I/O has no metatask pointer, discarding\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- ctl_free_io(io);
- return;
- }
- cfi_metatask_io_done(io);
-}
-
-void
-cfi_action(struct cfi_metatask *metatask)
-{
- struct cfi_softc *softc;
-
- softc = &fetd_internal_softc;
-
- mtx_lock(&softc->lock);
-
- STAILQ_INSERT_TAIL(&softc->metatask_list, metatask, links);
-
- if ((softc->flags & CFI_ONLINE) == 0) {
- mtx_unlock(&softc->lock);
- metatask->status = CFI_MT_PORT_OFFLINE;
- cfi_metatask_done(softc, metatask);
- return;
- } else
- mtx_unlock(&softc->lock);
-
- switch (metatask->tasktype) {
- case CFI_TASK_STARTUP:
- case CFI_TASK_SHUTDOWN: {
- union ctl_io *io;
- int da_luns, ios_allocated, do_start;
- struct cfi_lun *lun;
- STAILQ_HEAD(, ctl_io_hdr) tmp_io_list;
-
- da_luns = 0;
- ios_allocated = 0;
- STAILQ_INIT(&tmp_io_list);
-
- if (metatask->tasktype == CFI_TASK_STARTUP)
- do_start = 1;
- else
- do_start = 0;
-
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->state != CFI_LUN_READY)
- continue;
-
- if (SID_TYPE(&lun->inq_data) != T_DIRECT)
- continue;
- da_luns++;
- io = ctl_alloc_io_nowait(softc->port.ctl_pool_ref);
- if (io != NULL) {
- ios_allocated++;
- STAILQ_INSERT_TAIL(&tmp_io_list, &io->io_hdr,
- links);
- }
- }
-
- if (ios_allocated < da_luns) {
- printf("%s: error allocating ctl_io for %s\n",
- __func__, (do_start == 1) ? "startup" :
- "shutdown");
- da_luns = ios_allocated;
- }
-
- metatask->taskinfo.startstop.total_luns = da_luns;
-
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- struct cfi_lun_io *lun_io;
-
- if (lun->state != CFI_LUN_READY)
- continue;
-
- if (SID_TYPE(&lun->inq_data) != T_DIRECT)
- continue;
-
- io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list);
- if (io == NULL)
- break;
-
- STAILQ_REMOVE(&tmp_io_list, &io->io_hdr, ctl_io_hdr,
- links);
-
- ctl_scsi_start_stop(io,
- /*start*/ do_start,
- /*load_eject*/ 0,
- /*immediate*/ 0,
- /*power_conditions*/
- SSS_PC_START_VALID,
- /*onoffline*/ 1,
- /*ctl_tag_type*/ CTL_TAG_ORDERED,
- /*control*/ 0);
-
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ metatask,
- /*policy*/ CFI_ERR_HARD,
- /*retries*/ 3,
- /*orig_lun_io*/ NULL,
- /*done_function*/ cfi_lun_io_done);
-
- lun_io = (struct cfi_lun_io *) io->io_hdr.port_priv;
-
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io,
- cfi_lun_io, links);
- ctl_free_io(io);
- metatask->taskinfo.startstop.total_luns--;
- }
- }
-
- if (STAILQ_FIRST(&tmp_io_list) != NULL) {
- printf("%s: error: tmp_io_list != NULL\n", __func__);
- for (io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list);
- io != NULL;
- io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list)) {
- STAILQ_REMOVE(&tmp_io_list, &io->io_hdr,
- ctl_io_hdr, links);
- ctl_free_io(io);
- }
- }
- mtx_unlock(&softc->lock);
-
- break;
- }
- case CFI_TASK_BBRREAD: {
- union ctl_io *io;
- struct cfi_lun *lun;
- struct cfi_lun_io *lun_io;
- cfi_bbrread_status status;
- int req_lun_num;
- uint32_t num_blocks;
-
- status = CFI_BBR_SUCCESS;
-
- req_lun_num = metatask->taskinfo.bbrread.lun_num;
-
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->lun_id != req_lun_num)
- continue;
- if (lun->state != CFI_LUN_READY) {
- status = CFI_BBR_LUN_UNCONFIG;
- break;
- } else
- break;
- }
-
- if (lun == NULL)
- status = CFI_BBR_NO_LUN;
-
- if (status != CFI_BBR_SUCCESS) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = status;
- mtx_unlock(&softc->lock);
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- /*
- * Convert the number of bytes given into blocks and check
- * that the number of bytes is a multiple of the blocksize.
- * CTL will verify that the LBA is okay.
- */
- if (lun->blocksize_powerof2 != 0) {
- if ((metatask->taskinfo.bbrread.len &
- (lun->blocksize - 1)) != 0) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_BAD_LEN;
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- num_blocks = metatask->taskinfo.bbrread.len >>
- lun->blocksize_powerof2;
- } else {
- /*
- * XXX KDM this could result in floating point
- * division, which isn't supported in the kernel on
- * x86 at least.
- */
- if ((metatask->taskinfo.bbrread.len %
- lun->blocksize) != 0) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_BAD_LEN;
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- /*
- * XXX KDM this could result in floating point
- * division in some cases.
- */
- num_blocks = metatask->taskinfo.bbrread.len /
- lun->blocksize;
-
- }
-
- io = ctl_alloc_io_nowait(softc->port.ctl_pool_ref);
- if (io == NULL) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_NO_MEM;
- mtx_unlock(&softc->lock);
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- /*
- * XXX KDM need to do a read capacity to get the blocksize
- * for this device.
- */
- ctl_scsi_read_write(io,
- /*data_ptr*/ NULL,
- /*data_len*/ metatask->taskinfo.bbrread.len,
- /*read_op*/ 1,
- /*byte2*/ 0,
- /*minimum_cdb_size*/ 0,
- /*lba*/ metatask->taskinfo.bbrread.lba,
- /*num_blocks*/ num_blocks,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
-
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ metatask,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 3,
- /*orig_lun_io*/ NULL,
- /*done_function*/ cfi_lun_io_done);
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
-
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- ctl_free_io(io);
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_ERROR;
- mtx_unlock(&softc->lock);
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- mtx_unlock(&softc->lock);
- break;
- }
- default:
- panic("invalid metatask type %d", metatask->tasktype);
- break; /* NOTREACHED */
- }
-}
-
-struct cfi_metatask *
-cfi_alloc_metatask(int can_wait)
-{
- struct cfi_metatask *metatask;
- struct cfi_softc *softc;
-
- softc = &fetd_internal_softc;
-
- metatask = uma_zalloc(cfi_metatask_zone,
- (can_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
- if (metatask == NULL)
- return (NULL);
-
- metatask->status = CFI_MT_NONE;
-
- return (metatask);
-}
-
-void
-cfi_free_metatask(struct cfi_metatask *metatask)
-{
-
- uma_zfree(cfi_metatask_zone, metatask);
-}
-
-/*
- * vim: ts=8
- */
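ctl_frontend_internal.c is removed wholesale along with its cfi_metatask machinery; the new ctl_frontend_ioctl.c below takes over the ioctl port. One detail worth noting from the removed error handling: cfi_checkcond_parse() packed an action code (masked by CFI_ERR_MASK) and an orthogonal "do not consume a retry" flag into one enum, so unit attentions could be retried without draining the retry budget. A standalone illustration of that flag-in-enum pattern, with the CFI_ prefixes dropped:

    #include <stdio.h>

    /*
     * Same layout as the removed cfi_error_action: an action code in the
     * low byte plus an orthogonal flag bit above it.
     */
    typedef enum {
    	ERR_RETRY	 = 0x000,
    	ERR_FAIL	 = 0x001,
    	ERR_LUN_RESET	 = 0x002,
    	ERR_MASK	 = 0x0ff,
    	ERR_NO_DECREMENT = 0x100	/* don't consume a retry */
    } error_action;

    int
    main(void)
    {
    	int retries = 3;
    	/* A unit attention: retry, but keep the retry budget intact. */
    	error_action act = ERR_RETRY | ERR_NO_DECREMENT;

    	switch (act & ERR_MASK) {
    	case ERR_FAIL:
    		printf("giving up\n");
    		break;
    	case ERR_RETRY:
    	default:
    		if ((act & ERR_NO_DECREMENT) == 0)
    			retries--;
    		printf("retrying, %d retries left\n", retries);
    		break;
    	}
    	return (0);
    }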
diff --git a/sys/cam/ctl/ctl_frontend_internal.h b/sys/cam/ctl/ctl_frontend_internal.h
deleted file mode 100644
index cb00dc6..0000000
--- a/sys/cam/ctl/ctl_frontend_internal.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*-
- * Copyright (c) 2004 Silicon Graphics International Corp.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * substantially similar to the "NO WARRANTY" disclaimer below
- * ("Disclaimer") and any redistribution must be conditioned upon
- * including a substantially similar Disclaimer requirement for further
- * binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGES.
- *
- * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_frontend_internal.h#1 $
- * $FreeBSD$
- */
-/*
- * CTL kernel internal frontend target driver. This allows kernel-level
- * clients to send commands into CTL.
- *
- * Author: Ken Merry <ken@FreeBSD.org>
- */
-
-#ifndef _CTL_FRONTEND_INTERNAL_H_
-#define _CTL_FRONTEND_INTERNAL_H_
-
-/*
- * These are general metatask error codes. If the error code is CFI_MT_ERROR,
- * check any metatask-specific status codes for more detail on the problem.
- */
-typedef enum {
- CFI_MT_NONE,
- CFI_MT_PORT_OFFLINE,
- CFI_MT_ERROR,
- CFI_MT_SUCCESS
-} cfi_mt_status;
-
-typedef enum {
- CFI_TASK_NONE,
- CFI_TASK_SHUTDOWN,
- CFI_TASK_STARTUP,
- CFI_TASK_BBRREAD
-} cfi_tasktype;
-
-struct cfi_task_startstop {
- int total_luns;
- int luns_complete;
- int luns_failed;
-};
-
-/*
- * Error code description:
- * CFI_BBR_SUCCESS - the read was successful
- * CFI_BBR_LUN_UNCONFIG - CFI probe for this lun hasn't completed
- * CFI_BBR_NO_LUN - this lun doesn't exist, as far as CFI knows
- * CFI_BBR_NO_MEM - memory allocation error
- * CFI_BBR_BAD_LEN - data length isn't a multiple of the blocksize
- * CFI_BBR_RESERV_CONFLICT - another initiator has this lun reserved, so
- * we can't issue I/O at all.
- * CFI_BBR_LUN_STOPPED - the lun is powered off.
- * CFI_BBR_LUN_OFFLINE_CTL - the lun is offline from a CTL standpoint
- * CFI_BBR_LUN_OFFLINE_RC - the lun is offline from a RAIDCore standpoint.
- * This is bad, because it basically means we've
- * had a double failure on the LUN.
- * CFI_BBR_SCSI_ERROR - generic SCSI error, see status byte and sense
- * data for more resolution if you want it.
- * CFI_BBR_ERROR - the catch-all error code.
- */
-typedef enum {
- CFI_BBR_SUCCESS,
- CFI_BBR_LUN_UNCONFIG,
- CFI_BBR_NO_LUN,
- CFI_BBR_NO_MEM,
- CFI_BBR_BAD_LEN,
- CFI_BBR_RESERV_CONFLICT,
- CFI_BBR_LUN_STOPPED,
- CFI_BBR_LUN_OFFLINE_CTL,
- CFI_BBR_LUN_OFFLINE_RC,
- CFI_BBR_SCSI_ERROR,
- CFI_BBR_ERROR,
-} cfi_bbrread_status;
-
-struct cfi_task_bbrread {
- int lun_num; /* lun number */
- uint64_t lba; /* logical block address */
- int len; /* length in bytes */
- cfi_bbrread_status status; /* BBR status */
- uint8_t scsi_status; /* SCSI status */
- struct scsi_sense_data sense_data; /* SCSI sense data */
-};
-
-union cfi_taskinfo {
- struct cfi_task_startstop startstop;
- struct cfi_task_bbrread bbrread;
-};
-
-struct cfi_metatask;
-
-typedef void (*cfi_cb_t)(void *arg, struct cfi_metatask *metatask);
-
-struct cfi_metatask {
- cfi_tasktype tasktype; /* passed to CFI */
- cfi_mt_status status; /* returned from CFI */
- union cfi_taskinfo taskinfo; /* returned from CFI */
- struct ctl_mem_element *element; /* used by CFI, don't touch*/
- cfi_cb_t callback; /* passed to CFI */
- void *callback_arg; /* passed to CFI */
- STAILQ_ENTRY(cfi_metatask) links; /* used by CFI, don't touch*/
-};
-
-#ifdef _KERNEL
-
-MALLOC_DECLARE(M_CTL_CFI);
-
-/*
- * This is the API for sending meta commands (commands that are sent to more
- * than one LUN) to the internal frontend:
- * - Allocate a metatask using cfi_alloc_metatask(). can_wait == 0 means
- * that you're calling from an interrupt context. can_wait == 1 means
- * that you're calling from a thread context and don't mind waiting to
- * allocate memory.
- * - Setup the task type, callback and callback argument.
- * - Call cfi_action().
- * - When the callback comes, note the status and any per-command status
- * (see the taskinfo union) and then free the metatask with
- * cfi_free_metatask().
- */
-struct cfi_metatask *cfi_alloc_metatask(int can_wait);
-void cfi_free_metatask(struct cfi_metatask *metatask);
-void cfi_action(struct cfi_metatask *metatask);
-
-#endif /* _KERNEL */
-
-#endif /* _CTL_FRONTEND_INTERNAL_H_ */
-
-/*
- * vim: ts=8
- */
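For reference, the header removed above documented a three-step metatask flow
(allocate, fill in the task type and callback, call cfi_action(), then free
the metatask from the callback).  A minimal sketch of how a kernel client used
that now-deleted API; the callback body and its printf are illustrative, not
from the source:

	static void
	startup_done_cb(void *arg, struct cfi_metatask *metatask)
	{
		/* Note overall and per-task status, then free the metatask. */
		if (metatask->status == CFI_MT_SUCCESS)
			printf("%d of %d LUNs started\n",
			    metatask->taskinfo.startstop.luns_complete,
			    metatask->taskinfo.startstop.total_luns);
		cfi_free_metatask(metatask);
	}

	static void
	startup_all_luns(void)
	{
		struct cfi_metatask *metatask;

		/* can_wait == 1: thread context, allocation may sleep. */
		metatask = cfi_alloc_metatask(/*can_wait*/ 1);
		metatask->tasktype = CFI_TASK_STARTUP;
		metatask->callback = startup_done_cb;
		metatask->callback_arg = NULL;
		cfi_action(metatask);	/* status arrives via startup_done_cb() */
	}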
diff --git a/sys/cam/ctl/ctl_frontend_ioctl.c b/sys/cam/ctl/ctl_frontend_ioctl.c
new file mode 100644
index 0000000..7d57314
--- /dev/null
+++ b/sys/cam/ctl/ctl_frontend_ioctl.c
@@ -0,0 +1,470 @@
+/*-
+ * Copyright (c) 2003-2009 Silicon Graphics International Corp.
+ * Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2015 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#include <cam/cam.h>
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_da.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_frontend.h>
+#include <cam/ctl/ctl_util.h>
+#include <cam/ctl/ctl_backend.h>
+#include <cam/ctl/ctl_ioctl.h>
+#include <cam/ctl/ctl_ha.h>
+#include <cam/ctl/ctl_private.h>
+#include <cam/ctl/ctl_debug.h>
+#include <cam/ctl/ctl_error.h>
+
+struct cfi_softc {
+ uint32_t cur_tag_num;
+ struct ctl_port port;
+};
+
+static struct cfi_softc cfi_softc;
+
+static int cfi_init(void);
+static void cfi_shutdown(void);
+static void cfi_online(void *arg);
+static void cfi_offline(void *arg);
+static int cfi_lun_enable(void *arg, int lun_id);
+static int cfi_lun_disable(void *arg, int lun_id);
+static void cfi_datamove(union ctl_io *io);
+static void cfi_done(union ctl_io *io);
+
+static struct ctl_frontend cfi_frontend =
+{
+ .name = "ioctl",
+ .init = cfi_init,
+ .shutdown = cfi_shutdown,
+};
+CTL_FRONTEND_DECLARE(ctlioctl, cfi_frontend);
+
+static int
+cfi_init(void)
+{
+ struct cfi_softc *isoftc = &cfi_softc;
+ struct ctl_port *port;
+
+ memset(isoftc, 0, sizeof(*isoftc));
+
+ port = &isoftc->port;
+ port->frontend = &cfi_frontend;
+ port->port_type = CTL_PORT_IOCTL;
+ port->num_requested_ctl_io = 100;
+ port->port_name = "ioctl";
+ port->port_online = cfi_online;
+ port->port_offline = cfi_offline;
+	port->onoff_arg = isoftc;
+ port->lun_enable = cfi_lun_enable;
+ port->lun_disable = cfi_lun_disable;
+	port->targ_lun_arg = isoftc;
+ port->fe_datamove = cfi_datamove;
+ port->fe_done = cfi_done;
+ port->max_targets = 1;
+ port->max_target_id = 0;
+ port->max_initiators = 1;
+
+ if (ctl_port_register(port) != 0) {
+ printf("%s: ioctl port registration failed\n", __func__);
+ return (0);
+ }
+ ctl_port_online(port);
+ return (0);
+}
+
+static void
+cfi_shutdown(void)
+{
+ struct cfi_softc *isoftc = &cfi_softc;
+ struct ctl_port *port;
+
+ port = &isoftc->port;
+ ctl_port_offline(port);
+ if (ctl_port_deregister(&isoftc->port) != 0)
+ printf("%s: ctl_frontend_deregister() failed\n", __func__);
+}
+
+static void
+cfi_online(void *arg)
+{
+}
+
+static void
+cfi_offline(void *arg)
+{
+}
+
+static int
+cfi_lun_enable(void *arg, int lun_id)
+{
+
+ return (0);
+}
+
+static int
+cfi_lun_disable(void *arg, int lun_id)
+{
+
+ return (0);
+}
+
+/*
+ * Data movement routine for the CTL ioctl frontend port.
+ */
+static int
+ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio)
+{
+ struct ctl_sg_entry *ext_sglist, *kern_sglist;
+ struct ctl_sg_entry ext_entry, kern_entry;
+ int ext_sglen, ext_sg_entries, kern_sg_entries;
+ int ext_sg_start, ext_offset;
+ int len_to_copy, len_copied;
+ int kern_watermark, ext_watermark;
+ int ext_sglist_malloced;
+ int i, j;
+
+ ext_sglist_malloced = 0;
+ ext_sg_start = 0;
+ ext_offset = 0;
+
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n"));
+
+ /*
+ * If this flag is set, fake the data transfer.
+ */
+ if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) {
+ ctsio->ext_data_filled = ctsio->ext_data_len;
+ goto bailout;
+ }
+
+ /*
+ * To simplify things here, if we have a single buffer, stick it in
+ * a S/G entry and just make it a single entry S/G list.
+ */
+ if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
+ int len_seen;
+
+ ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
+
+ ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL,
+ M_WAITOK);
+ ext_sglist_malloced = 1;
+ if (copyin(ctsio->ext_data_ptr, ext_sglist,
+ ext_sglen) != 0) {
+ ctl_set_internal_failure(ctsio,
+ /*sks_valid*/ 0,
+ /*retry_count*/ 0);
+ goto bailout;
+ }
+ ext_sg_entries = ctsio->ext_sg_entries;
+ len_seen = 0;
+ for (i = 0; i < ext_sg_entries; i++) {
+ if ((len_seen + ext_sglist[i].len) >=
+ ctsio->ext_data_filled) {
+ ext_sg_start = i;
+ ext_offset = ctsio->ext_data_filled - len_seen;
+ break;
+ }
+ len_seen += ext_sglist[i].len;
+ }
+ } else {
+ ext_sglist = &ext_entry;
+ ext_sglist->addr = ctsio->ext_data_ptr;
+ ext_sglist->len = ctsio->ext_data_len;
+ ext_sg_entries = 1;
+ ext_sg_start = 0;
+ ext_offset = ctsio->ext_data_filled;
+ }
+
+ if (ctsio->kern_sg_entries > 0) {
+ kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
+ kern_sg_entries = ctsio->kern_sg_entries;
+ } else {
+ kern_sglist = &kern_entry;
+ kern_sglist->addr = ctsio->kern_data_ptr;
+ kern_sglist->len = ctsio->kern_data_len;
+ kern_sg_entries = 1;
+ }
+
+ kern_watermark = 0;
+ ext_watermark = ext_offset;
+ len_copied = 0;
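+	/*
+	 * Walk the external (user) and kernel S/G lists in lockstep.  Each
+	 * pass copies the smaller of the two entries' remaining bytes; the
+	 * watermarks record how far into its current entry each side has
+	 * consumed, and an entry is advanced once its watermark reaches
+	 * the entry's length.
+	 */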
+ for (i = ext_sg_start, j = 0;
+ i < ext_sg_entries && j < kern_sg_entries;) {
+ uint8_t *ext_ptr, *kern_ptr;
+
+ len_to_copy = MIN(ext_sglist[i].len - ext_watermark,
+ kern_sglist[j].len - kern_watermark);
+
+ ext_ptr = (uint8_t *)ext_sglist[i].addr;
+ ext_ptr = ext_ptr + ext_watermark;
+ if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
+ /*
+ * XXX KDM fix this!
+ */
+ panic("need to implement bus address support");
+#if 0
+ kern_ptr = bus_to_virt(kern_sglist[j].addr);
+#endif
+ } else
+ kern_ptr = (uint8_t *)kern_sglist[j].addr;
+ kern_ptr = kern_ptr + kern_watermark;
+
+ kern_watermark += len_to_copy;
+ ext_watermark += len_to_copy;
+
+ if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
+ CTL_FLAG_DATA_IN) {
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
+ "bytes to user\n", len_to_copy));
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
+ "to %p\n", kern_ptr, ext_ptr));
+ if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) {
+ ctl_set_internal_failure(ctsio,
+ /*sks_valid*/ 0,
+ /*retry_count*/ 0);
+ goto bailout;
+ }
+ } else {
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
+ "bytes from user\n", len_to_copy));
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
+ "to %p\n", ext_ptr, kern_ptr));
+			if (copyin(ext_ptr, kern_ptr, len_to_copy) != 0) {
+				ctl_set_internal_failure(ctsio,
+				    /*sks_valid*/ 0,
+				    /*retry_count*/ 0);
+ goto bailout;
+ }
+ }
+
+ len_copied += len_to_copy;
+
+ if (ext_sglist[i].len == ext_watermark) {
+ i++;
+ ext_watermark = 0;
+ }
+
+ if (kern_sglist[j].len == kern_watermark) {
+ j++;
+ kern_watermark = 0;
+ }
+ }
+
+ ctsio->ext_data_filled += len_copied;
+
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, "
+ "kern_sg_entries: %d\n", ext_sg_entries,
+ kern_sg_entries));
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, "
+ "kern_data_len = %d\n", ctsio->ext_data_len,
+ ctsio->kern_data_len));
+
+ /* XXX KDM set residual?? */
+bailout:
+
+ if (ext_sglist_malloced != 0)
+ free(ext_sglist, M_CTL);
+
+ return (CTL_RETVAL_COMPLETE);
+}
+
+static void
+cfi_datamove(union ctl_io *io)
+{
+ struct ctl_fe_ioctl_params *params;
+
+ params = (struct ctl_fe_ioctl_params *)
+ io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
+
+ mtx_lock(&params->ioctl_mtx);
+ params->state = CTL_IOCTL_DATAMOVE;
+ cv_broadcast(&params->sem);
+ mtx_unlock(&params->ioctl_mtx);
+}
+
+static void
+cfi_done(union ctl_io *io)
+{
+ struct ctl_fe_ioctl_params *params;
+
+ params = (struct ctl_fe_ioctl_params *)
+ io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
+
+ mtx_lock(&params->ioctl_mtx);
+ params->state = CTL_IOCTL_DONE;
+ cv_broadcast(&params->sem);
+ mtx_unlock(&params->ioctl_mtx);
+}
+
+static int
+cfi_submit_wait(union ctl_io *io)
+{
+ struct ctl_fe_ioctl_params params;
+ ctl_fe_ioctl_state last_state;
+ int done, retval;
+
+ retval = 0;
+
+ bzero(&params, sizeof(params));
+
+ mtx_init(&params.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF);
+ cv_init(&params.sem, "ctlioccv");
+ params.state = CTL_IOCTL_INPROG;
+ last_state = params.state;
+
+ io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = &params;
+
+ CTL_DEBUG_PRINT(("cfi_submit_wait\n"));
+
+ /* This shouldn't happen */
+ if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE)
+ return (retval);
+
+ done = 0;
+
+ do {
+ mtx_lock(&params.ioctl_mtx);
+ /*
+ * Check the state here, and don't sleep if the state has
+	 * already changed (i.e. wakeup has already occurred, but we
+ * weren't waiting yet).
+ */
+ if (params.state == last_state) {
+ /* XXX KDM cv_wait_sig instead? */
+ cv_wait(&params.sem, &params.ioctl_mtx);
+ }
+ last_state = params.state;
+
+ switch (params.state) {
+ case CTL_IOCTL_INPROG:
+ /* Why did we wake up? */
+ /* XXX KDM error here? */
+ mtx_unlock(&params.ioctl_mtx);
+ break;
+ case CTL_IOCTL_DATAMOVE:
+ CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n"));
+
+ /*
+ * change last_state back to INPROG to avoid
+ * deadlock on subsequent data moves.
+ */
+ params.state = last_state = CTL_IOCTL_INPROG;
+
+ mtx_unlock(&params.ioctl_mtx);
+ ctl_ioctl_do_datamove(&io->scsiio);
+ /*
+ * Note that in some cases, most notably writes,
+ * this will queue the I/O and call us back later.
+ * In other cases, generally reads, this routine
+ * will immediately call back and wake us up,
+ * probably using our own context.
+ */
+ io->scsiio.be_move_done(io);
+ break;
+ case CTL_IOCTL_DONE:
+ mtx_unlock(&params.ioctl_mtx);
+ CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n"));
+ done = 1;
+ break;
+ default:
+ mtx_unlock(&params.ioctl_mtx);
+ /* XXX KDM error here? */
+ break;
+ }
+ } while (done == 0);
+
+ mtx_destroy(&params.ioctl_mtx);
+ cv_destroy(&params.sem);
+
+ return (CTL_RETVAL_COMPLETE);
+}
+
+int
+ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
+ struct thread *td)
+{
+ union ctl_io *io;
+ void *pool_tmp;
+ int retval = 0;
+
+ /*
+ * If we haven't been "enabled", don't allow any SCSI I/O
+ * to this FETD.
+ */
+ if ((cfi_softc.port.status & CTL_PORT_STATUS_ONLINE) == 0)
+ return (EPERM);
+
+ io = ctl_alloc_io(cfi_softc.port.ctl_pool_ref);
+
+ /*
+	 * Save the pool reference so it doesn't get overwritten by
+	 * the copy-in of the user's ctl_io.
+ */
+ pool_tmp = io->io_hdr.pool;
+ memcpy(io, (void *)addr, sizeof(*io));
+ io->io_hdr.pool = pool_tmp;
+
+ /*
+ * No status yet, so make sure the status is set properly.
+ */
+ io->io_hdr.status = CTL_STATUS_NONE;
+
+ /*
+ * The user sets the initiator ID, target and LUN IDs.
+ */
+ io->io_hdr.nexus.targ_port = cfi_softc.port.targ_port;
+ io->io_hdr.flags |= CTL_FLAG_USER_REQ;
+ if ((io->io_hdr.io_type == CTL_IO_SCSI) &&
+ (io->scsiio.tag_type != CTL_TAG_UNTAGGED))
+ io->scsiio.tag_num = cfi_softc.cur_tag_num++;
+
+ retval = cfi_submit_wait(io);
+ if (retval == 0)
+ memcpy((void *)addr, io, sizeof(*io));
+ ctl_free_io(io);
+ return (retval);
+}
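cfi_submit_wait() above is one half of a classic condition-variable state
machine: cfi_datamove() and cfi_done() flip params->state under the mutex and
cv_broadcast(), while the submitter re-checks the state before sleeping so a
wakeup that fires before it waits is not lost.  The same handshake in portable
user-space C, as a sketch (all names hypothetical):

	#include <pthread.h>
	#include <stdio.h>

	enum io_state { INPROG, DATAMOVE, DONE };

	struct params {
		pthread_mutex_t	mtx;
		pthread_cond_t	cv;
		enum io_state	state;
	};

	/* Plays the role of cfi_datamove() followed by cfi_done(). */
	static void *
	backend(void *arg)
	{
		struct params *p = arg;

		pthread_mutex_lock(&p->mtx);
		p->state = DATAMOVE;		/* request a data move */
		pthread_cond_broadcast(&p->cv);
		while (p->state == DATAMOVE)	/* wait until it is consumed */
			pthread_cond_wait(&p->cv, &p->mtx);
		p->state = DONE;		/* complete the I/O */
		pthread_cond_broadcast(&p->cv);
		pthread_mutex_unlock(&p->mtx);
		return (NULL);
	}

	int
	main(void)
	{
		struct params p = { PTHREAD_MUTEX_INITIALIZER,
		    PTHREAD_COND_INITIALIZER, INPROG };
		enum io_state last = INPROG;
		pthread_t t;
		int done = 0;

		pthread_create(&t, NULL, backend, &p);
		do {
			pthread_mutex_lock(&p.mtx);
			/* Don't sleep if the state already changed. */
			if (p.state == last)
				pthread_cond_wait(&p.cv, &p.mtx);
			last = p.state;
			switch (p.state) {
			case DATAMOVE:
				printf("data move\n");
				/* Re-arm, as cfi_submit_wait() does. */
				p.state = last = INPROG;
				pthread_cond_broadcast(&p.cv);
				break;
			case DONE:
				done = 1;
				break;
			default:
				break;
			}
			pthread_mutex_unlock(&p.mtx);
		} while (done == 0);
		pthread_join(t, NULL);
		return (0);
	}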
diff --git a/sys/cam/ctl/ctl_frontend_iscsi.c b/sys/cam/ctl/ctl_frontend_iscsi.c
index 652c961..7f8f8a8 100644
--- a/sys/cam/ctl/ctl_frontend_iscsi.c
+++ b/sys/cam/ctl/ctl_frontend_iscsi.c
@@ -61,7 +61,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cam/ctl/ctl_ioctl.h b/sys/cam/ctl/ctl_ioctl.h
index c7a3c29..f62bbe1 100644
--- a/sys/cam/ctl/ctl_ioctl.h
+++ b/sys/cam/ctl/ctl_ioctl.h
@@ -92,23 +92,6 @@ struct ctl_ooa_info {
ctl_ooa_status status; /* Returned from CTL */
};
-struct ctl_hard_startstop_info {
- cfi_mt_status status;
- int total_luns;
- int luns_complete;
- int luns_failed;
-};
-
-struct ctl_bbrread_info {
- int lun_num; /* Passed in to CTL */
- uint64_t lba; /* Passed in to CTL */
- int len; /* Passed in to CTL */
- cfi_mt_status status; /* Returned from CTL */
- cfi_bbrread_status bbr_status; /* Returned from CTL */
- uint8_t scsi_status; /* Returned from CTL */
- struct scsi_sense_data sense_data; /* Returned from CTL */
-};
-
typedef enum {
CTL_DELAY_TYPE_NONE,
CTL_DELAY_TYPE_CONT,
@@ -828,10 +811,6 @@ struct ctl_lun_map {
#define CTL_DISABLE_PORT _IOW(CTL_MINOR, 0x05, struct ctl_port_entry)
#define CTL_DUMP_OOA _IO(CTL_MINOR, 0x06)
#define CTL_CHECK_OOA _IOWR(CTL_MINOR, 0x07, struct ctl_ooa_info)
-#define CTL_HARD_STOP _IOR(CTL_MINOR, 0x08, \
- struct ctl_hard_startstop_info)
-#define CTL_HARD_START _IOR(CTL_MINOR, 0x09, \
- struct ctl_hard_startstop_info)
#define CTL_DELAY_IO _IOWR(CTL_MINOR, 0x10, struct ctl_io_delay_info)
#define CTL_REALSYNC_GET _IOR(CTL_MINOR, 0x11, int)
#define CTL_REALSYNC_SET _IOW(CTL_MINOR, 0x12, int)
@@ -839,7 +818,6 @@ struct ctl_lun_map {
#define CTL_GETSYNC _IOWR(CTL_MINOR, 0x14, struct ctl_sync_info)
#define CTL_GETSTATS _IOWR(CTL_MINOR, 0x15, struct ctl_stats)
#define CTL_ERROR_INJECT _IOWR(CTL_MINOR, 0x16, struct ctl_error_desc)
-#define CTL_BBRREAD _IOWR(CTL_MINOR, 0x17, struct ctl_bbrread_info)
#define CTL_GET_OOA _IOWR(CTL_MINOR, 0x18, struct ctl_ooa)
#define CTL_DUMP_STRUCTS _IO(CTL_MINOR, 0x19)
#define CTL_GET_PORT_LIST _IOWR(CTL_MINOR, 0x20, struct ctl_port_list)
diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h
index a038552..6f7379a 100644
--- a/sys/cam/ctl/ctl_private.h
+++ b/sys/cam/ctl/ctl_private.h
@@ -47,18 +47,6 @@
#define CTL_PROCESSOR_PRODUCT "CTLPROCESSOR "
#define CTL_UNKNOWN_PRODUCT "CTLDEVICE "
-struct ctl_fe_ioctl_startstop_info {
- struct cv sem;
- struct ctl_hard_startstop_info hs_info;
-};
-
-struct ctl_fe_ioctl_bbrread_info {
- struct cv sem;
- struct ctl_bbrread_info *bbr_info;
- int wakeup_done;
- struct mtx *lock;
-};
-
typedef enum {
CTL_IOCTL_INPROG,
CTL_IOCTL_DATAMOVE,
@@ -81,18 +69,6 @@ struct ctl_io_pool {
};
typedef enum {
- CTL_IOCTL_FLAG_NONE = 0x00,
- CTL_IOCTL_FLAG_ENABLED = 0x01
-} ctl_ioctl_flags;
-
-struct ctl_ioctl_info {
- ctl_ioctl_flags flags;
- uint32_t cur_tag_num;
- struct ctl_port port;
- char port_name[24];
-};
-
-typedef enum {
CTL_SER_BLOCK,
CTL_SER_BLOCKOPT,
CTL_SER_EXTENT,
@@ -472,7 +448,6 @@ struct ctl_softc {
int inquiry_pq_no_lun;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
- struct ctl_ioctl_info ioctl_info;
void *othersc_pool;
struct proc *ctl_proc;
int targ_online;
diff --git a/sys/cam/ctl/ctl_tpc.c b/sys/cam/ctl/ctl_tpc.c
index 662ee3d..b1b674f 100644
--- a/sys/cam/ctl/ctl_tpc.c
+++ b/sys/cam/ctl/ctl_tpc.c
@@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cam/ctl/ctl_tpc_local.c b/sys/cam/ctl/ctl_tpc_local.c
index d0319ee..fb1f2ac 100644
--- a/sys/cam/ctl/ctl_tpc_local.c
+++ b/sys/cam/ctl/ctl_tpc_local.c
@@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cddl/compat/opensolaris/sys/nvpair.h b/sys/cddl/compat/opensolaris/sys/nvpair.h
index c90ab70..33b62cb 100644
--- a/sys/cddl/compat/opensolaris/sys/nvpair.h
+++ b/sys/cddl/compat/opensolaris/sys/nvpair.h
@@ -42,29 +42,19 @@
*/
#define nvlist_add_binary illumos_nvlist_add_binary
#define nvlist_add_bool illumos_nvlist_add_bool
+#define nvlist_add_bool_array illumos_nvlist_add_bool_array
#define nvlist_add_descriptor illumos_nvlist_add_descriptor
+#define nvlist_add_descriptor_array illumos_nvlist_add_descriptor_array
#define nvlist_add_null illumos_nvlist_add_null
#define nvlist_add_number illumos_nvlist_add_number
+#define nvlist_add_number_array illumos_nvlist_add_number_array
#define nvlist_add_nvlist illumos_nvlist_add_nvlist
+#define nvlist_add_nvlist_array illumos_nvlist_add_nvlist_array
#define nvlist_add_nvpair illumos_nvlist_add_nvpair
#define nvlist_add_string illumos_nvlist_add_string
+#define nvlist_add_string_array illumos_nvlist_add_string_array
#define nvlist_add_stringf illumos_nvlist_add_stringf
#define nvlist_add_stringv illumos_nvlist_add_stringv
-#define nvlist_addf_binary illumos_nvlist_addf_binary
-#define nvlist_addf_bool illumos_nvlist_addf_bool
-#define nvlist_addf_descriptor illumos_nvlist_addf_descriptor
-#define nvlist_addf_null illumos_nvlist_addf_null
-#define nvlist_addf_number illumos_nvlist_addf_number
-#define nvlist_addf_nvlist illumos_nvlist_addf_nvlist
-#define nvlist_addf_string illumos_nvlist_addf_string
-#define nvlist_addv_binary illumos_nvlist_addv_binary
-#define nvlist_addv_bool illumos_nvlist_addv_bool
-#define nvlist_addv_descriptor illumos_nvlist_addv_descriptor
-#define nvlist_addv_null illumos_nvlist_addv_null
-#define nvlist_addv_number illumos_nvlist_addv_number
-#define nvlist_addv_nvlist illumos_nvlist_addv_nvlist
-#define nvlist_addv_string illumos_nvlist_addv_string
-#define nvlist_check_header illumos_nvlist_check_header
#define nvlist_clone illumos_nvlist_clone
#define nvlist_create illumos_nvlist_create
#define nvlist_descriptors illumos_nvlist_descriptors
@@ -75,92 +65,61 @@
#define nvlist_exists illumos_nvlist_exists
#define nvlist_exists_binary illumos_nvlist_exists_binary
#define nvlist_exists_bool illumos_nvlist_exists_bool
+#define nvlist_exists_bool_array illumos_nvlist_exists_bool_array
#define nvlist_exists_descriptor illumos_nvlist_exists_descriptor
+#define nvlist_exists_descriptor_array illumos_nvlist_exists_descriptor_array
#define nvlist_exists_null illumos_nvlist_exists_null
#define nvlist_exists_number illumos_nvlist_exists_number
+#define nvlist_exists_number_array illumos_nvlist_exists_number_array
#define nvlist_exists_nvlist illumos_nvlist_exists_nvlist
+#define nvlist_exists_nvlist_array illumos_nvlist_exists_nvlist_array
#define nvlist_exists_string illumos_nvlist_exists_string
+#define nvlist_exists_string_array illumos_nvlist_exists_string_array
#define nvlist_exists_type illumos_nvlist_exists_type
-#define nvlist_existsf illumos_nvlist_existsf
-#define nvlist_existsf_binary illumos_nvlist_existsf_binary
-#define nvlist_existsf_bool illumos_nvlist_existsf_bool
-#define nvlist_existsf_descriptor illumos_nvlist_existsf_descriptor
-#define nvlist_existsf_null illumos_nvlist_existsf_null
-#define nvlist_existsf_number illumos_nvlist_existsf_number
-#define nvlist_existsf_nvlist illumos_nvlist_existsf_nvlist
-#define nvlist_existsf_string illumos_nvlist_existsf_string
-#define nvlist_existsf_type illumos_nvlist_existsf_type
-#define nvlist_existsv illumos_nvlist_existsv
-#define nvlist_existsv_binary illumos_nvlist_existsv_binary
-#define nvlist_existsv_bool illumos_nvlist_existsv_bool
-#define nvlist_existsv_descriptor illumos_nvlist_existsv_descriptor
-#define nvlist_existsv_null illumos_nvlist_existsv_null
-#define nvlist_existsv_number illumos_nvlist_existsv_number
-#define nvlist_existsv_nvlist illumos_nvlist_existsv_nvlist
-#define nvlist_existsv_string illumos_nvlist_existsv_string
-#define nvlist_existsv_type illumos_nvlist_existsv_type
#define nvlist_fdump illumos_nvlist_fdump
#define nvlist_first_nvpair illumos_nvlist_first_nvpair
+#define nvlist_flags illumos_nvlist_flags
#define nvlist_free illumos_nvlist_free
#define nvlist_free_binary illumos_nvlist_free_binary
+#define nvlist_free_binary_array illumos_nvlist_free_binary_array
#define nvlist_free_bool illumos_nvlist_free_bool
+#define nvlist_free_bool_array illumos_nvlist_free_bool_array
#define nvlist_free_descriptor illumos_nvlist_free_descriptor
+#define nvlist_free_descriptor_array illumos_nvlist_free_descriptor_array
#define nvlist_free_null illumos_nvlist_free_null
#define nvlist_free_number illumos_nvlist_free_number
+#define nvlist_free_number_array illumos_nvlist_free_number_array
#define nvlist_free_nvlist illumos_nvlist_free_nvlist
+#define nvlist_free_nvlist_array illumos_nvlist_free_nvlist_array
#define nvlist_free_nvpair illumos_nvlist_free_nvpair
#define nvlist_free_string illumos_nvlist_free_string
+#define nvlist_free_string_array illumos_nvlist_free_string_array
#define nvlist_free_type illumos_nvlist_free_type
-#define nvlist_freef illumos_nvlist_freef
-#define nvlist_freef_binary illumos_nvlist_freef_binary
-#define nvlist_freef_bool illumos_nvlist_freef_bool
-#define nvlist_freef_descriptor illumos_nvlist_freef_descriptor
-#define nvlist_freef_null illumos_nvlist_freef_null
-#define nvlist_freef_number illumos_nvlist_freef_number
-#define nvlist_freef_nvlist illumos_nvlist_freef_nvlist
-#define nvlist_freef_string illumos_nvlist_freef_string
-#define nvlist_freef_type illumos_nvlist_freef_type
-#define nvlist_freev illumos_nvlist_freev
-#define nvlist_freev_binary illumos_nvlist_freev_binary
-#define nvlist_freev_bool illumos_nvlist_freev_bool
-#define nvlist_freev_descriptor illumos_nvlist_freev_descriptor
-#define nvlist_freev_null illumos_nvlist_freev_null
-#define nvlist_freev_number illumos_nvlist_freev_number
-#define nvlist_freev_nvlist illumos_nvlist_freev_nvlist
-#define nvlist_freev_string illumos_nvlist_freev_string
-#define nvlist_freev_type illumos_nvlist_freev_type
+#define nvlist_get_array_next illumos_nvlist_get_array_next
#define nvlist_get_binary illumos_nvlist_get_binary
#define nvlist_get_bool illumos_nvlist_get_bool
+#define nvlist_get_bool_array illumos_nvlist_get_bool_array
#define nvlist_get_descriptor illumos_nvlist_get_descriptor
+#define nvlist_get_descriptor_array illumos_nvlist_get_descriptor_array
#define nvlist_get_number illumos_nvlist_get_number
+#define nvlist_get_number_array illumos_nvlist_get_number_array
#define nvlist_get_nvlist illumos_nvlist_get_nvlist
#define nvlist_get_nvpair illumos_nvlist_get_nvpair
+#define nvlist_get_nvpair_parent illumos_nvlist_get_nvpair_parent
+#define nvlist_get_pararr illumos_nvlist_get_pararr
+#define nvlist_get_parent illumos_nvlist_get_parent
#define nvlist_get_string illumos_nvlist_get_string
-#define nvlist_getf_binary illumos_nvlist_getf_binary
-#define nvlist_getf_bool illumos_nvlist_getf_bool
-#define nvlist_getf_descriptor illumos_nvlist_getf_descriptor
-#define nvlist_getf_number illumos_nvlist_getf_number
-#define nvlist_getf_nvlist illumos_nvlist_getf_nvlist
-#define nvlist_getf_string illumos_nvlist_getf_string
-#define nvlist_getv_binary illumos_nvlist_getv_binary
-#define nvlist_getv_bool illumos_nvlist_getv_bool
-#define nvlist_getv_descriptor illumos_nvlist_getv_descriptor
-#define nvlist_getv_number illumos_nvlist_getv_number
-#define nvlist_getv_nvlist illumos_nvlist_getv_nvlist
-#define nvlist_getv_string illumos_nvlist_getv_string
+#define nvlist_in_array illumos_nvlist_in_array
#define nvlist_move_binary illumos_nvlist_move_binary
+#define nvlist_move_bool_array illumos_nvlist_move_bool_array
#define nvlist_move_descriptor illumos_nvlist_move_descriptor
+#define nvlist_move_descriptor_array illumos_nvlist_move_descriptor_array
+#define nvlist_move_number_array illumos_nvlist_move_number_array
#define nvlist_move_nvlist illumos_nvlist_move_nvlist
+#define nvlist_move_nvlist_array illumos_nvlist_move_nvlist_array
#define nvlist_move_nvpair illumos_nvlist_move_nvpair
#define nvlist_move_string illumos_nvlist_move_string
-#define nvlist_movef_binary illumos_nvlist_movef_binary
-#define nvlist_movef_descriptor illumos_nvlist_movef_descriptor
-#define nvlist_movef_nvlist illumos_nvlist_movef_nvlist
-#define nvlist_movef_string illumos_nvlist_movef_string
-#define nvlist_movev_binary illumos_nvlist_movev_binary
-#define nvlist_movev_descriptor illumos_nvlist_movev_descriptor
-#define nvlist_movev_nvlist illumos_nvlist_movev_nvlist
-#define nvlist_movev_string illumos_nvlist_movev_string
+#define nvlist_move_string_array illumos_nvlist_move_string_array
#define nvlist_ndescriptors illumos_nvlist_ndescriptors
#define nvlist_next illumos_nvlist_next
#define nvlist_next_nvpair illumos_nvlist_next_nvpair
@@ -168,93 +127,101 @@
#define nvlist_prev_nvpair illumos_nvlist_prev_nvpair
#define nvlist_recv illumos_nvlist_recv
#define nvlist_remove_nvpair illumos_nvlist_remove_nvpair
-#define nvlist_report_missing illumos_nvlist_report_missing
#define nvlist_send illumos_nvlist_send
+#define nvlist_set_array_next illumos_nvlist_set_array_next
#define nvlist_set_error illumos_nvlist_set_error
+#define nvlist_set_flags illumos_nvlist_set_flags
+#define nvlist_set_parent illumos_nvlist_set_parent
#define nvlist_size illumos_nvlist_size
#define nvlist_take_binary illumos_nvlist_take_binary
#define nvlist_take_bool illumos_nvlist_take_bool
+#define nvlist_take_bool_array illumos_nvlist_take_bool_array
#define nvlist_take_descriptor illumos_nvlist_take_descriptor
+#define nvlist_take_descriptor_array illumos_nvlist_take_descriptor_array
#define nvlist_take_number illumos_nvlist_take_number
+#define nvlist_take_number_array illumos_nvlist_take_number_array
#define nvlist_take_nvlist illumos_nvlist_take_nvlist
+#define nvlist_take_nvlist_array illumos_nvlist_take_nvlist_array
#define nvlist_take_nvpair illumos_nvlist_take_nvpair
#define nvlist_take_string illumos_nvlist_take_string
-#define nvlist_takef_binary illumos_nvlist_takef_binary
-#define nvlist_takef_bool illumos_nvlist_takef_bool
-#define nvlist_takef_descriptor illumos_nvlist_takef_descriptor
-#define nvlist_takef_number illumos_nvlist_takef_number
-#define nvlist_takef_nvlist illumos_nvlist_takef_nvlist
-#define nvlist_takef_string illumos_nvlist_takef_string
-#define nvlist_takev_binary illumos_nvlist_takev_binary
-#define nvlist_takev_bool illumos_nvlist_takev_bool
-#define nvlist_takev_descriptor illumos_nvlist_takev_descriptor
-#define nvlist_takev_number illumos_nvlist_takev_number
-#define nvlist_takev_nvlist illumos_nvlist_takev_nvlist
-#define nvlist_takev_string illumos_nvlist_takev_string
+#define nvlist_take_string_array illumos_nvlist_take_string_array
#define nvlist_unpack illumos_nvlist_unpack
+#define nvlist_unpack_header illumos_nvlist_unpack_header
#define nvlist_xfer illumos_nvlist_xfer
-#define nvlist_xpack illumos_nvlist_xpack
-#define nvlist_xunpack illumos_nvlist_xunpack
-#define nvpair_allocv illumos_nvpair_allocv
#define nvpair_assert illumos_nvpair_assert
#define nvpair_clone illumos_nvpair_clone
#define nvpair_create_binary illumos_nvpair_create_binary
#define nvpair_create_bool illumos_nvpair_create_bool
+#define nvpair_create_bool_array illumos_nvpair_create_bool_array
#define nvpair_create_descriptor illumos_nvpair_create_descriptor
+#define nvpair_create_descriptor_array illumos_nvpair_create_descriptor_array
#define nvpair_create_null illumos_nvpair_create_null
#define nvpair_create_number illumos_nvpair_create_number
+#define nvpair_create_number_array illumos_nvpair_create_number_array
#define nvpair_create_nvlist illumos_nvpair_create_nvlist
+#define nvpair_create_nvlist_array illumos_nvpair_create_nvlist_array
#define nvpair_create_string illumos_nvpair_create_string
+#define nvpair_create_string_array illumos_nvpair_create_string_array
#define nvpair_create_stringf illumos_nvpair_create_stringf
#define nvpair_create_stringv illumos_nvpair_create_stringv
-#define nvpair_createf_binary illumos_nvpair_createf_binary
-#define nvpair_createf_bool illumos_nvpair_createf_bool
-#define nvpair_createf_descriptor illumos_nvpair_createf_descriptor
-#define nvpair_createf_null illumos_nvpair_createf_null
-#define nvpair_createf_number illumos_nvpair_createf_number
-#define nvpair_createf_nvlist illumos_nvpair_createf_nvlist
-#define nvpair_createf_string illumos_nvpair_createf_string
-#define nvpair_createv_binary illumos_nvpair_createv_binary
-#define nvpair_createv_bool illumos_nvpair_createv_bool
-#define nvpair_createv_descriptor illumos_nvpair_createv_descriptor
-#define nvpair_createv_null illumos_nvpair_createv_null
-#define nvpair_createv_number illumos_nvpair_createv_number
-#define nvpair_createv_nvlist illumos_nvpair_createv_nvlist
-#define nvpair_createv_string illumos_nvpair_createv_string
#define nvpair_free illumos_nvpair_free
#define nvpair_free_structure illumos_nvpair_free_structure
#define nvpair_get_binary illumos_nvpair_get_binary
#define nvpair_get_bool illumos_nvpair_get_bool
+#define nvpair_get_bool_array illumos_nvpair_get_bool_array
#define nvpair_get_descriptor illumos_nvpair_get_descriptor
+#define nvpair_get_descriptor_array illumos_nvpair_get_descriptor_array
#define nvpair_get_number illumos_nvpair_get_number
+#define nvpair_get_number_array illumos_nvpair_get_number_array
#define nvpair_get_nvlist illumos_nvpair_get_nvlist
#define nvpair_get_string illumos_nvpair_get_string
#define nvpair_header_size illumos_nvpair_header_size
+#define nvpair_init_datasize illumos_nvpair_init_datasize
#define nvpair_insert illumos_nvpair_insert
#define nvpair_move_binary illumos_nvpair_move_binary
+#define nvpair_move_bool_array illumos_nvpair_move_bool_array
#define nvpair_move_descriptor illumos_nvpair_move_descriptor
+#define nvpair_move_descriptor_array illumos_nvpair_move_descriptor_array
+#define nvpair_move_number_array illumos_nvpair_move_number_array
#define nvpair_move_nvlist illumos_nvpair_move_nvlist
+#define nvpair_move_nvlist_array illumos_nvpair_move_nvlist_array
#define nvpair_move_string illumos_nvpair_move_string
-#define nvpair_movef_binary illumos_nvpair_movef_binary
-#define nvpair_movef_descriptor illumos_nvpair_movef_descriptor
-#define nvpair_movef_nvlist illumos_nvpair_movef_nvlist
-#define nvpair_movef_string illumos_nvpair_movef_string
-#define nvpair_movev_binary illumos_nvpair_movev_binary
-#define nvpair_movev_descriptor illumos_nvpair_movev_descriptor
-#define nvpair_movev_nvlist illumos_nvpair_movev_nvlist
-#define nvpair_movev_string illumos_nvpair_movev_string
+#define nvpair_move_string_array illumos_nvpair_move_string_array
#define nvpair_name illumos_nvpair_name
#define nvpair_next illumos_nvpair_next
#define nvpair_nvlist illumos_nvpair_nvlist
-#define nvpair_pack illumos_nvpair_pack
+#define nvpair_pack_binary illumos_nvpair_pack_binary
+#define nvpair_pack_bool illumos_nvpair_pack_bool
+#define nvpair_pack_bool_array illumos_nvpair_pack_bool_array
#define nvpair_pack_descriptor illumos_nvpair_pack_descriptor
+#define nvpair_pack_descriptor_array illumos_nvpair_pack_descriptor_array
+#define nvpair_pack_header illumos_nvpair_pack_header
+#define nvpair_pack_null illumos_nvpair_pack_null
+#define nvpair_pack_number illumos_nvpair_pack_number
+#define nvpair_pack_number_array illumos_nvpair_pack_number_array
+#define nvpair_pack_nvlist_array_next illumos_nvpair_pack_nvlist_array_next
+#define nvpair_pack_nvlist_up illumos_nvpair_pack_nvlist_up
+#define nvpair_pack_string illumos_nvpair_pack_string
+#define nvpair_pack_string_array illumos_nvpair_pack_string_array
#define nvpair_prev illumos_nvpair_prev
#define nvpair_remove illumos_nvpair_remove
#define nvpair_size illumos_nvpair_size
#define nvpair_type illumos_nvpair_type
#define nvpair_type_string illumos_nvpair_type_string
#define nvpair_unpack illumos_nvpair_unpack
+#define nvpair_unpack_binary illumos_nvpair_unpack_binary
+#define nvpair_unpack_bool illumos_nvpair_unpack_bool
+#define nvpair_unpack_bool_array illumos_nvpair_unpack_bool_array
#define nvpair_unpack_descriptor illumos_nvpair_unpack_descriptor
+#define nvpair_unpack_descriptor_array illumos_nvpair_unpack_descriptor_array
+#define nvpair_unpack_header illumos_nvpair_unpack_header
+#define nvpair_unpack_null illumos_nvpair_unpack_null
+#define nvpair_unpack_number illumos_nvpair_unpack_number
+#define nvpair_unpack_number_array illumos_nvpair_unpack_number_array
+#define nvpair_unpack_nvlist illumos_nvpair_unpack_nvlist
+#define nvpair_unpack_nvlist_array illumos_nvpair_unpack_nvlist_array
+#define nvpair_unpack_string illumos_nvpair_unpack_string
+#define nvpair_unpack_string_array illumos_nvpair_unpack_string_array
#endif /* _KERNEL */
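The churn above is a symbol-renaming shim: each libnv routine is #defined to
an illumos_-prefixed name so FreeBSD's native nvlist code can coexist in the
kernel with the Solaris-derived nvpair code, and this change merely syncs the
shim with the current libnv symbol set (the new array and pack/unpack variants
in, the dropped addf/addv-style variants out).  A standalone illustration of
the renaming trick, with hypothetical names:

	#include <stdio.h>

	#define	frob	illumos_frob	/* every use of frob() below, including
					   the definition, becomes illumos_frob */

	static int
	frob(int x)
	{
		return (x + 1);
	}

	int
	main(void)
	{
		printf("%d\n", frob(41));	/* links against illumos_frob() */
		return (0);
	}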
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
index 52a355d..d59fbf0 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
@@ -129,15 +129,15 @@ zfeature_depends_on(spa_feature_t fid, spa_feature_t check) {
static void
zfeature_register(spa_feature_t fid, const char *guid, const char *name,
- const char *desc, boolean_t readonly, boolean_t mos,
- boolean_t activate_on_enable, const spa_feature_t *deps)
+ const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
{
zfeature_info_t *feature = &spa_feature_table[fid];
static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
ASSERT(name != NULL);
ASSERT(desc != NULL);
- ASSERT(!readonly || !mos);
+ ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
+ (flags & ZFEATURE_FLAG_MOS) == 0);
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(zfeature_is_valid_guid(guid));
@@ -148,9 +148,7 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name,
feature->fi_guid = guid;
feature->fi_uname = name;
feature->fi_desc = desc;
- feature->fi_can_readonly = readonly;
- feature->fi_mos = mos;
- feature->fi_activate_on_enable = activate_on_enable;
+ feature->fi_flags = flags;
feature->fi_depends = deps;
}
@@ -159,45 +157,46 @@ zpool_feature_init(void)
{
zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy",
- "Destroy filesystems asynchronously.", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Destroy filesystems asynchronously.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
- "Snapshots use less space.", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Snapshots use less space.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
"org.illumos:lz4_compress", "lz4_compress",
- "LZ4 compression algorithm support.", B_FALSE, B_FALSE,
- B_TRUE, NULL);
+ "LZ4 compression algorithm support.",
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
"com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
- "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE,
- B_FALSE, NULL);
+ "Crash dumps to multiple vdev pools.",
+ 0, NULL);
zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
"com.delphix:spacemap_histogram", "spacemap_histogram",
- "Spacemaps maintain space histograms.", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Spacemaps maintain space histograms.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_ENABLED_TXG,
"com.delphix:enabled_txg", "enabled_txg",
- "Record txg at which a feature is enabled", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Record txg at which a feature is enabled",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_NONE };
zfeature_register(SPA_FEATURE_HOLE_BIRTH,
"com.delphix:hole_birth", "hole_birth",
"Retain hole birth txg for more precise zfs send",
- B_FALSE, B_TRUE, B_TRUE, hole_birth_deps);
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ hole_birth_deps);
zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
"com.delphix:extensible_dataset", "extensible_dataset",
"Enhanced dataset functionality, used by other features.",
- B_FALSE, B_FALSE, B_FALSE, NULL);
+ 0, NULL);
static const spa_feature_t bookmarks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
@@ -206,7 +205,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_BOOKMARKS,
"com.delphix:bookmarks", "bookmarks",
"\"zfs bookmark\" command",
- B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
+ ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
static const spa_feature_t filesystem_limits_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
@@ -214,13 +213,14 @@ zpool_feature_init(void)
};
zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
"com.joyent:filesystem_limits", "filesystem_limits",
- "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE,
- filesystem_limits_deps);
+ "Filesystem and snapshot limits.",
+ ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
- B_FALSE, B_TRUE, B_TRUE, NULL);
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ NULL);
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
@@ -228,6 +228,6 @@ zpool_feature_init(void)
};
zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
"org.open-zfs:large_blocks", "large_blocks",
- "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
- large_blocks_deps);
+ "Support for blocks larger than 128KB.",
+ ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
index 4ffe435..0e88a9a 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -56,15 +56,23 @@ typedef enum spa_feature {
#define SPA_FEATURE_DISABLED (-1ULL)
+typedef enum zfeature_flags {
+ /* Can open pool readonly even if this feature is not supported. */
+ ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0),
+ /* Is this feature necessary to read the MOS? */
+ ZFEATURE_FLAG_MOS = (1 << 1),
+ /* Activate this feature at the same time it is enabled. */
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2),
+ /* Each dataset has a field set if it has ever used this feature. */
+ ZFEATURE_FLAG_PER_DATASET = (1 << 3)
+} zfeature_flags_t;
+
typedef struct zfeature_info {
spa_feature_t fi_feature;
const char *fi_uname; /* User-facing feature name */
const char *fi_guid; /* On-disk feature identifier */
const char *fi_desc; /* Feature description */
- boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */
- boolean_t fi_mos; /* Is the feature necessary to read the MOS? */
- /* Activate this feature at the same time it is enabled */
- boolean_t fi_activate_on_enable;
+ zfeature_flags_t fi_flags;
/* array of dependencies, terminated by SPA_FEATURE_NONE */
const spa_feature_t *fi_depends;
} zfeature_info_t;
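With the three booleans collapsed into fi_flags, feature properties now
combine and test as bits; hole_birth above, for example, registers with
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE.  A sketch of the
consumer side (the helper name is hypothetical):

	/* Formerly fi->fi_can_readonly; now a single bit test. */
	static boolean_t
	feature_readonly_compatible(const zfeature_info_t *fi)
	{
		return ((fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) != 0);
	}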
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 4c7e225..77c7b1d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -22,7 +22,9 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
#
#
@@ -36,6 +38,7 @@ ZFS_COMMON_OBJS += \
blkptr.o \
bpobj.o \
bptree.o \
+ bqueue.o \
dbuf.o \
ddt.o \
ddt_zap.o \
@@ -65,6 +68,7 @@ ZFS_COMMON_OBJS += \
lz4.o \
lzjb.o \
metaslab.o \
+ multilist.o \
range_tree.o \
refcount.o \
rrwlock.o \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 6de36f2..07fcb51 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -21,9 +21,9 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -82,9 +82,9 @@
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
@@ -129,6 +129,7 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
+#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -149,21 +150,39 @@ int arc_procfd;
#endif
#endif /* illumos */
-static kmutex_t arc_reclaim_thr_lock;
-static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
-static uint8_t arc_thread_exit;
+static kmutex_t arc_reclaim_lock;
+static kcondvar_t arc_reclaim_thread_cv;
+static boolean_t arc_reclaim_thread_exit;
+static kcondvar_t arc_reclaim_waiters_cv;
+
+static kmutex_t arc_user_evicts_lock;
+static kcondvar_t arc_user_evicts_cv;
+static boolean_t arc_user_evicts_thread_exit;
uint_t arc_reduce_dnlc_percent = 3;
/*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
*/
-int arc_evict_iterations = 100;
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int zfs_arc_overflow_shift = 8;
+
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
@@ -201,6 +220,9 @@ extern int zfs_prefetch_disable;
*/
static boolean_t arc_warm;
+/*
+ * These tunables are for performance analysis.
+ */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
@@ -312,31 +334,22 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
* second level ARC benefit from these fast lookups.
*/
-#define ARCS_LOCK_PAD CACHE_LINE_SIZE
-struct arcs_lock {
- kmutex_t arcs_lock;
-#ifdef _KERNEL
- unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-/*
- * must be power of two for mask use to work
- *
- */
-#define ARC_BUFC_NUMDATALISTS 16
-#define ARC_BUFC_NUMMETADATALISTS 16
-#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
-
typedef struct arc_state {
- uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
- uint64_t arcs_size; /* total amount of data in this state */
- list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
- struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
+ /*
+ * list of evictable buffers
+ */
+ multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ refcount_t arcs_size;
} arc_state_t;
-#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))
-
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
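arcs_list is now a multilist_t (the new multilist.o added to Makefile.files
above, included via sys/multilist.h): a list split into independently locked
sublists, with elements hashed onto a sublist so inserts and removals from
different CPUs rarely contend.  It replaces the old hand-rolled
arcs_lists/arcs_locks arrays and the get_buf_info() hashing removed further
down.  A toy sketch of the idea, not the real multilist API:

	#include <pthread.h>
	#include <stdint.h>

	#define	NUM_SUBLISTS	16	/* power of two, so a mask picks a sublist */

	struct node {
		struct node	*next;
		uint64_t	key;
	};

	struct sublist {
		pthread_mutex_t	lock;	/* one lock per sublist, not per list */
		struct node	*head;
	};

	struct toy_multilist {
		struct sublist	sub[NUM_SUBLISTS];
	};

	/* Hash the element to a sublist; unrelated inserts take different locks. */
	static struct sublist *
	ml_sublist(struct toy_multilist *ml, uint64_t key)
	{
		return (&ml->sub[key & (NUM_SUBLISTS - 1)]);
	}

	static void
	ml_insert(struct toy_multilist *ml, struct node *n)
	{
		struct sublist *s = ml_sublist(ml, n->key);

		pthread_mutex_lock(&s->lock);
		n->next = s->head;
		s->head = n;
		pthread_mutex_unlock(&s->lock);
	}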
@@ -362,8 +375,6 @@ typedef struct arc_stats {
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
- kstat_named_t arcstat_stolen;
- kstat_named_t arcstat_recycle_miss;
/*
* Number of buffers that could not be evicted because the hash lock
* was held by another thread. The lock may not necessarily be held
@@ -377,9 +388,15 @@ typedef struct arc_stats {
* not from the spa we're trying to evict from.
*/
kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach it's target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
@@ -530,7 +547,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_writes_lock_retry;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
kstat_named_t arcstat_l2_evict_l1cached;
@@ -584,13 +601,13 @@ static arc_stats_t arc_stats = {
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
{ "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
- { "stolen", KSTAT_DATA_UINT64 },
- { "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
@@ -629,7 +646,7 @@ static arc_stats_t arc_stats = {
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
- { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
@@ -806,7 +823,7 @@ typedef struct l1arc_buf_hdr {
/* protected by arc state mutex */
arc_state_t *b_state;
- list_node_t b_arc_node;
+ multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
@@ -877,7 +894,6 @@ sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
#endif
static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
#define GHOST_STATE(state) \
@@ -1011,21 +1027,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
&l2arc_norw, 0, "no reads during writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
- &ARC_anon.arcs_size, 0, "size of anonymous state");
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
&ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
&ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
- &ARC_mru.arcs_size, 0, "size of mru state");
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
&ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
&ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
&ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in mru ghost state");
@@ -1034,14 +1050,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
"size of data in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
- &ARC_mfu.arcs_size, 0, "size of mfu state");
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
&ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
&ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in mfu ghost state");
@@ -1050,7 +1066,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
"size of data in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
- &ARC_l2c_only.arcs_size, 0, "size of mru state");
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
/*
* L2ARC Internals
@@ -1106,8 +1122,7 @@ static uint8_t l2arc_thread_exit;
static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static int arc_evict_needed(arc_buf_contents_t);
-static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
+static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
@@ -1288,6 +1303,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
refcount_create(&hdr->b_l1hdr.b_refcnt);
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
return (0);
@@ -1332,6 +1348,7 @@ hdr_full_dest(void *vbuf, void *unused)
cv_destroy(&hdr->b_l1hdr.b_cv);
refcount_destroy(&hdr->b_l1hdr.b_refcnt);
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
@@ -1368,7 +1385,7 @@ hdr_recl(void *unused)
* which is after we do arc_fini().
*/
if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
}
static void
@@ -1447,18 +1464,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
* l2c_only even though it's about to change.
*/
nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+ /* Verify previous threads set to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
} else {
ASSERT(hdr->b_l1hdr.b_buf == NULL);
ASSERT0(hdr->b_l1hdr.b_datacnt);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
/*
- * We might be removing the L1hdr of a buffer which was just
- * written out to L2ARC. If such a buffer is compressed then we
- * need to free its b_tmp_cdata before destroying the header.
+	 * If we've reached here, we must have been called from
+ * arc_evict_hdr(), as such we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
*/
- if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+ * state if it's not finished being written out to the
+ * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+	 * might be accessed even though it has been removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
}
/*
@@ -1681,23 +1711,6 @@ arc_buf_freeze(arc_buf_t *buf)
}
static void
-get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
-{
- uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
-
- if (arc_buf_type(hdr) == ARC_BUFC_METADATA)
- buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
- else {
- buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
- buf_hashid += ARC_BUFC_NUMMETADATALISTS;
- }
-
- *list = &state->arcs_lists[buf_hashid];
- *lock = ARCS_LOCK(state, buf_hashid);
-}
-
-
-static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1708,16 +1721,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
(state != arc_anon)) {
/* We don't use the L2-only state list. */
if (state != arc_l2c_only) {
+ arc_buf_contents_t type = arc_buf_type(hdr);
uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
- uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
- list_t *list;
- kmutex_t *lock;
-
- get_buf_info(hdr, state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_remove(list, hdr);
+ multilist_t *list = &state->arcs_list[type];
+ uint64_t *size = &state->arcs_lsize[type];
+
+ multilist_remove(list, hdr);
+
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_datacnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -1726,7 +1736,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
ASSERT(delta > 0);
ASSERT3U(*size, >=, delta);
atomic_add_64(size, -delta);
- mutex_exit(lock);
}
/* remove the prefetch flag if we get a reference */
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -1749,25 +1758,21 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
*/
if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
- list_t *list;
- kmutex_t *lock;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ multilist_t *list = &state->arcs_list[type];
+ uint64_t *size = &state->arcs_lsize[type];
+
+ multilist_insert(list, hdr);
- get_buf_info(hdr, state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_insert_head(list, hdr);
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
atomic_add_64(size, hdr->b_size *
hdr->b_l1hdr.b_datacnt);
- mutex_exit(lock);
}
return (cnt);
}
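The two hunks above are the heart of the locking change: the old per-state array of lists, each guarded through get_buf_info()/ARCS_LOCK(), becomes one multilist_t per state and buffer type, and multilist_insert()/multilist_remove() pick and lock the right sublist internally. A minimal sketch of that idea, using hypothetical mlist_* names rather than the real multilist API:

	typedef struct mlist_sublist {
		kmutex_t	mls_lock;	/* protects only this sublist */
		list_t		mls_list;
	} mlist_sublist_t;

	typedef struct mlist {
		unsigned	ml_num;		/* number of sublists */
		mlist_sublist_t	*ml_sublists;
		/* maps an object to a sublist, e.g. by hashing its identity */
		unsigned	(*ml_index)(struct mlist *, void *);
	} mlist_t;

	static void
	mlist_insert(mlist_t *ml, void *obj)
	{
		unsigned idx = ml->ml_index(ml, obj) % ml->ml_num;
		mlist_sublist_t *mls = &ml->ml_sublists[idx];

		/* Contention is confined to one sublist, not the whole state. */
		mutex_enter(&mls->mls_lock);
		list_insert_head(&mls->mls_list, obj);
		mutex_exit(&mls->mls_lock);
	}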
/*
- * Move the supplied buffer to the indicated state. The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
* for the buffer must be held by the caller.
*/
static void
@@ -1779,8 +1784,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
uint32_t datacnt;
uint64_t from_delta, to_delta;
arc_buf_contents_t buftype = arc_buf_type(hdr);
- list_t *list;
- kmutex_t *lock;
/*
* We almost always have an L1 hdr here, since we call arc_hdr_realloc()
@@ -1813,17 +1816,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
if (refcnt == 0) {
if (old_state != arc_anon && old_state != arc_l2c_only) {
- int use_mutex;
uint64_t *size = &old_state->arcs_lsize[buftype];
- get_buf_info(hdr, old_state, &list, &lock);
- use_mutex = !MUTEX_HELD(lock);
- if (use_mutex)
- mutex_enter(lock);
-
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_remove(list, hdr);
+ multilist_remove(&old_state->arcs_list[buftype], hdr);
/*
* If prefetching out of the ghost cache,
@@ -1836,12 +1832,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
ASSERT3U(*size, >=, from_delta);
atomic_add_64(size, -from_delta);
-
- if (use_mutex)
- mutex_exit(lock);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
- int use_mutex;
uint64_t *size = &new_state->arcs_lsize[buftype];
/*
@@ -1851,23 +1843,15 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* beforehand.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
- get_buf_info(hdr, new_state, &list, &lock);
- use_mutex = !MUTEX_HELD(lock);
- if (use_mutex)
- mutex_enter(lock);
-
- list_insert_head(list, hdr);
+ multilist_insert(&new_state->arcs_list[buftype], hdr);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
- ASSERT(datacnt == 0);
+ ASSERT0(datacnt);
ASSERT(hdr->b_l1hdr.b_buf == NULL);
to_delta = hdr->b_size;
}
atomic_add_64(size, to_delta);
-
- if (use_mutex)
- mutex_exit(lock);
}
}
@@ -1876,12 +1860,73 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf_hash_remove(hdr);
/* adjust state sizes (ignore arc_l2c_only) */
- if (to_delta && new_state != arc_l2c_only)
- atomic_add_64(&new_state->arcs_size, to_delta);
+
+ if (to_delta && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(datacnt);
+
+ /*
+			 * When moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * datacnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) refcount_add_many(&new_state->arcs_size,
+ hdr->b_size, hdr);
+ } else {
+ ASSERT3U(datacnt, !=, 0);
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ (void) refcount_add_many(&new_state->arcs_size,
+ hdr->b_size, buf);
+ }
+ }
+ }
+
if (from_delta && old_state != arc_l2c_only) {
- ASSERT3U(old_state->arcs_size, >=, from_delta);
- atomic_add_64(&old_state->arcs_size, -from_delta);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ /*
+ * When moving a header off of a ghost state,
+ * there's the possibility for datacnt to be
+ * non-zero. This is because we first add the
+ * arc buffer to the header prior to changing
+ * the header's state. Since we used the header
+ * for the reference when putting the header on
+ * the ghost state, we must balance that and use
+ * the header when removing off the ghost state
+ * (even though datacnt is non zero).
+ */
+
+ IMPLY(datacnt == 0, new_state == arc_anon ||
+ new_state == arc_l2c_only);
+
+ (void) refcount_remove_many(&old_state->arcs_size,
+ hdr->b_size, hdr);
+ } else {
+ ASSERT3P(datacnt, !=, 0);
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ (void) refcount_remove_many(
+ &old_state->arcs_size, hdr->b_size, buf);
+ }
+ }
}
+
if (HDR_HAS_L1HDR(hdr))
hdr->b_l1hdr.b_state = new_state;
@@ -1889,10 +1934,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
*/
-#ifdef illumos
- ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
- list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
-#endif
+ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
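To make the reference accounting above concrete (hypothetical sizes): a 128K header holding two arc_buf_t's that moves into arc_mru adds two 128K references to arc_mru->arcs_size, one tagged by each buffer. When the header is later evicted to arc_mru_ghost, the buffers are destroyed first, each dropping its own buffer-tagged reference, so the ghost branch adds a single 128K reference tagged by the header itself. Moving off the ghost state removes that header-tagged reference even if arc_read() has already attached a fresh buffer by then, which is exactly the imbalance the IMPLY() above is guarding.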
void
@@ -1985,6 +2028,7 @@ arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_datacnt = 1;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
arc_get_data_buf(buf);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -2120,7 +2164,7 @@ arc_buf_free_on_write(void *data, size_t size,
{
l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df = kmem_alloc(sizeof (*df), KM_SLEEP);
df->l2df_data = data;
df->l2df_size = size;
df->l2df_func = free_func;
@@ -2146,10 +2190,6 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
}
}
-/*
- * Free up buf->b_data and if 'remove' is set, then pull the
- * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
- */
static void
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
{
@@ -2164,19 +2204,53 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
if (!HDR_HAS_L1HDR(hdr))
return;
- if (hdr->b_l1hdr.b_tmp_cdata == NULL)
+ /*
+	 * If the header isn't being written to the l2arc device, then
+	 * it shouldn't have a b_tmp_cdata to free.
+ */
+ if (!HDR_L2_WRITING(hdr)) {
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+ return;
+ }
+
+ /*
+ * The header does not have compression enabled. This can be due
+ * to the buffer not being compressible, or because we're
+ * freeing the buffer before the second phase of
+	 * l2arc_write_buffers() has started (which does the compression
+ * step). In either case, b_tmp_cdata does not point to a
+ * separately compressed buffer, so there's nothing to free (it
+ * points to the same buffer as the arc_buf_t's b_data field).
+ */
+ if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
+ return;
+ }
+
+ /*
+	 * There's nothing to free since the buffer was all zeros and
+	 * compressed to a zero-length buffer.
+ */
+ if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
return;
+ }
- ASSERT(HDR_L2_WRITING(hdr));
- arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
- zio_data_buf_free);
+ ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
+
+ arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+ hdr->b_size, zio_data_buf_free);
ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
hdr->b_l1hdr.b_tmp_cdata = NULL;
}
+/*
+ * Free up buf->b_data and, if 'remove' is set, pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */
static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
{
arc_buf_t **bufp;
@@ -2191,17 +2265,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
arc_buf_unwatch(buf);
#endif
- if (!recycle) {
- if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf, zio_buf_free);
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf, zio_data_buf_free);
- arc_space_return(size, ARC_SPACE_DATA);
- }
+ if (type == ARC_BUFC_METADATA) {
+ arc_buf_data_free(buf, zio_buf_free);
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_buf_data_free(buf, zio_data_buf_free);
+ arc_space_return(size, ARC_SPACE_DATA);
}
- if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
uint64_t *cnt = &state->arcs_lsize[type];
ASSERT(refcount_is_zero(
@@ -2211,8 +2285,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
ASSERT3U(*cnt, >=, size);
atomic_add_64(cnt, -size);
}
- ASSERT3U(state->arcs_size, >=, size);
- atomic_add_64(&state->arcs_size, -size);
+
+ (void) refcount_remove_many(&state->arcs_size, size, buf);
buf->b_data = NULL;
/*
@@ -2339,6 +2413,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (!BUF_EMPTY(hdr))
buf_discard_identity(hdr);
+
if (hdr->b_freeze_cksum != NULL) {
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
@@ -2349,20 +2424,19 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
if (buf->b_efunc != NULL) {
- mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&arc_user_evicts_lock);
mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
- FALSE);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
hdr->b_l1hdr.b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&buf->b_evict_lock);
- mutex_exit(&arc_eviction_mtx);
+ cv_signal(&arc_user_evicts_cv);
+ mutex_exit(&arc_user_evicts_lock);
} else {
- arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
- TRUE);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
}
}
#ifdef ZFS_DEBUG
@@ -2375,7 +2449,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
kmem_cache_free(hdr_full_cache, hdr);
} else {
@@ -2401,7 +2475,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_l1hdr.b_datacnt > 1) {
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else {
ASSERT(buf == hdr->b_l1hdr.b_buf);
ASSERT(buf->b_efunc == NULL);
@@ -2415,16 +2489,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
* this buffer unless the write completes before we finish
* decrementing the reference count.
*/
- mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&arc_user_evicts_lock);
(void) remove_reference(hdr, NULL, tag);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
- mutex_exit(&arc_eviction_mtx);
+ mutex_exit(&arc_user_evicts_lock);
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
if (remove_reference(hdr, NULL, tag) > 0)
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
else
arc_hdr_destroy(hdr);
}
@@ -2453,7 +2527,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_l1hdr.b_datacnt > 1) {
if (no_callback)
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
ASSERT(buf->b_efunc == NULL);
@@ -2514,499 +2588,678 @@ arc_buf_eviction_needed(arc_buf_t *buf)
}
/*
- * Evict buffers from list until we've removed the specified number of
- * bytes. Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
*
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
*/
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
- arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
- arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
- int64_t bytes_remaining;
- arc_buf_hdr_t *hdr, *hdr_prev = NULL;
- list_t *evicted_list, *list, *evicted_list_start, *list_start;
- kmutex_t *lock, *evicted_lock;
- kmutex_t *hash_lock;
- boolean_t have_lock;
- void *stolen = NULL;
- arc_buf_hdr_t marker = { 0 };
- int count = 0;
- static int evict_metadata_offset, evict_data_offset;
- int i, idx, offset, list_count, lists;
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
- ASSERT(state == arc_mru || state == arc_mfu);
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
- /*
- * Decide which "type" (data vs metadata) to recycle from.
- *
- * If we are over the metadata limit, recycle from metadata.
- * If we are under the metadata minimum, recycle from data.
- * Otherwise, recycle from whichever type has the oldest (least
- * recently accessed) header. This is not yet implemented.
- */
- if (recycle) {
- arc_buf_contents_t realtype;
- if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
- realtype = ARC_BUFC_METADATA;
- } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
- realtype = ARC_BUFC_DATA;
- } else if (arc_meta_used >= arc_meta_limit) {
- realtype = ARC_BUFC_METADATA;
- } else if (arc_meta_used <= arc_meta_min) {
- realtype = ARC_BUFC_DATA;
-#ifdef illumos
- } else if (HDR_HAS_L1HDR(data_hdr) &&
- HDR_HAS_L1HDR(metadata_hdr) &&
- data_hdr->b_l1hdr.b_arc_access <
- metadata_hdr->b_l1hdr.b_arc_access) {
- realtype = ARC_BUFC_DATA;
- } else {
- realtype = ARC_BUFC_METADATA;
-#else
- } else {
- /* TODO */
- realtype = type;
-#endif
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_tmp_cdata field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
}
- if (realtype != type) {
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += hdr->b_size;
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
/*
- * If we want to evict from a different list,
- * we can not recycle, because DATA vs METADATA
- * buffers are segregated into different kmem
- * caches (and vmem arenas).
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
*/
- type = realtype;
- recycle = B_FALSE;
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
}
+ return (bytes_evicted);
}
- if (type == ARC_BUFC_METADATA) {
- offset = 0;
- list_count = ARC_BUFC_NUMMETADATALISTS;
- list_start = &state->arcs_lists[0];
- evicted_list_start = &evicted_state->arcs_lists[0];
- idx = evict_metadata_offset;
- } else {
- offset = ARC_BUFC_NUMMETADATALISTS;
- list_start = &state->arcs_lists[offset];
- evicted_list_start = &evicted_state->arcs_lists[offset];
- list_count = ARC_BUFC_NUMDATALISTS;
- idx = evict_data_offset;
- }
- bytes_remaining = evicted_state->arcs_lsize[type];
- lists = 0;
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-evict_start:
- list = &list_start[idx];
- evicted_list = &evicted_list_start[idx];
- lock = ARCS_LOCK(state, (offset + idx));
- evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ arc_min_prefetch_lifespan)) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
- /*
- * The ghost list lock must be acquired first in order to prevent
- * a 3 party deadlock:
- *
- * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
- * l2ad_mtx in arc_hdr_realloc
- * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
- * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
- * arc_*_ghost->arcs_mtx and forms a deadlock cycle.
- *
- * This situation is avoided by acquiring the ghost list lock first.
- */
- mutex_enter(evicted_lock);
- mutex_enter(lock);
-
- for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(list, hdr);
- if (HDR_HAS_L1HDR(hdr)) {
- bytes_remaining -=
- (hdr->b_size * hdr->b_l1hdr.b_datacnt);
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
}
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(hdr) ||
- (spa && hdr->b_spa != spa) ||
- ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- arc_min_prefetch_lifespan)) {
- skipped++;
- continue;
+ if (buf->b_data != NULL)
+ bytes_evicted += hdr->b_size;
+ if (buf->b_efunc != NULL) {
+ mutex_enter(&arc_user_evicts_lock);
+ arc_buf_destroy(buf, FALSE);
+ hdr->b_l1hdr.b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ cv_signal(&arc_user_evicts_cv);
+ mutex_exit(&arc_user_evicts_lock);
+ mutex_exit(&buf->b_evict_lock);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy(buf, TRUE);
}
- /* "lookahead" for better eviction candidate */
- if (recycle && hdr->b_size != bytes &&
- hdr_prev && hdr_prev->b_size == bytes)
- continue;
+ }
- /* ignore markers */
- if (hdr->b_spa == 0)
- continue;
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr))
+ ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+ else
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+ }
+
+ if (hdr->b_l1hdr.b_datacnt == 0) {
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ }
+
+ return (bytes_evicted);
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
/*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
- *
- * If we are looking for a buffer to recycle, we are in
- * the hot code path, so don't sleep.
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
*/
- if (!recycle && count++ > arc_evict_iterations) {
- list_insert_after(list, hdr, &marker);
- mutex_exit(lock);
- mutex_exit(evicted_lock);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(evicted_lock);
- mutex_enter(lock);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
+ */
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
continue;
}
hash_lock = HDR_LOCK(hdr);
- have_lock = MUTEX_HELD(hash_lock);
- if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
- ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
- while (hdr->b_l1hdr.b_buf) {
- arc_buf_t *buf = hdr->b_l1hdr.b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- missed += 1;
- break;
- }
- if (buf->b_data != NULL) {
- bytes_evicted += hdr->b_size;
- if (recycle &&
- arc_buf_type(hdr) == type &&
- hdr->b_size == bytes &&
- !HDR_L2_WRITING(hdr)) {
- stolen = buf->b_data;
- recycle = FALSE;
- }
- }
- if (buf->b_efunc != NULL) {
- mutex_enter(&arc_eviction_mtx);
- arc_buf_destroy(buf,
- buf->b_data == stolen, FALSE);
- hdr->b_l1hdr.b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&arc_eviction_mtx);
- mutex_exit(&buf->b_evict_lock);
- } else {
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy(buf,
- buf->b_data == stolen, TRUE);
- }
- }
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_cached,
- hdr->b_size);
- } else {
- if (l2arc_write_eligible(hdr->b_spa, hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_eligible,
- hdr->b_size);
- } else {
- ARCSTAT_INCR(
- arcstat_evict_l2_ineligible,
- hdr->b_size);
- }
- }
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
- if (hdr->b_l1hdr.b_datacnt == 0) {
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
- hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
- }
- if (!have_lock)
- mutex_exit(hash_lock);
- if (bytes >= 0 && bytes_evicted >= bytes)
- break;
- if (bytes_remaining > 0) {
- mutex_exit(evicted_lock);
- mutex_exit(lock);
- idx = ((idx + 1) & (list_count - 1));
- lists++;
- goto evict_start;
- }
- } else {
- missed += 1;
- }
- }
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
- mutex_exit(lock);
- mutex_exit(evicted_lock);
+ bytes_evicted += evicted;
- idx = ((idx + 1) & (list_count - 1));
- lists++;
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
- if (bytes_evicted < bytes) {
- if (lists < list_count)
- goto evict_start;
- else
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
+ /*
+ * If arc_size isn't overflowing, signal any
+ * threads that might happen to be waiting.
+ *
+ * For each header evicted, we wake up a single
+ * thread. If we used cv_broadcast, we could
+			 * wake up "too many" threads, causing arc_size
+			 * to significantly overflow arc_c, since
+			 * arc_get_data_buf() doesn't check for overflow
+			 * when it's woken up (deliberately so: it's
+			 * possible for the ARC to be overflowing while
+			 * full of un-evictable buffers, in which case
+			 * the function should proceed anyway).
+ *
+ * If threads are left sleeping, due to not
+ * using cv_broadcast, they will be woken up
+ * just before arc_reclaim_thread() sleeps.
+ */
+ mutex_enter(&arc_reclaim_lock);
+ if (!arc_is_overflowing())
+ cv_signal(&arc_reclaim_waiters_cv);
+ mutex_exit(&arc_reclaim_lock);
+ } else {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ }
}
- if (type == ARC_BUFC_METADATA)
- evict_metadata_offset = idx;
- else
- evict_data_offset = idx;
- if (skipped)
- ARCSTAT_INCR(arcstat_evict_skip, skipped);
-
- if (missed)
- ARCSTAT_INCR(arcstat_mutex_miss, missed);
-
- /*
- * Note: we have just evicted some data into the ghost state,
- * potentially putting the ghost size over the desired size. Rather
- * that evicting from the ghost list in this hot code path, leave
- * this chore to the arc_reclaim_thread().
- */
+ multilist_sublist_unlock(mls);
- if (stolen)
- ARCSTAT_BUMP(arcstat_stolen);
- return (stolen);
+ return (bytes_evicted);
}
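The marker handling above is a general pattern for walking a list without holding each element's lock: only the marker is ever unlinked and re-linked, so concurrent readers never observe a real header leaving its sublist. A condensed sketch, with hypothetical node_t/process() names standing in for arc_buf_hdr_t and arc_evict_hdr():

	static void
	marker_walk(list_t *list, node_t *marker)
	{
		node_t *node;

		/* The marker starts at the tail; walk toward the head. */
		while ((node = list_prev(list, marker)) != NULL) {
			/*
			 * Step the marker past 'node' before touching it,
			 * so our position survives even if processing
			 * drops locks or frees 'node' entirely.
			 */
			list_remove(list, marker);
			list_insert_before(list, node, marker);

			process(node);
		}
	}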
/*
- * Remove buffers from list until we've removed the specified number of
- * bytes. Destroy the buffers that are removed.
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state. This is used by arc_flush().
*/
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
{
- arc_buf_hdr_t *hdr, *hdr_prev;
- arc_buf_hdr_t marker = { 0 };
- list_t *list, *list_start;
- kmutex_t *hash_lock, *lock;
- uint64_t bytes_deleted = 0;
- uint64_t bufs_skipped = 0;
- int count = 0;
- static int evict_offset;
- int list_count, idx = evict_offset;
- int offset, lists = 0;
-
- ASSERT(GHOST_STATE(state));
+ uint64_t total_evicted = 0;
+ multilist_t *ml = &state->arcs_list[type];
+ int num_sublists;
+ arc_buf_hdr_t **markers;
+
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ num_sublists = multilist_get_num_sublists(ml);
/*
- * data lists come after metadata lists
+ * If we've tried to evict from each sublist, made some
+ * progress, but still have not hit the target number of bytes
+ * to evict, we want to keep trying. The markers allow us to
+ * pick up where we left off for each individual sublist, rather
+ * than starting from the tail each time.
*/
- list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
- list_count = ARC_BUFC_NUMDATALISTS;
- offset = ARC_BUFC_NUMMETADATALISTS;
-
-evict_start:
- list = &list_start[idx];
- lock = ARCS_LOCK(state, idx + offset);
-
- mutex_enter(lock);
- for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(list, hdr);
- if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
- panic("invalid hdr=%p", (void *)hdr);
- if (spa && hdr->b_spa != spa)
- continue;
+ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ for (int i = 0; i < num_sublists; i++) {
+ markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
- /* ignore markers */
- if (hdr->b_spa == 0)
- continue;
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_adjust_type() and
+ * arc_evict_state_impl().
+ */
+ markers[i]->b_spa = 0;
- hash_lock = HDR_LOCK(hdr);
- /* caller may be trying to modify this buffer, skip it */
- if (MUTEX_HELD(hash_lock))
- continue;
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_insert_tail(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+ }
+ /*
+ * While we haven't hit our target number of bytes to evict, or
+ * we're evicting all available buffers.
+ */
+ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
/*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
+		 * Start eviction using a randomly selected sublist;
+ * this is to try and evenly balance eviction across all
+ * sublists. Always starting at the same sublist
+ * (e.g. index 0) would cause evictions to favor certain
+ * sublists over others.
*/
- if (count++ > arc_evict_iterations) {
- list_insert_after(list, hdr, &marker);
- mutex_exit(lock);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(lock);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
- continue;
- }
- if (mutex_tryenter(hash_lock)) {
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!HDR_HAS_L1HDR(hdr) ||
- hdr->b_l1hdr.b_buf == NULL);
- ARCSTAT_BUMP(arcstat_deleted);
- bytes_deleted += hdr->b_size;
+ int sublist_idx = multilist_get_random_index(ml);
+ uint64_t scan_evicted = 0;
- if (HDR_HAS_L2HDR(hdr)) {
- /*
- * This buffer is cached on the 2nd Level ARC;
- * don't destroy the header.
- */
- arc_change_state(arc_l2c_only, hdr, hash_lock);
- /*
- * dropping from L1+L2 cached to L2-only,
- * realloc to remove the L1 header.
- */
- hdr = arc_hdr_realloc(hdr, hdr_full_cache,
- hdr_l2only_cache);
- mutex_exit(hash_lock);
- } else {
- arc_change_state(arc_anon, hdr, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(hdr);
- }
+ for (int i = 0; i < num_sublists; i++) {
+ uint64_t bytes_remaining;
+ uint64_t bytes_evicted;
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
- if (bytes >= 0 && bytes_deleted >= bytes)
+ if (bytes == ARC_EVICT_ALL)
+ bytes_remaining = ARC_EVICT_ALL;
+ else if (total_evicted < bytes)
+ bytes_remaining = bytes - total_evicted;
+ else
break;
- } else if (bytes < 0) {
+
+ bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+ markers[sublist_idx], spa, bytes_remaining);
+
+ scan_evicted += bytes_evicted;
+ total_evicted += bytes_evicted;
+
+ /* we've reached the end, wrap to the beginning */
+ if (++sublist_idx >= num_sublists)
+ sublist_idx = 0;
+ }
+
+ /*
+ * If we didn't evict anything during this scan, we have
+ * no reason to believe we'll evict more during another
+ * scan, so break the loop.
+ */
+ if (scan_evicted == 0) {
+ /* This isn't possible, let's make that obvious */
+ ASSERT3S(bytes, !=, 0);
+
/*
- * Insert a list marker and then wait for the
- * hash lock to become available. Once its
- * available, restart from where we left off.
+ * When bytes is ARC_EVICT_ALL, the only way to
+ * break the loop is when scan_evicted is zero.
+ * In that case, we actually have evicted enough,
+ * so we don't want to increment the kstat.
*/
- list_insert_after(list, hdr, &marker);
- mutex_exit(lock);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- mutex_enter(lock);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- } else {
- bufs_skipped += 1;
+ if (bytes != ARC_EVICT_ALL) {
+ ASSERT3S(total_evicted, <, bytes);
+ ARCSTAT_BUMP(arcstat_evict_not_enough);
+ }
+
+ break;
}
+ }
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_remove(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+
+ kmem_cache_free(hdr_full_cache, markers[i]);
}
- mutex_exit(lock);
- idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
- lists++;
+ kmem_free(markers, sizeof (*markers) * num_sublists);
- if (lists < list_count)
- goto evict_start;
+ return (total_evicted);
+}
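To illustrate the randomized scan order above (hypothetical count): with eight sublists and a starting index of 5, a single pass of the inner loop visits sublists 5, 6, 7, 0, 1, 2, 3, 4. Because each pass begins at a fresh random index, no sublist's tail is systematically evicted from more often than any other's.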
- evict_offset = idx;
- if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
- (bytes < 0 || bytes_deleted < bytes)) {
- list_start = &state->arcs_lists[0];
- list_count = ARC_BUFC_NUMMETADATALISTS;
- offset = lists = 0;
- goto evict_start;
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+ boolean_t retry)
+{
+ uint64_t evicted = 0;
+
+ while (state->arcs_lsize[type] != 0) {
+ evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+ if (!retry)
+ break;
}
- if (bufs_skipped) {
- ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
- ASSERT(bytes >= 0);
+ return (evicted);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ int64_t delta;
+
+ if (bytes > 0 && state->arcs_lsize[type] > 0) {
+ delta = MIN(state->arcs_lsize[type], bytes);
+ return (arc_evict_state(state, spa, delta, type));
}
- if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
- (longlong_t)bytes_deleted, state);
+ return (0);
}
-static void
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta(void)
+{
+ uint64_t total_evicted = 0;
+ int64_t target;
+
+ /*
+ * If we're over the meta limit, we want to evict enough
+ * metadata to get back under the meta limit. We don't want to
+ * evict so much that we drop the MRU below arc_p, though. If
+ * we're over the meta limit more than we're over arc_p, we
+ * evict some from the MRU here, and some from the MFU below.
+ */
+ target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) - arc_p));
+
+ total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * Similar to the above, we want to evict enough bytes to get us
+ * below the meta limit, but not so much as to drop us below the
+	 * space allotted to the MFU (which is defined as arc_c - arc_p).
+ */
+ target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
+
+ total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
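A worked example of the two MIN() targets above, with hypothetical numbers: if arc_meta_used is 600M against an arc_meta_limit of 500M, we are 100M over the limit, but if anon + mru exceeds arc_p by only 50M, the first target is MIN(100M, 50M) = 50M of MRU metadata. Assuming that much is evicted, arc_meta_used drops to 550M, and the second pass targets the remaining MIN(50M, ...) from the MFU, bounded by how far the MFU exceeds its arc_c - arc_p allotment.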
+
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+ multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
+
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
+
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
+ }
+
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
+ }
+
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
arc_adjust(void)
{
- int64_t adjustment, delta;
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
+
+ /*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_adjust_meta();
/*
* Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
*/
+ target = MIN((int64_t)(arc_size - arc_c),
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
- adjustment = MIN((int64_t)(arc_size - arc_c),
- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
- arc_p));
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+	 * evict from the type which contains older buffers, in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+ arc_meta_used > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE,
- ARC_BUFC_METADATA);
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
*/
+ target = arc_size - arc_c;
- adjustment = arc_size - arc_c;
+ if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+ arc_meta_used > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- int64_t delta = MIN(adjustment,
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE,
- ARC_BUFC_METADATA);
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
*/
+ target = refcount_count(&arc_mru->arcs_size) +
+ refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
- adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+ bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
- delta = MIN(arc_mru_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mru_ghost, 0, delta);
- }
+ target -= bytes;
- adjustment =
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+ total_evicted +=
+ arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
- if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
- delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mfu_ghost, 0, delta);
- }
+ /*
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
+ */
+ target = refcount_count(&arc_mru_ghost->arcs_size) +
+ refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+ bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
}
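The ghost-list targets reduce to straightforward arithmetic (hypothetical numbers): with arc_c = 1000M, an MRU of 400M plus an MRU ghost of 700M yields a target of 400 + 700 - 1000 = 100M to evict from arc_mru_ghost. If that leaves the MRU ghost at 600M alongside a 500M MFU ghost, the final target is 600 + 500 - 1000 = 100M from arc_mfu_ghost, restoring mru ghost + mfu ghost <= arc_c.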
static void
arc_do_user_evicts(void)
{
- static arc_buf_t *tmp_arc_eviction_list;
-
- /*
- * Move list over to avoid LOR
- */
-restart:
- mutex_enter(&arc_eviction_mtx);
- tmp_arc_eviction_list = arc_eviction_list;
- arc_eviction_list = NULL;
- mutex_exit(&arc_eviction_mtx);
-
- while (tmp_arc_eviction_list != NULL) {
- arc_buf_t *buf = tmp_arc_eviction_list;
- tmp_arc_eviction_list = buf->b_next;
+ mutex_enter(&arc_user_evicts_lock);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
mutex_enter(&buf->b_evict_lock);
buf->b_hdr = NULL;
mutex_exit(&buf->b_evict_lock);
+ mutex_exit(&arc_user_evicts_lock);
if (buf->b_efunc != NULL)
VERIFY0(buf->b_efunc(buf->b_private));
@@ -3014,58 +3267,45 @@ restart:
buf->b_efunc = NULL;
buf->b_private = NULL;
kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_user_evicts_lock);
}
-
- if (arc_eviction_list != NULL)
- goto restart;
+ mutex_exit(&arc_user_evicts_lock);
}
-/*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
void
-arc_flush(spa_t *spa)
+arc_flush(spa_t *spa, boolean_t retry)
{
uint64_t guid = 0;
+ /*
+ * If retry is TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
if (spa != NULL)
guid = spa_load_guid(spa);
- while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
- (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa != NULL)
- break;
- }
- while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
- (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa != NULL)
- break;
- }
- while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
- (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa != NULL)
- break;
- }
- while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
- (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa != NULL)
- break;
- }
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
- arc_evict_ghost(arc_mru_ghost, guid, -1);
- arc_evict_ghost(arc_mfu_ghost, guid, -1);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
- mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
- mutex_exit(&arc_reclaim_thr_lock);
ASSERT(spa || arc_eviction_list == NULL);
}
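In practice this means a caller that needs the guarantee of an empty cache (e.g. the shutdown path) invokes arc_flush(NULL, B_TRUE), while a per-pool flush must settle for a single best-effort pass with retry == B_FALSE, which is what the ASSERT above enforces.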
void
arc_shrink(int64_t to_free)
{
-
if (arc_c > arc_c_min) {
DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
arc_c_min, uint64_t, arc_p, uint64_t, to_free);
@@ -3090,7 +3330,7 @@ arc_shrink(int64_t to_free)
if (arc_size > arc_c) {
DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
uint64_t, arc_c);
- arc_adjust();
+ (void) arc_adjust();
}
}
@@ -3329,17 +3569,37 @@ arc_kmem_reap_now(void)
DTRACE_PROBE(arc__kmem_reap_end);
}
+/*
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ * waiting for the reclaim thread to signal it.
+ *
+ * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ * fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
+ */
static void
arc_reclaim_thread(void *dummy __unused)
{
clock_t growtime = 0;
callb_cpr_t cpr;
- CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
- mutex_enter(&arc_reclaim_thr_lock);
- while (arc_thread_exit == 0) {
+ mutex_enter(&arc_reclaim_lock);
+ while (!arc_reclaim_thread_exit) {
int64_t free_memory = arc_available_memory();
+ uint64_t evicted = 0;
+
+ mutex_exit(&arc_reclaim_lock);
+
if (free_memory < 0) {
arc_no_grow = B_TRUE;
@@ -3373,17 +3633,60 @@ arc_reclaim_thread(void *dummy __unused)
arc_no_grow = B_FALSE;
}
- arc_adjust();
+ evicted = arc_adjust();
- if (arc_eviction_list != NULL)
- arc_do_user_evicts();
+ mutex_enter(&arc_reclaim_lock);
+ /*
+ * If evicted is zero, we couldn't evict anything via
+ * arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop.
+ */
+ if (arc_size <= arc_c || evicted == 0) {
#ifdef _KERNEL
- if (needfree) {
needfree = 0;
- wakeup(&needfree);
- }
#endif
+ /*
+ * We're either no longer overflowing, or we
+			 * can't evict anything more, so we should wake
+			 * up any waiting threads before we go to sleep.
+ */
+ cv_broadcast(&arc_reclaim_waiters_cv);
+
+ /*
+ * Block until signaled, or after one second (we
+ * might need to perform arc_kmem_reap_now()
+			 * even if we aren't being signaled).
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thread_cv,
+ &arc_reclaim_lock, hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+ }
+ }
+
+ arc_reclaim_thread_exit = FALSE;
+ cv_broadcast(&arc_reclaim_thread_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
+ thread_exit();
+}
+
+static void
+arc_user_evicts_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_user_evicts_lock);
+ while (!arc_user_evicts_thread_exit) {
+ mutex_exit(&arc_user_evicts_lock);
+
+ arc_do_user_evicts();
/*
* This is necessary in order for the mdb ::arc dcmd to
@@ -3399,16 +3702,21 @@ arc_reclaim_thread(void *dummy __unused)
if (arc_ksp != NULL)
arc_ksp->ks_update(arc_ksp, KSTAT_READ);
- /* block until needed, or one second, whichever is shorter */
+ mutex_enter(&arc_user_evicts_lock);
+
+ /*
+ * Block until signaled, or after one second (we need to
+ * call the arc's kstat update function regularly).
+ */
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
- &arc_reclaim_thr_lock, hz);
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ (void) cv_timedwait(&arc_user_evicts_cv,
+ &arc_user_evicts_lock, hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
}
- arc_thread_exit = 0;
- cv_broadcast(&arc_reclaim_thr_cv);
- CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ arc_user_evicts_thread_exit = FALSE;
+ cv_broadcast(&arc_user_evicts_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */
thread_exit();
}
@@ -3422,6 +3730,8 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
if (state == arc_l2c_only)
return;
@@ -3436,16 +3746,14 @@ arc_adapt(int bytes, arc_state_t *state)
* target size of the MRU list.
*/
if (state == arc_mru_ghost) {
- mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
- 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
uint64_t delta;
- mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
- 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
@@ -3454,7 +3762,7 @@ arc_adapt(int bytes, arc_state_t *state)
ASSERT((int64_t)arc_p >= 0);
if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
return;
}
@@ -3482,43 +3790,25 @@ arc_adapt(int bytes, arc_state_t *state)
}
/*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
*/
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
{
- if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
- return (1);
+ /* Always allow at least one block of overflow */
+ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
- if (arc_reclaim_needed())
- return (1);
-
- return (arc_size > arc_c);
+ return (arc_size >= arc_c + overflow);
}
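A quick sanity check of this threshold, assuming the default zfs_arc_overflow_shift of 8: with arc_c at 4G, arc_c >> 8 is 16M, which is at least the SPA_MAXBLOCKSIZE floor, so arc_is_overflowing() returns B_TRUE once arc_size reaches 4G + 16M. For a very small arc_c, the MAX() instead clamps the slack to a single maximum-sized block.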
/*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead. Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU. In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted. In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+ * The buffer, supplied as the first argument, needs a data block. If we
+ * are hitting the hard limit for the cache size, we must sleep, waiting
+ * for the eviction thread to catch up. If we're past the target size
+ * but below the hard limit, we'll only signal the reclaim thread and
+ * continue on.
*/
static void
arc_get_data_buf(arc_buf_t *buf)
@@ -3530,62 +3820,70 @@ arc_get_data_buf(arc_buf_t *buf)
arc_adapt(size, state);
/*
- * We have not yet reached cache maximum size,
- * just allocate a new buffer.
+ * If arc_size is currently overflowing, and has grown past our
+ * upper limit, we must be adding data faster than the evict
+ * thread can evict. Thus, to ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+	 * further past its target size, we halt and wait for the
+ * eviction thread to catch up.
+ *
+ * It's also possible that the reclaim thread is unable to evict
+ * enough buffers to get arc_size below the overflow limit (e.g.
+ * due to buffers being un-evictable, or hash lock collisions).
+	 * In this case, we want to proceed regardless of whether we're
+	 * overflowing; thus we don't use a while loop here.
*/
- if (!arc_evict_needed(type)) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
+ if (arc_is_overflowing()) {
+ mutex_enter(&arc_reclaim_lock);
+
+ /*
+ * Now that we've acquired the lock, we may no longer be
+		 * over the overflow limit, let's check.
+ *
+		 * We're ignoring the case of spurious wakeups. If that
+		 * were to happen, it'd let this thread consume an ARC
+		 * buffer before it should have (i.e. before we're under
+		 * the overflow limit and were signaled by the reclaim
+ * thread). As long as that is a rare occurrence, it
+ * shouldn't cause any harm.
+ */
+ if (arc_is_overflowing()) {
+ cv_signal(&arc_reclaim_thread_cv);
+ cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
}
- goto out;
+
+ mutex_exit(&arc_reclaim_lock);
}
- /*
- * If we are prefetching from the mfu ghost list, this buffer
- * will end up on the mru list; so steal space from there.
- */
- if (state == arc_mfu_ghost)
- state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
- else if (state == arc_mru_ghost)
- state = arc_mru;
-
- if (state == arc_mru || state == arc_anon) {
- uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_mfu->arcs_lsize[type] >= size &&
- arc_p > mru_used) ? arc_mfu : arc_mru;
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_META);
} else {
- /* MFU cases */
- uint64_t mfu_space = arc_c - arc_p;
- state = (arc_mru->arcs_lsize[type] >= size &&
- mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
}
- if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
- }
- ARCSTAT_BUMP(arcstat_recycle_miss);
- }
- ASSERT(buf->b_data != NULL);
-out:
+
/*
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ (void) refcount_add_many(&state->arcs_size, size, buf);
- atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
- if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
size);
@@ -3595,7 +3893,8 @@ out:
* data, and we have outgrown arc_p, update arc_p
*/
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
- arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ (refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
@@ -3638,7 +3937,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/
if (HDR_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- ASSERT(list_link_active(
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -3698,7 +3998,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/
if ((HDR_PREFETCH(hdr)) != 0) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /* link protected by hash_lock */
+ ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@@ -3903,7 +4204,7 @@ arc_read_done(zio_t *zio)
}
/*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
@@ -4070,7 +4371,7 @@ top:
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(hdr->b_l1hdr.b_buf == NULL);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
/* if this is a prefetch, we don't have a reference */
if (*arc_flags & ARC_FLAG_PREFETCH)
@@ -4297,8 +4598,6 @@ arc_clear_callback(arc_buf_t *buf)
kmutex_t *hash_lock;
arc_evict_func_t *efunc = buf->b_efunc;
void *private = buf->b_private;
- list_t *list, *evicted_list;
- kmutex_t *lock, *evicted_lock;
mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
@@ -4334,7 +4633,7 @@ arc_clear_callback(arc_buf_t *buf)
if (hdr->b_l1hdr.b_datacnt > 1) {
mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else {
ASSERT(buf == hdr->b_l1hdr.b_buf);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
@@ -4364,6 +4663,9 @@ arc_release(arc_buf_t *buf, void *tag)
*/
mutex_enter(&buf->b_evict_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
/*
* We don't grab the hash lock prior to this check, because if
* the buffer's header is in the arc_anon state, it won't be
@@ -4449,8 +4751,10 @@ arc_release(arc_buf_t *buf, void *tag)
buf->b_next = NULL;
ASSERT3P(state, !=, arc_l2c_only);
- ASSERT3U(state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&state->arcs_size, -hdr->b_size);
+
+ (void) refcount_remove_many(
+ &state->arcs_size, hdr->b_size, buf);
+
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
ASSERT3P(state, !=, arc_l2c_only);
uint64_t *size = &state->arcs_lsize[type];
@@ -4487,17 +4791,18 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_l1hdr.b_datacnt = 1;
nhdr->b_l1hdr.b_state = arc_anon;
nhdr->b_l1hdr.b_arc_access = 0;
+ nhdr->b_l1hdr.b_tmp_cdata = NULL;
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
mutex_exit(&buf->b_evict_lock);
- atomic_add_64(&arc_anon->arcs_size, blksz);
+ (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
} else {
mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
- /* protected by hash lock */
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_l1hdr.b_arc_access = 0;
@@ -4759,7 +5064,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
- anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
/*
* Writes will, almost always, require additional memory allocations
@@ -4796,7 +5102,7 @@ static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
- size->value.ui64 = state->arcs_size;
+ size->value.ui64 = refcount_count(&state->arcs_size);
evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
}
@@ -4834,6 +5140,41 @@ arc_kstat_update(kstat_t *ksp, int rw)
return (0);
}
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+ arc_buf_hdr_t *hdr = obj;
+
+ /*
+ * We rely on b_dva to generate evenly distributed index
+ * numbers using buf_hash below. So, as an added precaution,
+ * let's make sure we never add empty buffers to the arc lists.
+ */
+ ASSERT(!BUF_EMPTY(hdr));
+
+ /*
+ * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power-of-two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
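
The even spread claimed above depends on the low-order bits of buf_hash being well mixed; a toy sketch of the modulo mapping (the hash values are made up, and 8 sublists is just an example):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	unsigned int num_sublists = 8;		/* e.g. one per CPU */
	uint64_t hashes[] = { 0x1f3a, 0x92c4, 0x077b, 0xde01 };

	for (int i = 0; i < 4; i++)
		printf("hash %#jx -> sublist %ju\n", (uintmax_t)hashes[i],
		    (uintmax_t)(hashes[i] % num_sublists));
	return (0);
}
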
+
#ifdef _KERNEL
static eventhandler_tag arc_event_lowmem = NULL;
@@ -4841,11 +5182,11 @@ static void
arc_lowmem(void *arg __unused, int howto __unused)
{
- mutex_enter(&arc_reclaim_thr_lock);
+ mutex_enter(&arc_reclaim_lock);
/* XXX: Memory deficit should be passed as argument. */
needfree = btoc(arc_c >> arc_shrink_shift);
DTRACE_PROBE(arc__needfree);
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
/*
* It is unsafe to block here in arbitrary threads, because we can come
@@ -4853,8 +5194,8 @@ arc_lowmem(void *arg __unused, int howto __unused)
* with ARC reclaim thread.
*/
if (curproc == pageproc)
- msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
- mutex_exit(&arc_reclaim_thr_lock);
+ (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+ mutex_exit(&arc_reclaim_lock);
}
#endif
@@ -4863,8 +5204,12 @@ arc_init(void)
{
int i, prefetch_tunable_set = 0;
- mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
@@ -4936,6 +5281,9 @@ arc_init(void)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
+ if (zfs_arc_num_sublists_per_state < 1)
+ zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1);
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@@ -4953,45 +5301,59 @@ arc_init(void)
arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
- for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
- mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mru_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mfu->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_l2c_only->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- }
+ multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+
+ refcount_create(&arc_anon->arcs_size);
+ refcount_create(&arc_mru->arcs_size);
+ refcount_create(&arc_mru_ghost->arcs_size);
+ refcount_create(&arc_mfu->arcs_size);
+ refcount_create(&arc_mfu_ghost->arcs_size);
+ refcount_create(&arc_l2c_only->arcs_size);
buf_init();
- arc_thread_exit = 0;
+ arc_reclaim_thread_exit = FALSE;
+ arc_user_evicts_thread_exit = FALSE;
arc_eviction_list = NULL;
- mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
@@ -5011,6 +5373,9 @@ arc_init(void)
EVENTHANDLER_PRI_FIRST);
#endif
+ (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
arc_dead = FALSE;
arc_warm = B_FALSE;
@@ -5069,16 +5434,32 @@ arc_init(void)
void
arc_fini(void)
{
- int i;
+ mutex_enter(&arc_reclaim_lock);
+ arc_reclaim_thread_exit = TRUE;
+ /*
+ * The reclaim thread will set arc_reclaim_thread_exit back to
+ * FALSE when it is finished exiting; we're waiting for that.
+ */
+ while (arc_reclaim_thread_exit) {
+ cv_signal(&arc_reclaim_thread_cv);
+ cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+ }
+ mutex_exit(&arc_reclaim_lock);
- mutex_enter(&arc_reclaim_thr_lock);
- arc_thread_exit = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (arc_thread_exit != 0)
- cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
- mutex_exit(&arc_reclaim_thr_lock);
+ mutex_enter(&arc_user_evicts_lock);
+ arc_user_evicts_thread_exit = TRUE;
+ /*
+ * The user evicts thread will set arc_user_evicts_thread_exit
+ * to FALSE when it is finished exiting; we're waiting for that.
+ */
+ while (arc_user_evicts_thread_exit) {
+ cv_signal(&arc_user_evicts_cv);
+ cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
+ }
+ mutex_exit(&arc_user_evicts_lock);
- arc_flush(NULL);
+ /* Use TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, TRUE);
arc_dead = TRUE;
@@ -5087,24 +5468,28 @@ arc_fini(void)
arc_ksp = NULL;
}
- mutex_destroy(&arc_eviction_mtx);
- mutex_destroy(&arc_reclaim_thr_lock);
- cv_destroy(&arc_reclaim_thr_cv);
+ mutex_destroy(&arc_reclaim_lock);
+ cv_destroy(&arc_reclaim_thread_cv);
+ cv_destroy(&arc_reclaim_waiters_cv);
- for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
- list_destroy(&arc_mru->arcs_lists[i]);
- list_destroy(&arc_mru_ghost->arcs_lists[i]);
- list_destroy(&arc_mfu->arcs_lists[i]);
- list_destroy(&arc_mfu_ghost->arcs_lists[i]);
- list_destroy(&arc_l2c_only->arcs_lists[i]);
+ mutex_destroy(&arc_user_evicts_lock);
+ cv_destroy(&arc_user_evicts_cv);
- mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
- }
+ refcount_destroy(&arc_anon->arcs_size);
+ refcount_destroy(&arc_mru->arcs_size);
+ refcount_destroy(&arc_mru_ghost->arcs_size);
+ refcount_destroy(&arc_mfu->arcs_size);
+ refcount_destroy(&arc_mfu_ghost->arcs_size);
+ refcount_destroy(&arc_l2c_only->arcs_size);
+
+ multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
buf_fini();
@@ -5450,39 +5835,68 @@ l2arc_write_done(zio_t *zio)
if (zio->io_error != 0)
ARCSTAT_BUMP(arcstat_l2_writes_error);
- mutex_enter(&dev->l2ad_mtx);
-
/*
* All writes completed, or an error was hit.
*/
+top:
+ mutex_enter(&dev->l2ad_mtx);
for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order in
+ * which the hash lock and l2ad_mtx are taken).
+ */
if (!mutex_tryenter(hash_lock)) {
/*
- * This buffer misses out. It may be in a stage
- * of eviction. Its ARC_FLAG_L2_WRITING flag will be
- * left set, denying reads to this buffer.
+ * Missed the hash lock. We must retry so we
+ * don't leave the ARC_FLAG_L2_WRITING bit set.
*/
- ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
- continue;
+ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+ /*
+ * We don't want to rescan the headers we've
+ * already marked as having been written out, so
+ * we reinsert the head node so we can pick up
+ * where we left off.
+ */
+ list_remove(buflist, head);
+ list_insert_after(buflist, hdr, head);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * We wait for the hash lock to become available
+ * to try to avoid busy waiting, and to increase
+ * the chance we'll be able to acquire the lock
+ * the next time around.
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
}
/*
- * It's possible that this buffer got evicted from the L1 cache
- * before we grabbed the vdev + hash locks, in which case
- * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
- * Only free the buffer if we still have an L1 hdr.
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
*/
- if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We may have allocated a buffer for L2ARC compression;
+ * if so, we must release it to avoid leaking this data.
+ */
+ l2arc_release_cdata_buf(hdr);
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
+ list_remove(buflist, hdr);
trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
@@ -5496,7 +5910,8 @@ l2arc_write_done(zio_t *zio)
}
/*
- * Allow ARC to begin reads to this L2ARC entry.
+ * Allow ARC to begin reads to, and ghost list evictions of,
+ * this L2ARC entry.
*/
hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
@@ -5604,36 +6019,37 @@ l2arc_read_done(zio_t *zio)
* the data lists. This function returns a locked list, and also returns
* the lock pointer.
*/
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
{
- list_t *list = NULL;
- int idx;
-
- ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
-
- if (list_num < ARC_BUFC_NUMMETADATALISTS) {
- idx = list_num;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
- } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
- ARC_BUFC_NUMDATALISTS)) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else {
- idx = list_num - ARC_BUFC_NUMLISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
}
- ASSERT(!(MUTEX_HELD(*lock)));
- mutex_enter(*lock);
- return (list);
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
}
/*
@@ -5678,6 +6094,12 @@ top:
hdr_prev = list_prev(buflist, hdr);
hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order in
+ * which the hash lock and l2ad_mtx are taken).
+ */
if (!mutex_tryenter(hash_lock)) {
/*
* Missed the hash lock. Retry.
@@ -5733,6 +6155,10 @@ top:
hdr->b_flags |= ARC_FLAG_L2_EVICTED;
}
+ /* Ensure this header has finished being written */
+ ASSERT(!HDR_L2_WRITING(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
arc_hdr_l2hdr_destroy(hdr);
}
mutex_exit(hash_lock);
@@ -5756,11 +6182,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *hdr, *hdr_prev, *head;
- list_t *list;
uint64_t write_asize, write_psize, write_sz, headroom,
buf_compress_minsz;
void *buf_data;
- kmutex_t *list_lock;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
@@ -5790,11 +6214,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Copy buffers for L2ARC writing.
*/
- mutex_enter(&dev->l2ad_mtx);
- for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
+ for (try = 0; try <= 3; try++) {
+ multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
- list = l2arc_list_locked(try, &list_lock);
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
@@ -5804,13 +6227,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* head of the ARC lists rather than the tail.
*/
if (arc_warm == B_FALSE)
- hdr = list_head(list);
+ hdr = multilist_sublist_head(mls);
else
- hdr = list_tail(list);
+ hdr = multilist_sublist_tail(mls);
if (hdr == NULL)
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
- headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
+ headroom = target_sz * l2arc_headroom;
if (do_headroom_boost)
headroom = (headroom * l2arc_headroom_boost) / 100;
@@ -5819,9 +6242,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
uint64_t buf_sz;
if (arc_warm == B_FALSE)
- hdr_prev = list_next(list, hdr);
+ hdr_prev = multilist_sublist_next(mls, hdr);
else
- hdr_prev = list_prev(list, hdr);
+ hdr_prev = multilist_sublist_prev(mls, hdr);
ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
hash_lock = HDR_LOCK(hdr);
@@ -5861,7 +6284,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
+ mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
+ mutex_exit(&dev->l2ad_mtx);
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -5915,7 +6340,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
buf_sz = hdr->b_size;
hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
+ mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
/*
* Compute and store the buffer cksum before
@@ -5929,7 +6356,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
write_sz += buf_sz;
}
- mutex_exit(list_lock);
+ multilist_sublist_unlock(mls);
if (full == B_TRUE)
break;
@@ -5938,12 +6365,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_sz);
- mutex_exit(&dev->l2ad_mtx);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
return (0);
}
+ mutex_enter(&dev->l2ad_mtx);
+
/*
* Now start writing the buffers. We're starting at the write head
* and work backwards, retracing the course of the buffer selector
@@ -5954,6 +6382,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
uint64_t buf_sz;
/*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
* We shouldn't need to lock the buffer here, since we flagged
* it as ARC_FLAG_L2_WRITING in the previous step, but we must
* take care to only access its L2 cache parameters. In
@@ -5981,14 +6417,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
buf_sz = hdr->b_l2hdr.b_asize;
/*
- * If the data has not been compressed, then clear b_tmp_cdata
- * to make sure that it points only to a temporary compression
- * buffer.
- */
- if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
- hdr->b_l1hdr.b_tmp_cdata = NULL;
-
- /*
* We need to do this regardless if buf_sz is zero or
* not, otherwise, when this l2hdr is evicted we'll
* remove a reference that was never added.
@@ -6081,6 +6509,12 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
cdata, l2hdr->b_asize);
+ rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
+ if (rounded > csize) {
+ bzero((char *)cdata + csize, rounded - csize);
+ csize = rounded;
+ }
+
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);
@@ -6089,19 +6523,11 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
hdr->b_l1hdr.b_tmp_cdata = NULL;
ARCSTAT_BUMP(arcstat_l2_compress_zeros);
return (B_TRUE);
- }
-
- rounded = P2ROUNDUP(csize,
- (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
- if (rounded < len) {
+ } else if (csize > 0 && csize < len) {
/*
* Compression succeeded, we'll keep the cdata around for
* writing and release it afterwards.
*/
- if (rounded > csize) {
- bzero((char *)cdata + csize, rounded - csize);
- csize = rounded;
- }
HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
l2hdr->b_asize = csize;
hdr->b_l1hdr.b_tmp_cdata = cdata;
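
The rounding moved above now happens before the zero-size check, so every nonzero compressed buffer is padded out to the next SPA_MINBLOCKSIZE boundary with zeroed pad bytes. A worked instance of the P2ROUNDUP arithmetic, assuming SPA_MINBLOCKSIZE is 512:

#include <stdio.h>

int
main(void)
{
	size_t minblock = 512;		/* assumed SPA_MINBLOCKSIZE */
	size_t csize = 3000;		/* hypothetical compressed length */
	/* P2ROUNDUP for a power-of-two alignment: */
	size_t rounded = (csize + minblock - 1) & ~(minblock - 1);

	/* 3000 -> 3072; the 72 pad bytes get bzero()ed before writing. */
	printf("%zu -> %zu (%zu pad)\n", csize, rounded, rounded - csize);
	return (0);
}
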
@@ -6189,8 +6615,26 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
+ enum zio_compress comp = HDR_GET_COMPRESS(hdr);
+
ASSERT(HDR_HAS_L1HDR(hdr));
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
+ ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
+
+ if (comp == ZIO_COMPRESS_OFF) {
+ /*
+ * In this case, b_tmp_cdata points to the same buffer
+ * as the arc_buf_t's b_data field. We don't want to
+ * free it, since the arc_buf_t will handle that.
+ */
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
+ } else if (comp == ZIO_COMPRESS_EMPTY) {
+ /*
+ * In this case, b_tmp_cdata was compressed to an empty
+ * buffer, thus there's nothing to free and b_tmp_cdata
+ * should have been set to NULL in l2arc_write_buffers().
+ */
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+ } else {
/*
* If the data was compressed, then we've allocated a
* temporary buffer for it, so now we need to release it.
@@ -6199,9 +6643,8 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
hdr->b_size);
hdr->b_l1hdr.b_tmp_cdata = NULL;
- } else {
- ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
}
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
index 5f7d76f..b2b9887 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
@@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int err;
struct bptree_args *ba = arg;
- if (BP_IS_HOLE(bp))
+ if (bp == NULL || BP_IS_HOLE(bp))
return (0);
err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
new file mode 100644
index 0000000..1ddc697
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue The maximum capacity of the queue is set to
+ * size. Types that want to be stored in a bqueue must contain a bqueue_node_t,
+ * and offset should give its offset from the start of the struct. Return 0 on
+ * success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ return (0);
+}
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ mutex_destroy(&q->bq_lock);
+ list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming item_size units of capacity. If there is
+ * insufficient capacity to consume item_size units, block until enough
+ * capacity exists. Asserts item_size is > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ cv_signal(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ mutex_exit(&q->bq_lock);
+ cv_signal(&q->bq_add_cv);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
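
dmu_send.c below drives this API from a traversal thread; here is a minimal usage sketch with a hypothetical record type (my_record_t and example() are illustrative and not part of this change):

#include <sys/bqueue.h>
#include <sys/zfs_context.h>

/* Any struct can be queued as long as it embeds a bqueue_node_t. */
typedef struct my_record {
	uint64_t	payload;
	bqueue_node_t	node;
} my_record_t;

static void
example(void)
{
	bqueue_t q;
	my_record_t *r;

	(void) bqueue_init(&q, 16 * 1024 * 1024,
	    offsetof(my_record_t, node));

	r = kmem_zalloc(sizeof (*r), KM_SLEEP);
	r->payload = 42;
	bqueue_enqueue(&q, r, sizeof (*r));	/* may block while full */

	r = bqueue_dequeue(&q);			/* blocks while empty */
	kmem_free(r, sizeof (*r));

	bqueue_destroy(&q);			/* asserts the queue drained */
}
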
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 79b6aed..16d8a2e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
return (abuf);
}
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
- if (dn->dn_datablkshift) {
- return (offset >> dn->dn_datablkshift);
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n is the number of block
+ * pointers in an indirect block, raised to the power of level.
+ * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+ return (offset >> (dn->dn_datablkshift + level *
+ (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else {
ASSERT3U(offset, <, dn->dn_datablksz);
return (0);
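
A worked instance of the shift derived above, assuming 128K data blocks (datablkshift = 17) and 16K indirect blocks (indblkshift = 14, so with SPA_BLKPTRSHIFT = 7 each indirect holds 2^7 = 128 block pointers):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int datablkshift = 17;		/* assumed 128K level-0 blocks */
	int epbs = 14 - 7;		/* indblkshift - SPA_BLKPTRSHIFT */
	uint64_t offset = 100ULL << 20;	/* byte 100M of the object */

	uint64_t l0 = offset >> datablkshift;		/* blkid 800 */
	uint64_t l1 = offset >> (datablkshift + epbs);	/* blkid 6 */

	printf("level 0 -> %ju, level 1 -> %ju\n",
	    (uintmax_t)l0, (uintmax_t)l1);
	return (0);
}
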
@@ -1549,6 +1573,11 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
struct dirty_leaf *dl;
dmu_object_type_t type;
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
DB_DNODE_ENTER(db);
type = DB_DNODE(db)->dn_type;
DB_DNODE_EXIT(db);
@@ -1715,6 +1744,12 @@ dbuf_clear(dmu_buf_impl_t *db)
dbuf_rele(parent, db);
}
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1755,7 +1790,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
} else if (level < nlevels-1) {
/* this block is referenced from an indirect block */
int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
@@ -1930,11 +1965,96 @@ dbuf_destroy(dmu_buf_impl_t *db)
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+ int dpa_curlevel; /* The current level that we're reading */
+ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return;
+
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+ ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+ ASSERT(dpa->dpa_zio != NULL);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+ ASSERT3S(dpa->dpa_curlevel, >, 0);
+ if (zio != NULL) {
+ ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+ ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+ ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+ }
+
+ dpa->dpa_curlevel--;
+
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+ P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+ if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ kmem_free(dpa, sizeof (*dpa));
+ } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+ dbuf_issue_final_prefetch(dpa, bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+ SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+ dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+ (void) arc_buf_remove_ref(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete.
+ */
void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
{
- dmu_buf_impl_t *db = NULL;
- blkptr_t *bp = NULL;
+ blkptr_t bp;
+ int epbs, nlevels, curlevel;
+ uint64_t curblkid;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -1942,35 +2062,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
if (dnode_block_freed(dn, blkid))
return;
- /* dbuf_find() returns with db_mtx held */
- if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) {
+ /*
+ * This dnode hasn't been written to disk yet, so there's nothing to
+ * prefetch.
+ */
+ nlevels = dn->dn_phys->dn_nlevels;
+ if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+ return;
+
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+ return;
+
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+ level, blkid);
+ if (db != NULL) {
+ mutex_exit(&db->db_mtx);
/*
- * This dbuf is already in the cache. We assume that
- * it is already CACHED, or else about to be either
- * read or filled.
+ * This dbuf already exists. It is either CACHED, or
+ * (we assume) about to be read or filled.
*/
- mutex_exit(&db->db_mtx);
return;
}
- if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
- if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- arc_flags_t aflags =
- ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
- zbookmark_phys_t zb;
+ /*
+ * Find the closest ancestor (indirect block) of the target block
+ * that is present in the cache. In this indirect block, we will
+ * find the bp that is at curlevel, curblkid.
+ */
+ curlevel = level;
+ curblkid = blkid;
+ while (curlevel < nlevels - 1) {
+ int parent_level = curlevel + 1;
+ uint64_t parent_blkid = curblkid >> epbs;
+ dmu_buf_impl_t *db;
+
+ if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+ FALSE, TRUE, FTAG, &db) == 0) {
+ blkptr_t *bpp = db->db_buf->b_data;
+ bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+ dbuf_rele(db, FTAG);
+ break;
+ }
- SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, 0, blkid);
+ curlevel = parent_level;
+ curblkid = parent_blkid;
+ }
- (void) arc_read(NULL, dn->dn_objset->os_spa,
- bp, NULL, NULL, prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zb);
- }
- if (db)
- dbuf_rele(db, NULL);
+ if (curlevel == nlevels - 1) {
+ /* No cached indirect blocks found. */
+ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+ bp = dn->dn_phys->dn_blkptr[curblkid];
+ }
+ if (BP_IS_HOLE(&bp))
+ return;
+
+ ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, level, blkid);
+ dpa->dpa_curlevel = curlevel;
+ dpa->dpa_prio = prio;
+ dpa->dpa_aflags = aflags;
+ dpa->dpa_spa = dn->dn_objset->os_spa;
+ dpa->dpa_epbs = epbs;
+ dpa->dpa_zio = pio;
+
+ /*
+ * If we have the indirect just above us, no need to do the asynchronous
+ * prefetch chain; we'll just run the last step ourselves. If we're at
+ * a higher level, though, we want to issue the prefetches for all the
+ * indirect blocks asynchronously, so we can go on with whatever we were
+ * doing.
+ */
+ if (curlevel == level) {
+ ASSERT3U(curblkid, ==, blkid);
+ dbuf_issue_final_prefetch(dpa, &bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, curlevel, curblkid);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
}
+ /*
+ * We use pio here instead of dpa_zio since it's possible that
+ * dpa may have already been freed.
+ */
+ zio_nowait(pio);
}
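
The ancestor search above climbs one level per iteration, shifting the blkid right by epbs each time until it finds a cached indirect block or reaches the dnode's top level. A sketch of the climb, assuming epbs = 7:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int epbs = 7;			/* assumed 128 bps per indirect */
	int nlevels = 3, level = 0;
	uint64_t blkid = 100000;

	while (level < nlevels - 1) {
		level++;
		blkid >>= epbs;		/* 100000 -> 781 -> 6 */
		printf("level %d, blkid %ju\n", level, (uintmax_t)blkid);
	}
	return (0);
}
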
/*
@@ -1978,7 +2167,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
* Note: dn_struct_rwlock must be held.
*/
int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
@@ -1996,6 +2186,9 @@ top:
blkptr_t *bp = NULL;
int err;
+ if (fail_uncached)
+ return (SET_ERROR(ENOENT));
+
ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
@@ -2012,6 +2205,11 @@ top:
db = dbuf_create(dn, level, blkid, parent, bp);
}
+ if (fail_uncached && db->db_state != DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
if (db->db_buf && refcount_is_zero(&db->db_holds)) {
arc_buf_add_ref(db->db_buf, db);
if (db->db_buf->b_data == NULL) {
@@ -2067,16 +2265,14 @@ top:
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
+ return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db);
}
@@ -2429,8 +2625,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (parent == NULL) {
mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, db, &parent);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx);
db->db_parent = parent;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index f45071b..91cd511 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -141,7 +141,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
@@ -424,7 +424,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
if (db == NULL) {
@@ -528,17 +528,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
}
/*
- * Issue prefetch i/os for the given blocks.
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
*
- * Note: The assumption is that we *know* these blocks will be needed
- * almost immediately. Therefore, the prefetch i/os will be issued at
- * ZIO_PRIORITY_SYNC_READ
- *
- * Note: indirect blocks and other metadata will be read synchronously,
- * causing this function to block if they are not already cached.
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
*/
void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
uint64_t blkid;
@@ -554,8 +553,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
return;
}
@@ -570,18 +570,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
- P2ALIGN(offset, 1 << blkshift)) >> blkshift;
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+ * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is
+ * the last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is
+ * last - first + 1.
+ */
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
} else {
nblks = (offset < dn->dn_datablksz);
}
if (nblks != 0) {
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, level, offset);
for (int i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
rw_exit(&dn->dn_struct_rwlock);
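
A worked instance of the count derived above, assuming level 0 and 128K (2^17) data blocks:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int shift = 17;				/* assumed 128K blocks */
	uint64_t offset = 96ULL << 10;		/* 96K */
	uint64_t len = 300ULL << 10;		/* 300K */

	uint64_t first = offset >> shift;		/* block 0 */
	uint64_t last = (offset + len - 1) >> shift;	/* block 3 */

	printf("nblks = %ju\n", (uintmax_t)(last - first + 1));	/* 4 */
	return (0);
}
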
@@ -1393,7 +1399,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(dbuf);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
index bd9e894..e88968b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (issig(JUSTLOOKING) && issig(FORREAL))
return (SET_ERROR(EINTR));
- if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
return (0);
if (BP_IS_HOLE(bp)) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 808864a..6ca021e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (0);
}
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index be1f46d..267aa35 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
@@ -53,6 +53,7 @@
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
+#include <sys/bqueue.h>
#ifdef __FreeBSD__
#undef dump_write
@@ -61,10 +62,34 @@
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";
+#define BP_SPAN(datablkszsec, indblkshift, level) \
+ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (indblkshift - SPA_BLKPTRSHIFT)))
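
A worked instance of the macro above, assuming 128K data blocks (datablkszsec = 256 sectors) and 16K indirects (indblkshift = 14), with SPA_MINBLOCKSHIFT = 9 and SPA_BLKPTRSHIFT = 7 spelled out as literals:

#include <stdint.h>
#include <stdio.h>

#define BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)(datablkszsec)) << (9 + \
	(level) * ((indblkshift) - 7)))

int
main(void)
{
	/* Logical bytes covered: 128K at level 0, 16M at 1, 2G at 2. */
	for (int level = 0; level < 3; level++)
		printf("level %d spans %ju bytes\n", level,
		    (uintmax_t)BP_SPAN(256, 14, level));
	return (0);
}
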
+
+struct send_thread_arg {
+ bqueue_t q;
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+};
+
+struct send_block_record {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ blkptr_t bp;
+ zbookmark_phys_t zb;
+ uint8_t indblkshift;
+ uint16_t datablkszsec;
+ bqueue_node_t ln;
+};
+
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
@@ -455,58 +480,116 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
return (B_FALSE);
}
-#define BP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/*
+ * This is the callback function to traverse_dataset; it runs in the worker
+ * thread that feeds records to dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct send_thread_arg *sta = arg;
+ struct send_block_record *record;
+ uint64_t record_size;
+ int err = 0;
-/* ARGSUSED */
+ if (sta->cancel)
+ return (SET_ERROR(EINTR));
+
+ if (bp == NULL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ }
+
+ record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+ record->bp = *bp;
+ record->zb = *zb;
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablkszsec = dnp->dn_datablkszsec;
+ record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ bqueue_enqueue(&sta->q, record, record_size);
+
+ return (err);
+}
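
Each queued record is charged at the logical data block size, so the queue's 16MB budget (zfs_send_queue_length above) bounds how far the traversal can run ahead of the dump loop. A worked instance, assuming 128K blocks (dn_datablkszsec = 256, SPA_MINBLOCKSHIFT = 9):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t datablkszsec = 256;	/* assumed: 256 512-byte sectors */
	uint64_t record_size = (uint64_t)datablkszsec << 9;

	/* 131072: roughly 128 such records fit in a 16MB queue budget. */
	printf("record_size = %ju\n", (uintmax_t)record_size);
	return (0);
}
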
+
+/*
+ * This function kicks off the traverse_dataset call. It also handles setting
+ * the error code of the thread in case something goes wrong, and pushes the
+ * End of Stream record when the traversal has finished. If there is no
+ * dataset to traverse, the thread immediately pushes the End of Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+ struct send_thread_arg *st_arg = arg;
+ int err;
+ struct send_block_record *data;
+
+ if (st_arg->ds != NULL) {
+ err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
+ st_arg->flags, send_cb, arg);
+ if (err != EINTR)
+ st_arg->error_code = err;
+ }
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ data->eos_marker = B_TRUE;
+ bqueue_enqueue(&st_arg->q, data, 1);
+ thread_exit();
+}
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
- dmu_sendarg_t *dsp = arg;
+ dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+ const blkptr_t *bp = &data->bp;
+ const zbookmark_phys_t *zb = &data->zb;
+ uint8_t indblkshift = data->indblkshift;
+ uint16_t dblkszsec = data->datablkszsec;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
int err = 0;
- if (issig(JUSTLOOKING) && issig(FORREAL))
- return (SET_ERROR(EINTR));
+ ASSERT3U(zb->zb_level, >=, 0);
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
- } else if (zb->zb_level == ZB_ZIL_LEVEL) {
- /*
- * If we are sending a non-snapshot (which is allowed on
- * read-only pools), it may have a ZIL, which must be ignored.
- */
- return (0);
} else if (BP_IS_HOLE(bp) &&
zb->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t span = BP_SPAN(dnp, zb->zb_level);
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
- err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
+ err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
} else if (BP_IS_HOLE(bp)) {
- uint64_t span = BP_SPAN(dnp, zb->zb_level);
- err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+ uint64_t offset = zb->zb_blkid * span;
+ err = dump_free(dsa, zb->zb_object, offset, span);
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- dnode_phys_t *blk;
- int i;
int blksz = BP_GET_LSIZE(bp);
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
+ ASSERT0(zb->zb_level);
+
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&aflags, zb) != 0)
return (SET_ERROR(EIO));
- blk = abuf->b_data;
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj = (zb->zb_blkid <<
- (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
- err = dump_dnode(dsp, dnobj, blk+i);
+ dnode_phys_t *blk = abuf->b_data;
+ uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+ for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
}
@@ -521,20 +604,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
&aflags, zb) != 0)
return (SET_ERROR(EIO));
- err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
+ err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
- } else if (backup_do_embed(dsp, bp)) {
+ } else if (backup_do_embed(dsa, bp)) {
/* it's an embedded level-0 block of a regular object */
- int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- err = dump_write_embedded(dsp, zb->zb_object,
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ ASSERT0(zb->zb_level);
+ err = dump_write_embedded(dsa, zb->zb_object,
zb->zb_blkid * blksz, blksz, bp);
- } else { /* it's a level-0 block of a regular object */
+ } else {
+ /* it's a level-0 block of a regular object */
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
- int blksz = BP_GET_LSIZE(bp);
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
uint64_t offset;
- ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@@ -555,20 +639,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
offset = zb->zb_blkid * blksz;
- if (!(dsp->dsa_featureflags &
+ if (!(dsa->dsa_featureflags &
DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
blksz > SPA_OLD_MAXBLOCKSIZE) {
char *buf = abuf->b_data;
while (blksz > 0 && err == 0) {
int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
- err = dump_write(dsp, type, zb->zb_object,
+ err = dump_write(dsa, type, zb->zb_object,
offset, n, NULL, buf);
offset += n;
buf += n;
blksz -= n;
}
} else {
- err = dump_write(dsp, type, zb->zb_object,
+ err = dump_write(dsa, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
@@ -579,11 +663,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
/*
- * Releases dp using the specified tag.
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+ struct send_block_record *tmp = bqueue_dequeue(bq);
+ kmem_free(data, sizeof (*data));
+ return (tmp);
+}
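The helper above fixes the queue's ownership rule: the consumer holds exactly
one record at a time and frees it only after dequeuing its successor. A
consumer loop built on it takes the shape below (a hedged sketch using names
from this patch, matching the loop in dmu_send_impl further down):

	struct send_block_record *data = bqueue_dequeue(&to_arg.q);
	while (!data->eos_marker && err == 0) {
		err = do_dump(dsp, data);
		data = get_next_record(&to_arg.q, data);
	}
	/* On error, keep draining until eos so the producer can exit. */
	while (!data->eos_marker)
		data = get_next_record(&to_arg.q, data);
	kmem_free(data, sizeof (*data));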
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
*/
static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
- zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+ zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
#ifdef illumos
boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
#else
@@ -596,8 +693,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
int err;
uint64_t fromtxg = 0;
uint64_t featureflags = 0;
+ struct send_thread_arg to_arg;
- err = dmu_objset_from_ds(ds, &os);
+ err = dmu_objset_from_ds(to_ds, &os);
if (err != 0) {
dsl_pool_rele(dp, tag);
return (err);
@@ -623,35 +721,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
- if (large_block_ok && ds->ds_large_blocks)
+ if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
- } else {
- embedok = B_FALSE;
}
DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
featureflags);
drr->drr_u.drr_begin.drr_creation_time =
- dsl_dataset_phys(ds)->ds_creation_time;
+ dsl_dataset_phys(to_ds)->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
if (is_clone)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
- drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
- if (fromzb != NULL) {
- drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
- fromtxg = fromzb->zbm_creation_txg;
+ if (ancestor_zb != NULL) {
+ drr->drr_u.drr_begin.drr_fromguid =
+ ancestor_zb->zbm_guid;
+ fromtxg = ancestor_zb->zbm_creation_txg;
}
- dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
- if (!ds->ds_is_snapshot) {
+ dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+ if (!to_ds->ds_is_snapshot) {
(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
sizeof (drr->drr_u.drr_begin.drr_toname));
}
@@ -665,16 +762,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
dsp->dsa_fp = fp;
dsp->dsa_os = os;
dsp->dsa_off = off;
- dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
+ dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
- dsp->dsa_incremental = (fromzb != NULL);
+ dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
- mutex_enter(&ds->ds_sendstream_lock);
- list_insert_head(&ds->ds_sendstreams, dsp);
- mutex_exit(&ds->ds_sendstream_lock);
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_insert_head(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
- dsl_dataset_long_hold(ds, FTAG);
+ dsl_dataset_long_hold(to_ds, FTAG);
dsl_pool_rele(dp, tag);
if (dump_record(dsp, NULL, 0) != 0) {
@@ -682,8 +779,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
goto out;
}
- err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
- backup_cb, dsp);
+ err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+ offsetof(struct send_block_record, ln));
+ to_arg.error_code = 0;
+ to_arg.cancel = B_FALSE;
+ to_arg.ds = to_ds;
+ to_arg.fromtxg = fromtxg;
+ to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+ (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
+ TS_RUN, minclsyspri);
+
+ struct send_block_record *to_data;
+ to_data = bqueue_dequeue(&to_arg.q);
+
+ while (!to_data->eos_marker && err == 0) {
+ err = do_dump(dsp, to_data);
+ to_data = get_next_record(&to_arg.q, to_data);
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ err = EINTR;
+ }
+
+ if (err != 0) {
+ to_arg.cancel = B_TRUE;
+ while (!to_data->eos_marker) {
+ to_data = get_next_record(&to_arg.q, to_data);
+ }
+ }
+ kmem_free(to_data, sizeof (*to_data));
+
+ bqueue_destroy(&to_arg.q);
+
+ if (err == 0 && to_arg.error_code != 0)
+ err = to_arg.error_code;
+
+ if (err != 0)
+ goto out;
if (dsp->dsa_pending_op != PENDING_NONE)
if (dump_record(dsp, NULL, 0) != 0)
@@ -700,20 +830,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
- if (dump_record(dsp, NULL, 0) != 0) {
+ if (dump_record(dsp, NULL, 0) != 0)
err = dsp->dsa_err;
- goto out;
- }
out:
- mutex_enter(&ds->ds_sendstream_lock);
- list_remove(&ds->ds_sendstreams, dsp);
- mutex_exit(&ds->ds_sendstream_lock);
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_remove(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
kmem_free(drr, sizeof (dmu_replay_record_t));
kmem_free(dsp, sizeof (dmu_sendarg_t));
- dsl_dataset_long_rele(ds, FTAG);
+ dsl_dataset_long_rele(to_ds, FTAG);
return (err);
}
@@ -1144,7 +1272,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
* If it's a non-clone incremental, we are missing the
* target fs, so fail the recv.
*/
- if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+ drba->drba_origin))
return (SET_ERROR(ENOENT));
/* Open the parent of tofs */
@@ -1250,13 +1379,6 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
- if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- !newds->ds_large_blocks) {
- dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
- newds->ds_large_blocks = B_TRUE;
- }
-
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1326,22 +1448,58 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
&drba, 5, ZFS_SPACE_CHECK_NORMAL));
}
-struct restorearg {
+struct receive_record_arg {
+ dmu_replay_record_t header;
+ void *payload; /* Pointer to a buffer containing the payload */
+ /*
+ * If the record is a write, pointer to the arc_buf_t containing the
+ * payload.
+ */
+ arc_buf_t *write_buf;
+ int payload_size;
+ boolean_t eos_marker; /* Marks the end of the stream */
+ bqueue_node_t node;
+};
+
+struct receive_writer_arg {
objset_t *os;
- int err;
boolean_t byteswap;
+ bqueue_t q;
+ /*
+ * These three args are used to signal to the main thread that we're
+ * done.
+ */
+ kmutex_t mutex;
+ kcondvar_t cv;
+ boolean_t done;
+ int err;
+ /* A map from guid to dataset to help handle dedup'd streams. */
+ avl_tree_t *guid_to_ds_map;
+};
+
+struct receive_arg {
+ objset_t *os;
kthread_t *td;
struct file *fp;
- uint64_t voff;
- int bufsize; /* amount of memory allocated for buf */
-
- dmu_replay_record_t *drr;
- dmu_replay_record_t *next_drr;
- char *buf;
+ uint64_t voff; /* The current offset in the stream */
+ /*
+ * A record that has had its payload read in, but hasn't yet been handed
+ * off to the worker thread.
+ */
+ struct receive_record_arg *rrd;
+ /* A record that has had its header read in, but not its payload. */
+ struct receive_record_arg *next_rrd;
zio_cksum_t cksum;
zio_cksum_t prev_cksum;
+ int err;
+ boolean_t byteswap;
+ /* Sorted list of objects not to issue prefetches for. */
+ list_t ignore_obj_list;
+};
- avl_tree_t *guid_to_ds_map;
+struct receive_ign_obj_node {
+ list_node_t node;
+ uint64_t object;
};
typedef struct guid_map_entry {
@@ -1380,7 +1538,7 @@ free_guid_map_onexit(void *arg)
}
static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
+restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
struct uio auio;
struct iovec aiov;
@@ -1406,13 +1564,12 @@ restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *res
}
static int
-restore_read(struct restorearg *ra, int len, void *buf)
+receive_read(struct receive_arg *ra, int len, void *buf)
{
int done = 0;
/* some things will require 8-byte alignment, so everything must be 8-byte aligned */
ASSERT0(len % 8);
- ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1529,7 +1686,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
}
static int
-restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ void *data)
{
dmu_object_info_t doi;
dmu_tx_t *tx;
@@ -1543,12 +1701,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
- err = dmu_object_info(ra->os, drro->drr_object, &doi);
+ err = dmu_object_info(rwa->os, drro->drr_object, &doi);
if (err != 0 && err != ENOENT)
return (SET_ERROR(EINVAL));
@@ -1567,14 +1725,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
if (drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr) {
- err = dmu_free_long_range(ra->os, drro->drr_object,
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
}
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_bonus(tx, object);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
@@ -1584,7 +1742,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- err = dmu_object_claim(ra->os, drro->drr_object,
+ err = dmu_object_claim(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen, tx);
} else if (drro->drr_type != doi.doi_type ||
@@ -1592,7 +1750,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
drro->drr_bonustype != doi.doi_bonus_type ||
drro->drr_bonuslen != doi.doi_bonus_size) {
/* currently allocated, but with different properties */
- err = dmu_object_reclaim(ra->os, drro->drr_object,
+ err = dmu_object_reclaim(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen, tx);
}
@@ -1601,20 +1759,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
return (SET_ERROR(EINVAL));
}
- dmu_object_set_checksum(ra->os, drro->drr_object,
+ dmu_object_set_checksum(rwa->os, drro->drr_object,
drro->drr_checksumtype, tx);
- dmu_object_set_compress(ra->os, drro->drr_object,
+ dmu_object_set_compress(rwa->os, drro->drr_object,
drro->drr_compress, tx);
if (data != NULL) {
dmu_buf_t *db;
- VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
bcopy(data, db->db_data, drro->drr_bonuslen);
- if (ra->byteswap) {
+ if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drro->drr_bonustype);
dmu_ot_byteswap[byteswap].ob_func(db->db_data,
@@ -1628,7 +1786,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
/* ARGSUSED */
static int
-restore_freeobjects(struct restorearg *ra,
+receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
@@ -1638,13 +1796,13 @@ restore_freeobjects(struct restorearg *ra,
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
- (void) dmu_object_next(ra->os, &obj, FALSE, 0)) {
+ (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
- if (dmu_object_info(ra->os, obj, NULL) != 0)
+ if (dmu_object_info(rwa->os, obj, NULL) != 0)
continue;
- err = dmu_free_long_object(ra->os, obj);
+ err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
}
@@ -1652,7 +1810,8 @@ restore_freeobjects(struct restorearg *ra,
}
static int
-restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+ arc_buf_t *abuf)
{
dmu_tx_t *tx;
int err;
@@ -1661,10 +1820,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
- if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrw->drr_object,
drrw->drr_offset, drrw->drr_length);
@@ -1673,7 +1832,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
dmu_tx_abort(tx);
return (err);
}
- if (ra->byteswap) {
+ if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drrw->drr_type);
dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
@@ -1681,7 +1840,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
}
dmu_buf_t *bonus;
- if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0)
+ if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
return (SET_ERROR(EINVAL));
dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
dmu_tx_commit(tx);
@@ -1697,7 +1856,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
* data from the stream to fulfill this write.
*/
static int
-restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
+receive_write_byref(struct receive_writer_arg *rwa,
+ struct drr_write_byref *drrwbr)
{
dmu_tx_t *tx;
int err;
@@ -1716,14 +1876,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
*/
if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
gmesrch.guid = drrwbr->drr_refguid;
- if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+ if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
&where)) == NULL) {
return (SET_ERROR(EINVAL));
}
if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
return (SET_ERROR(EINVAL));
} else {
- ref_os = ra->os;
+ ref_os = rwa->os;
}
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
@@ -1731,7 +1891,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
if (err != 0)
return (err);
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length);
@@ -1740,7 +1900,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
dmu_tx_abort(tx);
return (err);
}
- dmu_write(ra->os, drrwbr->drr_object,
+ dmu_write(rwa->os, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
dmu_buf_rele(dbp, FTAG);
dmu_tx_commit(tx);
@@ -1748,7 +1908,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
}
static int
-restore_write_embedded(struct restorearg *ra,
+receive_write_embedded(struct receive_writer_arg *rwa,
struct drr_write_embedded *drrwnp, void *data)
{
dmu_tx_t *tx;
@@ -1765,7 +1925,7 @@ restore_write_embedded(struct restorearg *ra,
if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwnp->drr_object,
drrwnp->drr_offset, drrwnp->drr_length);
@@ -1775,36 +1935,37 @@ restore_write_embedded(struct restorearg *ra,
return (err);
}
- dmu_write_embedded(ra->os, drrwnp->drr_object,
+ dmu_write_embedded(rwa->os, drrwnp->drr_object,
drrwnp->drr_offset, data, drrwnp->drr_etype,
drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
- ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
dmu_tx_commit(tx);
return (0);
}
static int
-restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+ void *data)
{
dmu_tx_t *tx;
dmu_buf_t *db, *db_spill;
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os)))
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
return (SET_ERROR(EINVAL));
- if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG);
return (err);
}
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_spill(tx, db->db_object);
@@ -1831,7 +1992,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
/* ARGSUSED */
static int
-restore_free(struct restorearg *ra, struct drr_free *drrf)
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
{
int err;
@@ -1839,11 +2000,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf)
drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
return (SET_ERROR(EINVAL));
- if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- err = dmu_free_long_range(ra->os, drrf->drr_object,
+ err = dmu_free_long_range(rwa->os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
+
return (err);
}
@@ -1858,7 +2020,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
}
static void
-restore_cksum(struct restorearg *ra, int len, void *buf)
+receive_cksum(struct receive_arg *ra, int len, void *buf)
{
if (ra->byteswap) {
fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
@@ -1868,30 +2030,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf)
}
/*
- * If len != 0, read payload into buf.
- * Read next record's header into ra->next_drr.
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
* Verify checksum of payload and next record.
*/
static int
-restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
{
int err;
if (len != 0) {
- ASSERT3U(len, <=, ra->bufsize);
- err = restore_read(ra, len, buf);
+ ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+ ra->rrd->payload = buf;
+ ra->rrd->payload_size = len;
+ err = receive_read(ra, len, ra->rrd->payload);
if (err != 0)
return (err);
- restore_cksum(ra, len, buf);
+ receive_cksum(ra, len, ra->rrd->payload);
}
ra->prev_cksum = ra->cksum;
- err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr);
- if (err != 0)
+ ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+ err = receive_read(ra, sizeof (ra->next_rrd->header),
+ &ra->next_rrd->header);
+ if (err != 0) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
return (err);
- if (ra->next_drr->drr_type == DRR_BEGIN)
+ }
+ if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
return (SET_ERROR(EINVAL));
+ }
/*
* Note: checksum is of everything up to but not including the
@@ -1899,107 +2073,248 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
*/
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- restore_cksum(ra,
+ receive_cksum(ra,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- ra->next_drr);
+ &ra->next_rrd->header);
- zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum;
- zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t cksum_orig =
+ ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t *cksump =
+ &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
if (ra->byteswap)
- byteswap_record(ra->next_drr);
+ byteswap_record(&ra->next_rrd->header);
if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
- !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump))
+ !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
return (SET_ERROR(ECKSUM));
+ }
- restore_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+ receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
return (0);
}
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object. We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches). We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records). As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+ uint64_t object, uint64_t offset, uint64_t length)
+{
+ struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
+ while (node != NULL && node->object < object) {
+ VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&ra->ignore_obj_list);
+ }
+ if (node == NULL || node->object > object) {
+ dmu_prefetch(ra->os, object, 1, offset, length,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+}
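To make the trimming rule concrete: if ignore_obj_list holds objects {5, 9}
and a write record for object 7 arrives, the loop above frees node 5 (5 < 7)
and stops at node 9; since 9 > 7, the prefetch for object 7 is issued. A
later record for object 9 would match the head node exactly and be skipped.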
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
static int
-restore_process_record(struct restorearg *ra)
+receive_read_record(struct receive_arg *ra)
{
int err;
- switch (ra->drr->drr_type) {
+ switch (ra->rrd->header.drr_type) {
case DRR_OBJECT:
{
- struct drr_object *drro = &ra->drr->drr_u.drr_object;
- err = restore_read_payload_and_next_header(ra,
- P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
- if (err != 0)
+ struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+ uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+ dmu_object_info_t doi;
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
return (err);
- return (restore_object(ra, drro, ra->buf));
+ }
+ err = dmu_object_info(ra->os, drro->drr_object, &doi);
+ /*
+ * See receive_read_prefetch for an explanation why we're
+ * storing this object in the ignore_obj_list.
+ */
+ if (err == ENOENT ||
+ (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+ struct receive_ign_obj_node *node =
+ kmem_zalloc(sizeof (*node),
+ KM_SLEEP);
+ node->object = drro->drr_object;
+#ifdef ZFS_DEBUG
+ struct receive_ign_obj_node *last_object =
+ list_tail(&ra->ignore_obj_list);
+ uint64_t last_objnum = (last_object != NULL ?
+ last_object->object : 0);
+ ASSERT3U(node->object, >, last_objnum);
+#endif
+ list_insert_tail(&ra->ignore_obj_list, node);
+ err = 0;
+ }
+ return (err);
}
case DRR_FREEOBJECTS:
{
- struct drr_freeobjects *drrfo =
- &ra->drr->drr_u.drr_freeobjects;
- err = restore_read_payload_and_next_header(ra, 0, NULL);
- if (err != 0)
- return (err);
- return (restore_freeobjects(ra, drrfo));
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
}
case DRR_WRITE:
{
- struct drr_write *drrw = &ra->drr->drr_u.drr_write;
+ struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
drrw->drr_length);
- err = restore_read_payload_and_next_header(ra,
+ err = receive_read_payload_and_next_header(ra,
drrw->drr_length, abuf->b_data);
- if (err != 0)
- return (err);
- err = restore_write(ra, drrw, abuf);
- /* if restore_write() is successful, it consumes the arc_buf */
- if (err != 0)
+ if (err != 0) {
dmu_return_arcbuf(abuf);
+ return (err);
+ }
+ ra->rrd->write_buf = abuf;
+ receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+ drrw->drr_length);
return (err);
}
case DRR_WRITE_BYREF:
{
- struct drr_write_byref *drrwbr =
- &ra->drr->drr_u.drr_write_byref;
- err = restore_read_payload_and_next_header(ra, 0, NULL);
- if (err != 0)
- return (err);
- return (restore_write_byref(ra, drrwbr));
+ struct drr_write_byref *drrwb =
+ &ra->rrd->header.drr_u.drr_write_byref;
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+ drrwb->drr_length);
+ return (err);
}
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded *drrwe =
- &ra->drr->drr_u.drr_write_embedded;
- err = restore_read_payload_and_next_header(ra,
- P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
- if (err != 0)
+ &ra->rrd->header.drr_u.drr_write_embedded;
+ uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
return (err);
- return (restore_write_embedded(ra, drrwe, ra->buf));
+ }
+
+ receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+ drrwe->drr_length);
+ return (err);
}
case DRR_FREE:
{
- struct drr_free *drrf = &ra->drr->drr_u.drr_free;
- err = restore_read_payload_and_next_header(ra, 0, NULL);
- if (err != 0)
- return (err);
- return (restore_free(ra, drrf));
+ /*
+ * It might be beneficial to prefetch indirect blocks here, but
+ * we don't really have the data to decide for sure.
+ */
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
}
case DRR_END:
{
- struct drr_end *drre = &ra->drr->drr_u.drr_end;
+ struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
return (SET_ERROR(EINVAL));
return (0);
}
case DRR_SPILL:
{
- struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
- err = restore_read_payload_and_next_header(ra,
- drrs->drr_length, ra->buf);
+ struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+ void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+ err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+ buf);
if (err != 0)
- return (err);
- return (restore_spill(ra, drrs, ra->buf));
+ kmem_free(buf, drrs->drr_length);
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err;
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ return (receive_freeobjects(rwa, drrfo));
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ err = receive_write(rwa, drrw, rrd->write_buf);
+ /* if receive_write() is successful, it consumes the arc_buf */
+ if (err != 0)
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ return (receive_write_byref(rwa, drrwbr));
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ return (receive_free(rwa, drrf));
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
}
default:
return (SET_ERROR(EINVAL));
@@ -2007,6 +2322,51 @@ restore_process_record(struct restorearg *ra)
}
/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record. When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ if (rwa->err == 0) {
+ rwa->err = receive_process_record(rwa, rrd);
+ } else if (rrd->write_buf != NULL) {
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ thread_exit();
+}
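Note the trailing kmem_free: the loop exits while still holding the eos
record, which must be freed like any other before the done/cv handshake.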
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool. There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
@@ -2014,7 +2374,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
int cleanup_fd, uint64_t *action_handlep)
{
int err = 0;
- struct restorearg ra = { 0 };
+ struct receive_arg ra = { 0 };
+ struct receive_writer_arg rwa = { 0 };
int featureflags;
ra.byteswap = drc->drc_byteswap;
@@ -2022,10 +2383,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
- ra.bufsize = SPA_MAXBLOCKSIZE;
- ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
- ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);
+ list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
+ offsetof(struct receive_ign_obj_node, node));
/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2056,48 +2415,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
}
if (*action_handlep == 0) {
- ra.guid_to_ds_map =
+ rwa.guid_to_ds_map =
kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(ra.guid_to_ds_map, guid_compare,
+ avl_create(rwa.guid_to_ds_map, guid_compare,
sizeof (guid_map_entry_t),
offsetof(guid_map_entry_t, avlnode));
err = zfs_onexit_add_cb(minor,
- free_guid_map_onexit, ra.guid_to_ds_map,
+ free_guid_map_onexit, rwa.guid_to_ds_map,
action_handlep);
if (err != 0)
goto out;
} else {
err = zfs_onexit_cb_data(minor, *action_handlep,
- (void **)&ra.guid_to_ds_map);
+ (void **)&rwa.guid_to_ds_map);
if (err != 0)
goto out;
}
- drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+ drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
}
- err = restore_read_payload_and_next_header(&ra, 0, NULL);
- if (err != 0)
+ err = receive_read_payload_and_next_header(&ra, 0, NULL);
+ if (err != 0)
goto out;
- for (;;) {
- void *tmp;
+ (void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa.os = ra.os;
+ rwa.byteswap = drc->drc_byteswap;
+
+ (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc,
+ TS_RUN, minclsyspri);
+ /*
+ * We're reading rwa.err without locks, which is safe since we are the
+ * only reader, and the worker thread is the only writer. It's ok if we
+ * miss a write for an iteration or two of the loop, since the writer
+ * thread will keep freeing records we send it until we send it an eos
+ * marker.
+ *
+ * We can leave this loop in 3 ways: First, if rwa.err is
+ * non-zero. In that case, the writer thread will free the rrd we just
+ * pushed. Second, if we're interrupted; in that case, either it's the
+ * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+ * has been handed off to the writer thread who will free it. Finally,
+ * if receive_read_record fails or we're at the end of the stream, then
+ * we free ra.rrd and exit.
+ */
+ while (rwa.err == 0) {
if (issig(JUSTLOOKING) && issig(FORREAL)) {
err = SET_ERROR(EINTR);
break;
}
- tmp = ra.next_drr;
- ra.next_drr = ra.drr;
- ra.drr = tmp;
+ ASSERT3P(ra.rrd, ==, NULL);
+ ra.rrd = ra.next_rrd;
+ ra.next_rrd = NULL;
+ /* Allocates and loads header into ra.next_rrd */
+ err = receive_read_record(&ra);
- /* process ra.drr, read in ra.next_drr */
- err = restore_process_record(&ra);
- if (err != 0)
- break;
- if (ra.drr->drr_type == DRR_END)
+ if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+ kmem_free(ra.rrd, sizeof (*ra.rrd));
+ ra.rrd = NULL;
break;
+ }
+
+ bqueue_enqueue(&rwa.q, ra.rrd,
+ sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+ ra.rrd = NULL;
+ }
+ if (ra.next_rrd == NULL)
+ ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+ ra.next_rrd->eos_marker = B_TRUE;
+ bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+
+ mutex_enter(&rwa.mutex);
+ while (!rwa.done) {
+ cv_wait(&rwa.cv, &rwa.mutex);
}
+ mutex_exit(&rwa.mutex);
+
+ cv_destroy(&rwa.cv);
+ mutex_destroy(&rwa.mutex);
+ bqueue_destroy(&rwa.q);
+ if (err == 0)
+ err = rwa.err;
out:
if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
@@ -2111,10 +2514,13 @@ out:
dmu_recv_cleanup_ds(drc);
}
- kmem_free(ra.drr, sizeof (*ra.drr));
- kmem_free(ra.buf, ra.bufsize);
- kmem_free(ra.next_drr, sizeof (*ra.next_drr));
*voffp = ra.voff;
+ for (struct receive_ign_obj_node *n =
+ list_remove_head(&ra.ignore_obj_list); n != NULL;
+ n = list_remove_head(&ra.ignore_obj_list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&ra.ignore_obj_list);
return (err);
}
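The reader/writer hand-off described above dmu_recv_stream is easiest to see
in miniature. The following self-contained user-space analog shows the same
moving parts: a blocking queue (unbounded here, unlike bqueue), a writer
thread that drains every record until an eos marker, and a final free of the
eos record itself. Every name in it is illustrative; none of it is ZFS API.

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct rec {
		int eos;	/* end-of-stream marker */
		int payload;
		struct rec *next;
	};

	static struct rec *head, *tail;
	static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;

	static void
	enqueue(struct rec *r)
	{
		pthread_mutex_lock(&qlock);
		if (tail == NULL)
			head = tail = r;
		else {
			tail->next = r;
			tail = r;
		}
		pthread_cond_signal(&qcv);
		pthread_mutex_unlock(&qlock);
	}

	static struct rec *
	dequeue(void)
	{
		pthread_mutex_lock(&qlock);
		while (head == NULL)
			pthread_cond_wait(&qcv, &qlock);
		struct rec *r = head;
		head = r->next;
		if (head == NULL)
			tail = NULL;
		pthread_mutex_unlock(&qlock);
		return (r);
	}

	static void *
	writer(void *arg)
	{
		struct rec *r;

		(void) arg;
		/* Like receive_writer_thread: drain everything until eos. */
		while (!(r = dequeue())->eos) {
			printf("apply record %d\n", r->payload);
			free(r);
		}
		free(r);	/* the eos record is freed too */
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, writer, NULL);
		for (int i = 0; i < 4; i++) {
			struct rec *r = calloc(1, sizeof (*r));
			r->payload = i;
			enqueue(r);
		}
		struct rec *eos = calloc(1, sizeof (*eos));
		eos->eos = 1;
		enqueue(eos);
		pthread_join(tid, NULL);
		return (0);
	}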
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index e246c49..151d04c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -158,7 +158,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we already visited this bp & everything below,
* don't bother doing it again.
*/
- if (zbookmark_is_before(dnp, zb, td->td_resume))
+ if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
return (RESUME_SKIP_ALL);
/*
@@ -425,6 +425,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
int j, err = 0;
zbookmark_phys_t czb;
+ if (td->td_flags & TRAVERSE_PRE) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
@@ -432,10 +443,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
- if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
}
+
+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
return (err);
}
@@ -448,6 +470,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0);
+ if (bp == NULL)
+ return (0);
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index dff9fab..65a017f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, start,
+ FALSE, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock);
if (err) {
@@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks);
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+ FALSE, FALSE, FTAG, &dbuf);
if (err) {
txh->txh_tx->tx_err = err;
break;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index 77100ef..65ce914 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -305,7 +305,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
for (i = 0; i < fetchsz; i++) {
- dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
+ dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
+ ARC_FLAG_PREFETCH);
}
return (fetchsz);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index 5b953fc..0fdcde4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -1116,7 +1116,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
drop_struct_lock = TRUE;
}
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
@@ -1413,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
goto fail;
/* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0)
dbuf_new_size(db, size, tx);
else if (err != ENOENT)
@@ -1586,8 +1586,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
@@ -1624,8 +1624,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (tail) {
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@@ -1854,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
*/
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
+ int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
void *data = NULL;
@@ -1876,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
} else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) {
if (error != ENOENT)
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 0633604..0787885 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
@@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
- i, B_TRUE, FTAG, &subdb));
+ i, TRUE, FALSE, FTAG, &subdb));
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
@@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
- TRUE, FTAG, &db));
+ TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
free_children(db, blkid, nblks, tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index 551e35b..f4fdaf9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 RackTop Systems.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -130,8 +130,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
dsl_dataset_phys(ds)->ds_unique_bytes += used;
- if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
- ds->ds_need_large_blocks = B_TRUE;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+ B_TRUE;
+ }
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -433,19 +435,23 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
offsetof(dmu_sendarg_t, dsa_link));
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
- int zaperr = zap_contains(mos, dsobj,
- DS_FIELD_LARGE_BLOCKS);
- if (zaperr != ENOENT) {
- VERIFY0(zaperr);
- ds->ds_large_blocks = B_TRUE;
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET))
+ continue;
+ err = zap_contains(mos, dsobj,
+ spa_feature_table[f].fi_guid);
+ if (err == 0) {
+ ds->ds_feature_inuse[f] = B_TRUE;
+ } else {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ }
}
}
- if (err == 0) {
- err = dsl_dir_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
- &ds->ds_dir);
- }
+ err = dsl_dir_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
if (err != 0) {
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
@@ -540,6 +546,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
const char *snapname;
uint64_t obj;
int err = 0;
+ dsl_dataset_t *ds;
err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
if (err != 0)
@@ -548,36 +555,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
else
err = SET_ERROR(ENOENT);
/* we may be looking for a snapshot */
if (err == 0 && snapname != NULL) {
- dsl_dataset_t *ds;
+ dsl_dataset_t *snap_ds;
if (*snapname++ != '@') {
- dsl_dataset_rele(*dsp, tag);
+ dsl_dataset_rele(ds, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}
dprintf("looking for snapshot '%s'\n", snapname);
- err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+ err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
- dsl_dataset_rele(*dsp, tag);
+ err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+ dsl_dataset_rele(ds, tag);
if (err == 0) {
- mutex_enter(&ds->ds_lock);
- if (ds->ds_snapname[0] == 0)
- (void) strlcpy(ds->ds_snapname, snapname,
- sizeof (ds->ds_snapname));
- mutex_exit(&ds->ds_lock);
- *dsp = ds;
+ mutex_enter(&snap_ds->ds_lock);
+ if (snap_ds->ds_snapname[0] == 0)
+ (void) strlcpy(snap_ds->ds_snapname, snapname,
+ sizeof (snap_ds->ds_snapname));
+ mutex_exit(&snap_ds->ds_lock);
+ ds = snap_ds;
}
}
-
+ if (err == 0)
+ *dsp = ds;
dsl_dir_rele(dd, FTAG);
return (err);
}
@@ -699,6 +707,34 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
return (gotit);
}
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ spa_feature_incr(spa, f, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (zero), 1, &zero, tx));
+}
+
+void
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+ spa_feature_decr(spa, f, tx);
+}
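The two helpers above are exact mirrors: activation bumps the feature
refcount before recording the ZAP entry under the feature guid, while
deactivation removes the entry before dropping the refcount, so the on-disk
entry never exists without a live refcount backing it.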
+
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx)
@@ -759,8 +795,10 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
- if (origin->ds_large_blocks)
- dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (origin->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
dmu_buf_will_dirty(origin->ds_dbuf, tx);
dsl_dataset_phys(origin)->ds_num_children++;
@@ -1322,8 +1360,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
dmu_buf_rele(dbuf, FTAG);
- if (ds->ds_large_blocks)
- dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
ASSERT3U(ds->ds_prev != 0, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
@@ -1615,9 +1655,13 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
dmu_objset_sync(ds->ds_objset, zio, tx);
- if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
- dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
- ds->ds_large_blocks = B_TRUE;
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_activation_needed[f]) {
+ if (ds->ds_feature_inuse[f])
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f, tx);
+ ds->ds_feature_inuse[f] = B_TRUE;
+ }
}
}
@@ -2781,6 +2825,40 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+ /*
+ * Swap per-dataset feature flags.
+ */
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ ASSERT(!clone->ds_feature_inuse[f]);
+ ASSERT(!origin_head->ds_feature_inuse[f]);
+ continue;
+ }
+
+ boolean_t clone_inuse = clone->ds_feature_inuse[f];
+ boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
+
+ if (clone_inuse) {
+ dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_deactivate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (clone_inuse) {
+ dsl_dataset_activate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_TRUE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_activate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_TRUE;
+ }
+ }
+
dmu_buf_will_dirty(clone->ds_dbuf, tx);
dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
@@ -3335,77 +3413,6 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
-static int
-dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
-{
- const char *dsname = arg;
- dsl_dataset_t *ds;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int error = 0;
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
- return (SET_ERROR(ENOTSUP));
-
- ASSERT(spa_feature_is_enabled(dp->dp_spa,
- SPA_FEATURE_EXTENSIBLE_DATASET));
-
- error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (error != 0)
- return (error);
-
- if (ds->ds_large_blocks)
- error = EALREADY;
- dsl_dataset_rele(ds, FTAG);
-
- return (error);
-}
-
-void
-dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
- uint64_t zero = 0;
-
- spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
- dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
-
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
- sizeof (zero), 1, &zero, tx));
-}
-
-static void
-dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
-{
- const char *dsname = arg;
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
-
- dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
- ASSERT(!ds->ds_large_blocks);
- ds->ds_large_blocks = B_TRUE;
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dataset_activate_large_blocks(const char *dsname)
-{
- int error;
-
- error = dsl_sync_task(dsname,
- dsl_dataset_activate_large_blocks_check,
- dsl_dataset_activate_large_blocks_sync, (void *)dsname,
- 1, ZFS_SPACE_CHECK_RESERVED);
-
- /*
- * EALREADY indicates that this dataset already supports large blocks.
- */
- if (error == EALREADY)
- error = 0;
- return (error);
-}
-
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
@@ -3450,7 +3457,6 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
return (ret);
}
-
void
dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
index 7f90469..c7a623c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2013 by Joyent, Inc. All rights reserved.
*/
@@ -267,9 +267,11 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
obj = ds->ds_object;
- if (ds->ds_large_blocks) {
- ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
}
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
@@ -552,7 +554,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -736,12 +738,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT0(ds->ds_reserved);
}
- if (ds->ds_large_blocks)
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ obj = ds->ds_object;
- dsl_scan_ds_destroyed(ds, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
+ }
- obj = ds->ds_object;
+ dsl_scan_ds_destroyed(ds, tx);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
/* This is a clone */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index d58886b..189ca19 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -415,7 +415,14 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
- arc_flush(dp->dp_spa);
+ /*
+ * We can't set retry to TRUE since we're explicitly specifying
+ * a spa to flush. This is good enough; any missed buffers for
+ * this spa won't cause trouble, and they'll eventually fall
+ * out of the ARC just like any other unused buffer.
+ */
+ arc_flush(dp->dp_spa, FALSE);
+
txg_fini(dp);
dsl_scan_fini(dp);
dmu_buf_user_evict_wait();
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
index d08b5fb..406af3b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -600,7 +600,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
- if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ if (zbookmark_subtree_completed(dnp, zb,
+ &scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
new file mode 100644
index 0000000..8296057
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
@@ -0,0 +1,366 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * Given an object on the list, return a pointer to the multilist_node_t
+ * structure embedded within it.
+ */
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+ return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ * multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ * the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ * when the multilist_insert() function is called, as well as which
+ * sublist to remove from when multilist_remove() is called. The
+ * requirements this function must meet are the following:
+ *
+ * - It must always return the same value when called on the same
+ * object (to ensure the object is removed from the list it was
+ * inserted into).
+ *
+ * - It must return a value in the range [0, number of sublists).
+ * The multilist_get_num_sublists() function may be used to
+ * determine the number of sublists in the multilist.
+ *
+ * Also, in order to reduce internal contention between the sublists
+ * during insertion and removal, this function should choose evenly
+ * between all available sublists when inserting. This isn't a hard
+ * requirement, but a general rule of thumb in order to garner the
+ * best multi-threaded performance out of the data structure.
+ */
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
+ multilist_sublist_index_func_t *index_func)
+{
+ ASSERT3P(ml, !=, NULL);
+ ASSERT3U(size, >, 0);
+ ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+ ASSERT3U(num, >, 0);
+ ASSERT3P(index_func, !=, NULL);
+
+ ml->ml_offset = offset;
+ ml->ml_num_sublists = num;
+ ml->ml_index_func = index_func;
+
+ ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+ ml->ml_num_sublists, KM_SLEEP);
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&mls->mls_list, size, offset);
+ }
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+ ASSERT(multilist_is_empty(ml));
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+ ASSERT(list_is_empty(&mls->mls_list));
+
+ list_destroy(&mls->mls_list);
+ mutex_destroy(&mls->mls_lock);
+ }
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+ kmem_free(ml->ml_sublists,
+ sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+ ml->ml_num_sublists = 0;
+ ml->ml_offset = 0;
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+
+ /*
+ * Note: Callers may already hold the sublist lock by calling
+ * multilist_sublist_lock(). Here we rely on MUTEX_HELD()
+ * returning TRUE if and only if the current thread holds the
+ * lock. While it's a little ugly to make the lock recursive in
+ * this way, it works and allows the calling code to be much
+ * simpler -- otherwise it would have to pass around a flag
+ * indicating that it already has the lock.
+ */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_insert_head(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
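
A hedged sketch of the pre-locked pattern the comment above describes. It assumes the caller knows (or arranges) that ml_index_func maps the object to the sublist index it locked, since multilist_insert() recomputes the index itself.

/*
 * Hypothetical caller that already holds a sublist lock.  Per the
 * comment in multilist_insert(), MUTEX_HELD() lets the insert proceed
 * without re-acquiring the lock.  This only works if ml_index_func
 * maps 'obj' to the sublist at 'idx' -- an assumption made here.
 */
static void
insert_while_locked(multilist_t *ml, void *obj, unsigned int idx)
{
	multilist_sublist_t *mls = multilist_sublist_lock(ml, idx);

	multilist_insert(ml, obj);	/* no deadlock: lock already held */
	multilist_sublist_unlock(mls);
}
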
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+ /* See comment in multilist_insert(). */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_remove(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ /* See comment in multilist_insert(). */
+ boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ if (!list_is_empty(&mls->mls_list)) {
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+
+ return (FALSE);
+ }
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+ }
+
+ return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+ return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+ return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ mutex_enter(&mls->mls_lock);
+
+ return (mls);
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is later called to
+ * remove the object. Specifically, if calling ml_index_func on the
+ * object returns a sublist index different from the one passed as a
+ * parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_head(&mls->mls_list, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ * than the object given as the parameter. This is relied upon in
+ * arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+ void *prev = list_prev(&mls->mls_list, obj);
+
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ ASSERT(!list_is_empty(&mls->mls_list));
+
+ /* 'obj' must be at the head of the list, nothing to do */
+ if (prev == NULL)
+ return;
+
+ list_remove(&mls->mls_list, obj);
+ list_insert_before(&mls->mls_list, prev, obj);
+}
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_remove(&mls->mls_list, obj);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+ list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+ return (list_link_active(link));
+}
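
To make the consumer contract concrete, here is a minimal, hypothetical sketch of using the API above. The obj_t type and obj_index_func() are illustrative, not part of this change; the index function keys off a stable object field, satisfying the stability and range requirements documented above multilist_create().

typedef struct obj {
	uint64_t o_id;
	multilist_node_t o_node;	/* embedded linkage */
} obj_t;

static unsigned int
obj_index_func(multilist_t *ml, void *arg)
{
	obj_t *o = arg;

	/* Stable per object and always within [0, num sublists). */
	return (o->o_id % multilist_get_num_sublists(ml));
}

static void
obj_multilist_example(void)
{
	multilist_t ml;
	obj_t o = { .o_id = 42 };

	multilist_create(&ml, sizeof (obj_t), offsetof(obj_t, o_node),
	    4, obj_index_func);
	multilist_link_init(&o.o_node);

	multilist_insert(&ml, &o);	/* lands in sublist 42 % 4 == 2 */
	multilist_remove(&ml, &o);	/* recomputes the same index */

	multilist_destroy(&ml);
}
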
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 40efaba..a5389c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -1943,7 +1943,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
/*
* Note: normally this routine will not be called if
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index aeac124..1ea829f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -80,8 +80,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
mutex_exit(sm->sm_lock);
if (end > bufsize) {
- dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize,
- end - bufsize);
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+ end - bufsize, ZIO_PRIORITY_SYNC_READ);
}
mutex_enter(sm->sm_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index 4d13cb1..a26d8f8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -37,6 +37,12 @@ extern "C" {
#include <sys/dmu.h>
#include <sys/spa.h>
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define ARC_EVICT_ALL -1ULL
+
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
@@ -154,7 +160,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
boolean_t arc_clear_callback(arc_buf_t *buf);
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
new file mode 100644
index 0000000..63722df
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _BQUEUE_H
+#define _BQUEUE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+typedef struct bqueue {
+ list_t bq_list;
+ kmutex_t bq_lock;
+ kcondvar_t bq_add_cv;
+ kcondvar_t bq_pop_cv;
+ uint64_t bq_size;
+ uint64_t bq_maxsize;
+ size_t bq_node_offset;
+} bqueue_t;
+
+typedef struct bqueue_node {
+ list_node_t bqn_node;
+ uint64_t bqn_size;
+} bqueue_node_t;
+
+
+int bqueue_init(bqueue_t *, uint64_t, size_t);
+void bqueue_destroy(bqueue_t *);
+void bqueue_enqueue(bqueue_t *, void *, uint64_t);
+void *bqueue_dequeue(bqueue_t *);
+boolean_t bqueue_empty(bqueue_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BQUEUE_H */
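
The header alone does not spell out the queue's behavior, but the bq_maxsize and condition-variable fields suggest a bounded, blocking producer/consumer queue. A hedged usage sketch, in which item_t, the meaning of bqueue_init()'s second argument (a size limit), and the per-enqueue size charge are all assumptions:

typedef struct item {
	bqueue_node_t i_node;	/* linkage used by the queue */
	uint64_t i_payload;
} item_t;

static void
bqueue_example(void)
{
	bqueue_t q;
	item_t *it;

	/* Assumed: the second argument caps the total queued "size". */
	VERIFY0(bqueue_init(&q, 16, offsetof(item_t, i_node)));

	it = kmem_zalloc(sizeof (item_t), KM_SLEEP);
	it->i_payload = 7;
	bqueue_enqueue(&q, it, 1);	/* charge one unit of size */

	it = bqueue_dequeue(&q);	/* presumably blocks while empty */
	kmem_free(it, sizeof (item_t));

	ASSERT(bqueue_empty(&q));
	bqueue_destroy(&q);
}
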
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index 2e07185..482ccb0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -245,8 +245,7 @@ typedef struct dbuf_hash_table {
kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;
-
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
@@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 3c5cfbe..f6c72b0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -45,6 +45,7 @@
#include <sys/zfs_context.h>
#include <sys/cred.h>
#include <sys/fs/zfs.h>
+#include <sys/zio_priority.h>
#ifdef __cplusplus
extern "C" {
@@ -748,8 +749,8 @@ extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
*/
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t len);
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, enum zio_priority pri);
typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index 7d490ec..001bff5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -38,6 +38,7 @@
#include <sys/zfs_context.h>
#include <sys/dsl_deadlist.h>
#include <sys/refcount.h>
+#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@@ -145,8 +146,6 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
- boolean_t ds_large_blocks;
- boolean_t ds_need_large_blocks;
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
@@ -185,6 +184,18 @@ typedef struct dsl_dataset {
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
+ /*
+ * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+ * uses this feature.
+ */
+ uint8_t ds_feature_inuse[SPA_FEATURES];
+
+ /*
+ * Set if we need to activate the feature on this dataset this txg
+ * (used only in syncing context).
+ */
+ uint8_t ds_feature_activation_needed[SPA_FEATURES];
+
/* Protected by ds_lock; keep at end of struct for better locality */
char ds_snapname[MAXNAMELEN];
} dsl_dataset_t;
@@ -264,8 +275,6 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
-int dsl_dataset_activate_large_blocks(const char *dsname);
-void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
@@ -305,6 +314,9 @@ void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
+void dsl_dataset_deactivate_feature(uint64_t dsobj,
+ spa_feature_t f, dmu_tx_t *tx);
+
#ifdef ZFS_DEBUG
#define dprintf_ds(ds, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
new file mode 100644
index 0000000..5ebb7fe
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_MULTILIST_H
+#define _SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+ /*
+ * The mutex used internally to implement thread safe insertions
+ * and removals to this individual sublist. It can also be locked
+ * by a consumer using multilist_sublist_{lock,unlock}, which is
+ * useful if a consumer needs to traverse the list in a thread
+ * safe manner.
+ */
+ kmutex_t mls_lock;
+ /*
+ * The actual list object containing all objects in this sublist.
+ */
+ list_t mls_list;
+ /*
+ * Pad to cache line (64 bytes), in an effort to prevent
+ * cache line contention.
+ */
+ uint8_t mls_pad[24];
+};
+
+struct multilist {
+ /*
+ * This is used to get to the multilist_node_t structure given
+ * the void *object contained on the list.
+ */
+ size_t ml_offset;
+ /*
+ * The number of sublists used internally by this multilist.
+ */
+ uint64_t ml_num_sublists;
+ /*
+ * The array of pointers to the actual sublists.
+ */
+ multilist_sublist_t *ml_sublists;
+ /*
+ * Pointer to function which determines the sublist to use
+ * when inserting and removing objects from this multilist.
+ * Please see the comment above multilist_create for details.
+ */
+ multilist_sublist_index_func_t *ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+void multilist_create(multilist_t *, size_t, size_t, unsigned int,
+ multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int multilist_link_active(multilist_node_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
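
As the comment on mls_lock notes, a consumer can lock individual sublists to traverse safely. A small sketch of a full walk using only the functions declared above:

/* Hypothetical thread-safe walk over every object in a multilist. */
static void
multilist_walk_example(multilist_t *ml, void (*cb)(void *))
{
	for (unsigned int i = 0; i < multilist_get_num_sublists(ml); i++) {
		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);

		for (void *obj = multilist_sublist_head(mls); obj != NULL;
		    obj = multilist_sublist_next(mls, obj))
			cb(obj);

		multilist_sublist_unlock(mls);
	}
}
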
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 36739cd..342c9cd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -29,6 +29,7 @@
#ifndef _ZIO_H
#define _ZIO_H
+#include <sys/zio_priority.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/txg.h>
@@ -144,18 +145,6 @@ enum zio_compress {
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
-typedef enum zio_priority {
- ZIO_PRIORITY_SYNC_READ,
- ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
- ZIO_PRIORITY_ASYNC_READ, /* prefetch */
- ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
- ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
- ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
- ZIO_PRIORITY_NUM_QUEUEABLE,
-
- ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */
-} zio_priority_t;
-
enum zio_flag {
/*
* Flags inherited by gang, ddt, and vdev children,
@@ -260,6 +249,7 @@ extern const char *zio_type_name[ZIO_TYPES];
* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
*
* Note: this structure is called a bookmark because its original purpose
* was to remember where to resume a pool-wide traverse.
@@ -292,6 +282,9 @@ typedef struct zbookmark_phys {
#define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL)
+#define ZB_DNODE_LEVEL (-3LL)
+#define ZB_DNODE_BLKID (0ULL)
+
#define ZB_IS_ZERO(zb) \
((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
@@ -633,8 +626,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
extern void spa_handle_ignored_writes(spa_t *spa);
/* zbookmark_phys functions */
-boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
- const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+ uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
index a921a2f..0c293ab 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -44,7 +44,7 @@ typedef struct zio_checksum_info {
zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
int ci_correctable; /* number of correctable bits */
int ci_eck; /* uses zio embedded checksum? */
- int ci_dedup; /* strong enough for dedup? */
+ boolean_t ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
new file mode 100644
index 0000000..32e90e2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+#ifndef _ZIO_PRIORITY_H
+#define _ZIO_PRIORITY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum zio_priority {
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
+ ZIO_PRIORITY_ASYNC_READ, /* prefetch */
+ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
+ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
+ ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
+ ZIO_PRIORITY_NUM_QUEUEABLE,
+
+ ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_PRIORITY_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index 36969e8..44919d2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied);
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
}
/*
@@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
bs = FZAP_BLOCK_SHIFT(zap);
- dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ ZIO_PRIORITY_SYNC_READ);
}
/*
@@ -1310,9 +1312,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} else {
int b;
- dmu_prefetch(zap->zap_objset, zap->zap_object,
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
- zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs);
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
b++) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
index 7540320..80a3f0b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -245,7 +245,7 @@ feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
{
int err;
uint64_t refcount;
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
/*
@@ -296,7 +296,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
dmu_tx_t *tx)
{
ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
@@ -322,7 +322,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
if (refcount == 0)
spa_deactivate_mos_feature(spa, feature->fi_guid);
- else if (feature->fi_mos)
+ else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
spa_activate_mos_feature(spa, feature->fi_guid, tx);
}
@@ -333,8 +333,9 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
void
feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
{
- uint64_t initial_refcount = feature->fi_activate_on_enable ? 1 : 0;
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t initial_refcount =
+ (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
ASSERT(0 != zapobj);
@@ -379,7 +380,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
{
uint64_t refcount;
zfeature_info_t *feature = &spa_feature_table[fid];
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
ASSERT(VALID_FEATURE_FID(fid));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index c2dd020..693ba41 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/*
@@ -1149,10 +1150,11 @@ zfsctl_shares_lookup(ap)
ZFS_EXIT(zfsvfs);
return (SET_ERROR(ENOTSUP));
}
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp);
+ VN_RELE(ZTOV(dzp));
+ }
- VN_RELE(ZTOV(dzp));
ZFS_EXIT(zfsvfs);
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 2a583d4..2e51916 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -5190,6 +5190,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
if ((error = get_nvlist(zc->zc_nvlist_src,
zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 8a08c8d..ed56d17 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -950,7 +950,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
&sa_obj);
if (error)
- return (error);
+ goto out;
} else {
/*
* Pre SA versions file systems should never touch
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 1038a87..45a2bd7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -2675,7 +2675,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
/* Prefetch znode */
if (prefetch)
- dmu_prefetch(os, objnum, 0, 0);
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
skip_entry:
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 48de571..867b798 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -94,6 +94,9 @@ extern vmem_t *zio_alloc_arena;
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
/*
* The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
@@ -3461,37 +3464,127 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_done
};
-/* dnp is the dnode for zb1->zb_object */
-boolean_t
-zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
- const zbookmark_phys_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb2->zb_level == 0);
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (a value larger than any level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+ const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+ /*
+ * These variables represent the "equivalent" values for the zbookmark,
+ * after converting zbookmarks inside the meta dnode to their
+ * normal-object equivalents.
+ */
+ uint64_t zb1obj, zb2obj;
+ uint64_t zb1L0, zb2L0;
+ uint64_t zb1level, zb2level;
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+ if (zb1->zb_object == zb2->zb_object &&
+ zb1->zb_level == zb2->zb_level &&
+ zb1->zb_blkid == zb2->zb_blkid)
+ return (0);
+
+ /*
+ * BP_SPANB calculates the span in blocks.
+ */
+ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+ zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
+ zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb1L0 = 0;
+ zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb1obj = zb1->zb_object;
+ zb1level = zb1->zb_level;
}
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+ zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb2L0 = 0;
+ zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb2obj = zb2->zb_object;
+ zb2level = zb2->zb_level;
+ }
+
+ /* Now that we have a canonical representation, do the comparison. */
+ if (zb1obj != zb2obj)
+ return (zb1obj < zb2obj ? -1 : 1);
+ else if (zb1L0 != zb2L0)
+ return (zb1L0 < zb2L0 ? -1 : 1);
+ else if (zb1level != zb2level)
+ return (zb1level > zb2level ? -1 : 1);
+ /*
+ * This can (theoretically) happen if the bookmarks have the same object
+ * and level, but different blkids, if the block sizes are not the same.
+ * There is presently no way to change the indirect block sizes.
+ */
+ return (0);
+}
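
A worked instance of the canonicalization, under assumed but typical constants (SPA_BLKPTRSHIFT of 7, i.e. 128-byte block pointers, and 16K indirect blocks, i.e. an indblkshift of 14):

/*
 * BP_SPANB(14, 1) == 1 << (1 * (14 - 7)) == 128, so a level-1 bookmark
 * with blkid 2 covers L0 blkids [256, 383] and canonicalizes to an L0
 * equivalent of 2 * 128 == 256.  It therefore sorts after the level-0
 * bookmark for blkid 255, and -- because the larger level wins when L0
 * equivalents tie -- before the level-0 bookmark for blkid 256, giving
 * the pre-order "parent before children" ordering.
 */
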
+
+/*
+ * This function answers the following question: given that last_block is
+ * the place where our traversal stopped last time, does that guarantee
+ * that we've visited every node under subtree_root? Because a subtree's
+ * children follow their parent in pre-order, subtree_root itself can
+ * compare before last_block while some of its children do not, so we
+ * can't just use the raw output of zbookmark_compare. We have to pass in
+ * a modified version of subtree_root; by incrementing the block id, and
+ * then checking whether last_block is before or equal to that, we can
+ * tell whether or not having visited last_block implies that all of
+ * subtree_root's children have been visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ zbookmark_phys_t mod_zb = *subtree_root;
+ mod_zb.zb_blkid++;
+ ASSERT(last_block->zb_level == 0);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
+
+ /*
+ * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+ * data block size in sectors, because that variable is only used if
+ * the bookmark refers to a block in the meta-dnode. Since we don't
+ * know without examining it what object it refers to, and there's no
+ * harm in passing in this value in other cases, we always pass it in.
+ *
+ * We pass in 0 for the indirect block size shift because zb2 must be
+ * level 0. The indirect block size is only used to calculate the span
+ * of the bookmark, but since the bookmark must be level 0, the span is
+ * always 1, so the math works out.
+ *
+ * If you make changes to how the zbookmark_compare code works, be sure
+ * to make sure that this code still works afterwards.
+ */
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+ last_block) <= 0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
index 991a0a3..0a7f4e4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -438,7 +438,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
* fault injection isn't a performance critical path.
*/
if (flags & ZINJECT_FLUSH_ARC)
- arc_flush(NULL);
+ /*
+ * We must use FALSE to ensure arc_flush returns, since
+ * we're not preventing concurrent ARC insertions.
+ */
+ arc_flush(NULL, FALSE);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 55de1b4..2c90810 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -358,7 +358,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zvol_extent_t *ze;
int bs = ma->ma_zv->zv_volblocksize;
- if (BP_IS_HOLE(bp) ||
+ if (bp == NULL || BP_IS_HOLE(bp) ||
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0);
diff --git a/sys/compat/cloudabi/cloudabi_proc.c b/sys/compat/cloudabi/cloudabi_proc.c
index 9c735fa..d917337 100644
--- a/sys/compat/cloudabi/cloudabi_proc.c
+++ b/sys/compat/cloudabi/cloudabi_proc.c
@@ -46,14 +46,19 @@ cloudabi_sys_proc_exec(struct thread *td,
struct cloudabi_sys_proc_exec_args *uap)
{
struct image_args args;
+ struct vmspace *oldvmspace;
int error;
+ error = pre_execve(td, &oldvmspace);
+ if (error != 0)
+ return (error);
error = exec_copyin_data_fds(td, &args, uap->data, uap->datalen,
uap->fds, uap->fdslen);
if (error == 0) {
args.fd = uap->fd;
error = kern_execve(td, &args, NULL);
}
+ post_execve(td, error, oldvmspace);
return (error);
}
diff --git a/sys/conf/Makefile.arm b/sys/conf/Makefile.arm
index 86b11c6..af5f7da 100644
--- a/sys/conf/Makefile.arm
+++ b/sys/conf/Makefile.arm
@@ -66,10 +66,6 @@ SYSTEM_LD_TAIL +=;sed s/" + SIZEOF_HEADERS"// ldscript.$M\
${KERNEL_KO}.bin; \
rm ${FULLKERNEL}.noheader
-.if defined(MFS_IMAGE)
-SYSTEM_LD_TAIL += ;sh ${S}/tools/embed_mfs.sh ${KERNEL_KO}.bin ${MFS_IMAGE};
-.endif
-
FILES_CPU_FUNC = \
$S/$M/$M/cpufunc_asm_arm9.S \
$S/$M/$M/cpufunc_asm_arm10.S \
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index b0619cb..7bc2048 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -2981,9 +2981,10 @@ options MAXFILES=999
# Random number generator
# Only ONE of the below two may be used; they are mutually exclusive.
-# If neither is present, then the Fortuna algorithm is used.
-options RANDOM_YARROW # Yarrow CSPRNG (old default)
-#options RANDOM_DUMMY # Dummy CSPRNG that always blocks
+# If neither is present, then the Fortuna algorithm is selected.
+#options RANDOM_YARROW # Yarrow CSPRNG (old default)
+#options RANDOM_LOADABLE # Allow the algorithm to be loaded as
+ # a module.
# For developers.
options RANDOM_DEBUG # Extra debugging messages
diff --git a/sys/conf/files b/sys/conf/files
index 531647f..dfe9763 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -83,7 +83,7 @@ cam/ctl/ctl_backend_ramdisk.c optional ctl
cam/ctl/ctl_cmd_table.c optional ctl
cam/ctl/ctl_frontend.c optional ctl
cam/ctl/ctl_frontend_cam_sim.c optional ctl
-cam/ctl/ctl_frontend_internal.c optional ctl
+cam/ctl/ctl_frontend_ioctl.c optional ctl
cam/ctl/ctl_frontend_iscsi.c optional ctl
cam/ctl/ctl_scsi_all.c optional ctl
cam/ctl/ctl_tpc.c optional ctl
@@ -145,6 +145,7 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with
cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
@@ -174,6 +175,7 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "$
cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}"
@@ -548,14 +550,14 @@ crypto/des/des_ecb.c optional crypto | ipsec | netsmb
crypto/des/des_setkey.c optional crypto | ipsec | netsmb
crypto/rc4/rc4.c optional netgraph_mppc_encryption | kgssapi
crypto/rijndael/rijndael-alg-fst.c optional crypto | geom_bde | \
- ipsec | random random_yarrow | random !random_yarrow !random_dummy | wlan_ccmp
-crypto/rijndael/rijndael-api-fst.c optional geom_bde | random random_yarrow | random !random_yarrow !random_dummy
+ ipsec | random !random_loadable | wlan_ccmp
+crypto/rijndael/rijndael-api-fst.c optional geom_bde | random !random_loadable
crypto/rijndael/rijndael-api.c optional crypto | ipsec | wlan_ccmp
crypto/sha1.c optional carp | crypto | ipsec | \
netgraph_mppc_encryption | sctp
-crypto/sha2/sha2.c optional crypto | geom_bde | ipsec | random random_yarrow | random !random_yarrow !random_dummy | \
+crypto/sha2/sha2.c optional crypto | geom_bde | ipsec | random !random_loadable | \
sctp | zfs
-crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | random random_yarrow | random !random_yarrow !random_dummy | \
+crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | random !random_loadable | \
sctp | zfs
crypto/siphash/siphash.c optional inet | inet6
crypto/siphash/siphash_test.c optional inet | inet6
@@ -2312,12 +2314,14 @@ rt2860.fw optional rt2860fw | ralfw \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rt2860.fw"
-dev/random/randomdev_none.c optional !random
-dev/random/randomdev.c optional random
-dev/random/random_harvestq.c optional random random_yarrow | random !random_dummy
+dev/random/random_infra.c optional random
+dev/random/random_harvestq.c optional random
+dev/random/randomdev.c optional random random_yarrow | \
+ random !random_yarrow !random_loadable
dev/random/yarrow.c optional random random_yarrow
-dev/random/fortuna.c optional random !random_yarrow !random_dummy
-dev/random/hash.c optional random random_yarrow | random !random_dummy
+dev/random/fortuna.c optional random !random_yarrow !random_loadable
+dev/random/hash.c optional random random_yarrow | \
+ random !random_yarrow !random_loadable
dev/rc/rc.c optional rc
dev/re/if_re.c optional re
dev/rl/if_rl.c optional rl pci
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 2ffe102..8451e00 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -40,7 +40,7 @@ ia32_genassym.o standard \
#
ia32_assym.h standard \
dependency "$S/kern/genassym.sh ia32_genassym.o" \
- compile-with "env NM='${NM}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \
+ compile-with "env NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \
no-obj no-implicit-rule before-depend \
clean "ia32_assym.h"
#
diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk
index 28ea453..137e72c 100644
--- a/sys/conf/kern.post.mk
+++ b/sys/conf/kern.post.mk
@@ -121,7 +121,7 @@ gdbinit:
.endif
.endif
-${FULLKERNEL}: ${SYSTEM_DEP} vers.o ${MFS_IMAGE}
+${FULLKERNEL}: ${SYSTEM_DEP} vers.o
@rm -f ${.TARGET}
@echo linking ${.TARGET}
${SYSTEM_LD}
@@ -133,9 +133,6 @@ ${FULLKERNEL}: ${SYSTEM_DEP} vers.o ${MFS_IMAGE}
${OBJCOPY} --strip-debug ${.TARGET}
.endif
${SYSTEM_LD_TAIL}
-.if defined(MFS_IMAGE)
- sh ${S}/tools/embed_mfs.sh ${FULLKERNEL} ${MFS_IMAGE}
-.endif
.if !exists(${.OBJDIR}/.depend)
${SYSTEM_OBJS}: assym.s vnode_if.h ${BEFORE_DEPEND:M*.h} ${MFILES:T:S/.m$/.h/}
@@ -177,7 +174,7 @@ hack.So: Makefile
./assym.s: assym.s
assym.s: $S/kern/genassym.sh genassym.o
- NM='${NM}' sh $S/kern/genassym.sh genassym.o > ${.TARGET}
+ NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh genassym.o > ${.TARGET}
genassym.o: $S/$M/$M/genassym.c
${CC} -c ${CFLAGS:N-fno-common} $S/$M/$M/genassym.c
@@ -301,6 +298,27 @@ vnode_if_newproto.h:
vnode_if_typedef.h:
${AWK} -f $S/tools/vnode_if.awk $S/kern/vnode_if.src -q
+.if ${MFS_IMAGE:Uno} != "no"
+# Generate an object file from the file system image to embed in the kernel
+# via linking. Make sure the contents are in the mfs section and rename the
+# start/end/size variables to __start_mfs, __stop_mfs, and mfs_size,
+# respectively.
+embedfs_${MFS_IMAGE:T:R}.o: ${MFS_IMAGE}
+ ${OBJCOPY} --input-target binary \
+ --output-target ${EMBEDFS_FORMAT.${MACHINE_ARCH}} \
+ --binary-architecture ${EMBEDFS_ARCH.${MACHINE_ARCH}} \
+ ${MFS_IMAGE} ${.TARGET}
+ ${OBJCOPY} \
+ --rename-section .data=mfs,contents,alloc,load,readonly,data \
+ --redefine-sym \
+ _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_size=__mfs_root_size \
+ --redefine-sym \
+ _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_start=mfs_root \
+ --redefine-sym \
+ _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_end=mfs_root_end \
+ ${.TARGET}
+.endif
+
# XXX strictly, everything depends on Makefile because changes to ${PROF}
# only appear there, but we don't handle that.
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk
index cf1b127..3783881 100644
--- a/sys/conf/kern.pre.mk
+++ b/sys/conf/kern.pre.mk
@@ -191,6 +191,9 @@ SYSTEM_DEP= Makefile ${SYSTEM_OBJS}
SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS}
SYSTEM_OBJS+= ${SYSTEM_CFILES:.c=.o}
SYSTEM_OBJS+= hack.So
+.if ${MFS_IMAGE:Uno} != "no"
+SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o
+.endif
SYSTEM_LD= @${LD} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} --no-warn-mismatch \
--warn-common --export-dynamic --dynamic-linker /red/herring \
-o ${.TARGET} -X ${SYSTEM_OBJS} vers.o
@@ -222,6 +225,32 @@ MKMODULESENV+= DEBUG_FLAGS="${DEBUG}"
.endif
MKMODULESENV+= _MPATH="${_MPATH}"
+# Architecture and output format arguments for objcopy to convert image to
+# object file
+.if ${MFS_IMAGE:Uno} != "no"
+
+.if !defined(EMBEDFS_FORMAT.${MACHINE_ARCH})
+EMBEDFS_FORMAT.${MACHINE_ARCH}!= awk -F'"' '/OUTPUT_FORMAT/ {print $$2}' ${LDSCRIPT}
+.if empty(EMBEDFS_FORMAT.${MACHINE_ARCH})
+.undef EMBEDFS_FORMAT.${MACHINE_ARCH}
+.endif
+.endif
+
+.if !defined(EMBEDFS_ARCH.${MACHINE_ARCH})
+EMBEDFS_ARCH.${MACHINE_ARCH}!= sed -n '/OUTPUT_ARCH/s/.*(\(.*\)).*/\1/p' ${LDSCRIPT}
+.if empty(EMBEDFS_ARCH.${MACHINE_ARCH})
+.undef EMBEDFS_ARCH.${MACHINE_ARCH}
+.endif
+.endif
+
+EMBEDFS_FORMAT.arm?= elf32-littlearm
+EMBEDFS_FORMAT.armv6?= elf32-littlearm
+EMBEDFS_FORMAT.mips?= elf32-tradbigmips
+EMBEDFS_FORMAT.mipsel?= elf32-tradlittlemips
+EMBEDFS_FORMAT.mips64?= elf64-tradbigmips
+EMBEDFS_FORMAT.mips64el?= elf64-tradlittlemips
+.endif
+
# Detect kernel config options that force stack frames to be turned on.
DDB_ENABLED!= grep DDB opt_ddb.h || true ; echo
DTR_ENABLED!= grep KDTRACE_FRAME opt_kdtrace.h || true ; echo
diff --git a/sys/conf/options b/sys/conf/options
index bf6c4a6..30bbc53 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -711,6 +711,7 @@ DEV_PCI opt_pci.h
DEV_PF opt_pf.h
DEV_PFLOG opt_pf.h
DEV_PFSYNC opt_pf.h
+DEV_RANDOM opt_global.h
DEV_SPLASH opt_splash.h
DEV_VLAN opt_vlan.h
@@ -946,13 +947,14 @@ RCTL opt_global.h
# The DEBUG option is in global.h as the random harvesting
# puts probes all over the place, and it makes little sense
# to pollute these headers with an extra include.
-# the DUMMY option is in global.h because it is used to
-# turn off harvesting all over the kernel.
-RANDOM_DEBUG opt_global.h
+RANDOM_DEBUG opt_random.h
# Which CSPRNG hashes we get.
-# These are mutually exclusive. With neither, Fortuna is selected.
-RANDOM_DUMMY opt_global.h
+# If Yarrow is not chosen, Fortuna is selected.
RANDOM_YARROW opt_random.h
+# With this, no entropy processor is loaded, but the entropy
+# harvesting infrastructure is present. This means an entropy
+# processor may be loaded as a module.
+RANDOM_LOADABLE opt_random.h
# Intel em(4) driver
EM_MULTIQUEUE opt_em.h
diff --git a/sys/contrib/libnv/nv_impl.h b/sys/contrib/libnv/nv_impl.h
index 7928431..b50bdf7 100644
--- a/sys/contrib/libnv/nv_impl.h
+++ b/sys/contrib/libnv/nv_impl.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -39,12 +40,14 @@ struct nvpair;
typedef struct nvpair nvpair_t;
#endif
+#define NV_TYPE_NVLIST_ARRAY_NEXT 254
#define NV_TYPE_NVLIST_UP 255
#define NV_TYPE_FIRST NV_TYPE_NULL
-#define NV_TYPE_LAST NV_TYPE_BINARY
+#define NV_TYPE_LAST NV_TYPE_DESCRIPTOR_ARRAY
-#define NV_FLAG_BIG_ENDIAN 0x80
+#define NV_FLAG_BIG_ENDIAN 0x080
+#define NV_FLAG_IN_ARRAY 0x100
#ifdef _KERNEL
#define nv_malloc(size) malloc((size), M_NVLIST, M_WAITOK)
@@ -86,6 +89,7 @@ typedef struct nvpair nvpair_t;
int *nvlist_descriptors(const nvlist_t *nvl, size_t *nitemsp);
size_t nvlist_ndescriptors(const nvlist_t *nvl);
+void nvlist_set_flags(nvlist_t *nvl, int flags);
nvpair_t *nvlist_first_nvpair(const nvlist_t *nvl);
nvpair_t *nvlist_next_nvpair(const nvlist_t *nvl, const nvpair_t *nvp);
@@ -96,6 +100,7 @@ void nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp);
bool nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp);
void nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent);
+void nvlist_set_array_next(nvlist_t *nvl, nvpair_t *ele);
const nvpair_t *nvlist_get_nvpair(const nvlist_t *nvl, const char *name);
@@ -120,18 +125,33 @@ nvpair_t *nvpair_create_stringv(const char *name, const char *valuefmt, va_list
nvpair_t *nvpair_create_nvlist(const char *name, const nvlist_t *value);
nvpair_t *nvpair_create_descriptor(const char *name, int value);
nvpair_t *nvpair_create_binary(const char *name, const void *value, size_t size);
+nvpair_t *nvpair_create_bool_array(const char *name, const bool *value, size_t nitems);
+nvpair_t *nvpair_create_number_array(const char *name, const uint64_t *value, size_t nitems);
+nvpair_t *nvpair_create_string_array(const char *name, const char * const *value, size_t nitems);
+nvpair_t *nvpair_create_nvlist_array(const char *name, const nvlist_t * const *value, size_t nitems);
+nvpair_t *nvpair_create_descriptor_array(const char *name, const int *value, size_t nitems);
nvpair_t *nvpair_move_string(const char *name, char *value);
nvpair_t *nvpair_move_nvlist(const char *name, nvlist_t *value);
nvpair_t *nvpair_move_descriptor(const char *name, int value);
nvpair_t *nvpair_move_binary(const char *name, void *value, size_t size);
-
-bool nvpair_get_bool(const nvpair_t *nvp);
-uint64_t nvpair_get_number(const nvpair_t *nvp);
-const char *nvpair_get_string(const nvpair_t *nvp);
-const nvlist_t *nvpair_get_nvlist(const nvpair_t *nvp);
-int nvpair_get_descriptor(const nvpair_t *nvp);
-const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep);
+nvpair_t *nvpair_move_bool_array(const char *name, bool *value, size_t nitems);
+nvpair_t *nvpair_move_nvlist_array(const char *name, nvlist_t **value, size_t nitems);
+nvpair_t *nvpair_move_descriptor_array(const char *name, int *value, size_t nitems);
+nvpair_t *nvpair_move_number_array(const char *name, uint64_t *value, size_t nitems);
+nvpair_t *nvpair_move_string_array(const char *name, char **value, size_t nitems);
+
+bool nvpair_get_bool(const nvpair_t *nvp);
+uint64_t nvpair_get_number(const nvpair_t *nvp);
+const char *nvpair_get_string(const nvpair_t *nvp);
+const nvlist_t *nvpair_get_nvlist(const nvpair_t *nvp);
+int nvpair_get_descriptor(const nvpair_t *nvp);
+const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep);
+const bool *nvpair_get_bool_array(const nvpair_t *nvp, size_t *nitemsp);
+const uint64_t *nvpair_get_number_array(const nvpair_t *nvp, size_t *nitemsp);
+const char * const *nvpair_get_string_array(const nvpair_t *nvp, size_t *nitemsp);
+const nvlist_t * const *nvpair_get_nvlist_array(const nvpair_t *nvp, size_t *nitemsp);
+const int *nvpair_get_descriptor_array(const nvpair_t *nvp, size_t *nitemsp);
void nvpair_free(nvpair_t *nvp);
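
A hedged sketch of the new array support, using only functions declared above. Error handling is elided, and the ownership conventions (the "create" variant copying the caller's array, nvlist_move_nvpair() taking over the pair) are assumptions based on the existing create/move naming:

static void
nvpair_array_example(void)
{
	uint64_t numbers[3] = { 1, 2, 3 };
	const uint64_t *out;
	size_t nitems;
	nvlist_t *nvl;

	nvl = nvlist_create(0);

	/* Assumed: the "create" variant copies the caller's array. */
	nvlist_move_nvpair(nvl,
	    nvpair_create_number_array("numbers", numbers, 3));

	out = nvpair_get_number_array(
	    nvlist_get_nvpair(nvl, "numbers"), &nitems);
	PJDLOG_ASSERT(nitems == 3 && out[2] == 3);

	nvlist_destroy(nvl);
}
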
diff --git a/sys/contrib/libnv/nvlist.c b/sys/contrib/libnv/nvlist.c
index edcd074..cf8281e 100644
--- a/sys/contrib/libnv/nvlist.c
+++ b/sys/contrib/libnv/nvlist.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -88,7 +89,7 @@ __FBSDID("$FreeBSD$");
#endif
#endif
-#define NV_FLAG_PRIVATE_MASK (NV_FLAG_BIG_ENDIAN)
+#define NV_FLAG_PRIVATE_MASK (NV_FLAG_BIG_ENDIAN | NV_FLAG_IN_ARRAY)
#define NV_FLAG_PUBLIC_MASK (NV_FLAG_IGNORE_CASE | NV_FLAG_NO_UNIQUE)
#define NV_FLAG_ALL_MASK (NV_FLAG_PRIVATE_MASK | NV_FLAG_PUBLIC_MASK)
@@ -98,6 +99,7 @@ struct nvlist {
int nvl_error;
int nvl_flags;
nvpair_t *nvl_parent;
+ nvpair_t *nvl_array_next;
struct nvl_head nvl_head;
};
@@ -135,6 +137,7 @@ nvlist_create(int flags)
nvl->nvl_error = 0;
nvl->nvl_flags = flags;
nvl->nvl_parent = NULL;
+ nvl->nvl_array_next = NULL;
TAILQ_INIT(&nvl->nvl_head);
nvl->nvl_magic = NVLIST_MAGIC;
@@ -157,6 +160,10 @@ nvlist_destroy(nvlist_t *nvl)
nvlist_remove_nvpair(nvl, nvp);
nvpair_free(nvp);
}
+ if (nvl->nvl_array_next != NULL)
+ nvpair_free_structure(nvl->nvl_array_next);
+ nvl->nvl_array_next = NULL;
+ nvl->nvl_parent = NULL;
nvl->nvl_magic = 0;
nv_free(nvl);
@@ -223,6 +230,59 @@ nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent)
nvl->nvl_parent = parent;
}
+void
+nvlist_set_array_next(nvlist_t *nvl, nvpair_t *ele)
+{
+
+ NVLIST_ASSERT(nvl);
+
+ if (ele != NULL)
+ nvl->nvl_flags |= NV_FLAG_IN_ARRAY;
+ else
+ nvl->nvl_flags &= ~NV_FLAG_IN_ARRAY;
+
+ nvl->nvl_array_next = ele;
+}
+
+bool
+nvlist_in_array(const nvlist_t *nvl)
+{
+
+ NVLIST_ASSERT(nvl);
+
+ return ((nvl->nvl_flags & NV_FLAG_IN_ARRAY) != 0);
+}
+
+const nvlist_t *
+nvlist_get_array_next(const nvlist_t *nvl)
+{
+ nvpair_t *nvp;
+
+ NVLIST_ASSERT(nvl);
+
+ nvp = nvl->nvl_array_next;
+ if (nvp == NULL)
+ return (NULL);
+
+ return (nvpair_get_nvlist(nvp));
+}
+
+const nvlist_t *
+nvlist_get_pararr(const nvlist_t *nvl, void **cookiep)
+{
+ const nvlist_t *ret;
+
+ ret = nvlist_get_array_next(nvl);
+ if (ret != NULL) {
+ if (cookiep != NULL)
+ *cookiep = NULL;
+ return (ret);
+ }
+
+ ret = nvlist_get_parent(nvl, cookiep);
+ return (ret);
+}
+
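nvlist_get_pararr() gives the iterative walkers a single way to ascend: it follows the array-next link when the list is an array element with a successor, and otherwise falls back to the parent. The traversal skeleton shared by the dump, size, and pack loops below is, in condensed form:

        while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
                do {
                        cookie = NULL;
                        nvl = nvlist_get_pararr(nvl, &cookie);
                        if (nvl == NULL)
                                return;         /* left the root list */
                        if (nvlist_in_array(nvl) && cookie == NULL)
                                nvp = nvlist_first_nvpair(nvl); /* entered next array element */
                        else
                                nvp = cookie;   /* resumed in the parent */
                } while (nvp == NULL);
        }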
bool
nvlist_empty(const nvlist_t *nvl)
{
@@ -239,9 +299,18 @@ nvlist_flags(const nvlist_t *nvl)
NVLIST_ASSERT(nvl);
PJDLOG_ASSERT(nvl->nvl_error == 0);
- PJDLOG_ASSERT((nvl->nvl_flags & ~(NV_FLAG_PUBLIC_MASK)) == 0);
- return (nvl->nvl_flags);
+ return (nvl->nvl_flags & NV_FLAG_PUBLIC_MASK);
+}
+
+void
+nvlist_set_flags(nvlist_t *nvl, int flags)
+{
+
+ NVLIST_ASSERT(nvl);
+ PJDLOG_ASSERT(nvl->nvl_error == 0);
+
+ nvl->nvl_flags = flags;
}
static void
@@ -418,17 +487,129 @@ nvlist_dump(const nvlist_t *nvl, int fd)
dprintf(fd, "\n");
break;
}
+ case NV_TYPE_BOOL_ARRAY:
+ {
+ const bool *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_bool_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ dprintf(fd, "%s", value[ii] ? "TRUE" : "FALSE");
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_STRING_ARRAY:
+ {
+ const char * const *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_string_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL)
+ dprintf(fd, "NULL");
+ else
+ dprintf(fd, "\"%s\"", value[ii]);
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_NUMBER_ARRAY:
+ {
+ const uint64_t *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_number_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ dprintf(fd, "%ju (%jd) (0x%jx)",
+ value[ii], value[ii], value[ii]);
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ {
+ const int *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_descriptor_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ dprintf(fd, "%d", value[ii]);
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ dprintf(fd, " %zu\n", nitems);
+ tmpnvl = NULL;
+ tmpnvp = NULL;
+ for (ii = 0; ii < nitems; ii++) {
+ if (nvlist_dump_error_check(value[ii], fd,
+ level + 1)) {
+ break;
+ }
+
+ if (tmpnvl == NULL) {
+ tmpnvp = nvlist_first_nvpair(value[ii]);
+ if (tmpnvp != NULL) {
+ tmpnvl = value[ii];
+ } else {
+ dprintf(fd, "%*s,\n",
+ (level + 1) * 4, "");
+ }
+ }
+ }
+ if (tmpnvp != NULL) {
+ nvl = tmpnvl;
+ nvp = tmpnvp;
+ level++;
+ continue;
+ }
+ break;
+ }
default:
PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp));
}
while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
- cookie = NULL;
- nvl = nvlist_get_parent(nvl, &cookie);
- if (nvl == NULL)
- return;
- nvp = cookie;
- level--;
+ do {
+ cookie = NULL;
+ if (nvlist_in_array(nvl))
+ dprintf(fd, "%*s,\n", level * 4, "");
+ nvl = nvlist_get_pararr(nvl, &cookie);
+ if (nvl == NULL)
+ return;
+ if (nvlist_in_array(nvl) && cookie == NULL) {
+ nvp = nvlist_first_nvpair(nvl);
+ } else {
+ nvp = cookie;
+ level--;
+ }
+ } while (nvp == NULL);
+ if (nvlist_in_array(nvl) && cookie == NULL)
+ break;
}
}
}
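With these cases in place, nvlist_dump() renders scalar-array values inline; for a three-element number array named "numbers" the output would look roughly like this (exact prefix depends on nesting level):

        numbers (NUMBER ARRAY): [ 1 (1) (0x1), 2 (2) (0x2), 3 (3) (0x3) ]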
@@ -449,9 +630,11 @@ size_t
nvlist_size(const nvlist_t *nvl)
{
const nvlist_t *tmpnvl;
+ const nvlist_t * const *nvlarray;
const nvpair_t *nvp, *tmpnvp;
void *cookie;
- size_t size;
+ size_t size, nitems;
+ unsigned int ii;
NVLIST_ASSERT(nvl);
PJDLOG_ASSERT(nvl->nvl_error == 0);
@@ -472,16 +655,47 @@ nvlist_size(const nvlist_t *nvl)
nvp = tmpnvp;
continue;
}
+ } else if (nvpair_type(nvp) == NV_TYPE_NVLIST_ARRAY) {
+ nvlarray = nvpair_get_nvlist_array(nvp, &nitems);
+ PJDLOG_ASSERT(nitems > 0);
+
+ size += (nvpair_header_size() + 1) * nitems;
+ size += sizeof(struct nvlist_header) * nitems;
+
+ tmpnvl = NULL;
+ tmpnvp = NULL;
+ for (ii = 0; ii < nitems; ii++) {
+ PJDLOG_ASSERT(nvlarray[ii]->nvl_error == 0);
+ tmpnvp = nvlist_first_nvpair(nvlarray[ii]);
+ if (tmpnvp != NULL) {
+ tmpnvl = nvlarray[ii];
+ break;
+ }
+ }
+ if (tmpnvp != NULL) {
+ nvp = tmpnvp;
+ nvl = tmpnvl;
+ continue;
+ }
+
} else {
size += nvpair_size(nvp);
}
while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
- cookie = NULL;
- nvl = nvlist_get_parent(nvl, &cookie);
- if (nvl == NULL)
- goto out;
- nvp = cookie;
+ do {
+ cookie = NULL;
+ nvl = nvlist_get_pararr(nvl, &cookie);
+ if (nvl == NULL)
+ goto out;
+ if (nvlist_in_array(nvl) && cookie == NULL) {
+ nvp = nvlist_first_nvpair(nvl);
+ } else {
+ nvp = cookie;
+ }
+ } while (nvp == NULL);
+ if (nvlist_in_array(nvl) && cookie == NULL)
+ break;
}
}
@@ -508,13 +722,40 @@ nvlist_xdescriptors(const nvlist_t *nvl, int *descs)
*descs = nvpair_get_descriptor(nvp);
descs++;
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ {
+ const int *value;
+ size_t nitems;
+ unsigned int ii;
+
+ value = nvpair_get_descriptor_array(nvp,
+ &nitems);
+ for (ii = 0; ii < nitems; ii++) {
+ *descs = value[ii];
+ descs++;
+ }
+ break;
+ }
case NV_TYPE_NVLIST:
nvl = nvpair_get_nvlist(nvp);
nvp = NULL;
break;
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const *value;
+ size_t nitems;
+
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ PJDLOG_ASSERT(value != NULL);
+ PJDLOG_ASSERT(nitems > 0);
+
+ nvl = value[0];
+ nvp = NULL;
+ break;
+ }
}
}
- } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL);
+ } while ((nvl = nvlist_get_pararr(nvl, (void**)&nvp)) != NULL);
return (descs);
}
@@ -564,9 +805,31 @@ nvlist_ndescriptors(const nvlist_t *nvl)
nvl = nvpair_get_nvlist(nvp);
nvp = NULL;
break;
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const *value;
+ size_t nitems;
+
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ PJDLOG_ASSERT(value != NULL);
+ PJDLOG_ASSERT(nitems > 0);
+
+ nvl = value[0];
+ nvp = NULL;
+ break;
+ }
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ {
+ size_t nitems;
+
+ (void)nvpair_get_descriptor_array(nvp,
+ &nitems);
+ ndescs += nitems;
+ break;
+ }
}
}
- } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL);
+ } while ((nvl = nvlist_get_pararr(nvl, (void**)&nvp)) != NULL);
return (ndescs);
#else
@@ -661,24 +924,86 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep)
case NV_TYPE_DESCRIPTOR:
ptr = nvpair_pack_descriptor(nvp, ptr, fdidxp, &left);
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ ptr = nvpair_pack_descriptor_array(nvp, ptr, fdidxp,
+ &left);
+ break;
#endif
case NV_TYPE_BINARY:
ptr = nvpair_pack_binary(nvp, ptr, &left);
break;
+ case NV_TYPE_BOOL_ARRAY:
+ ptr = nvpair_pack_bool_array(nvp, ptr, &left);
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ ptr = nvpair_pack_number_array(nvp, ptr, &left);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ ptr = nvpair_pack_string_array(nvp, ptr, &left);
+ break;
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const * value;
+ size_t nitems;
+ unsigned int ii;
+
+ tmpnvl = NULL;
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ for (ii = 0; ii < nitems; ii++) {
+ ptr = nvlist_pack_header(value[ii], ptr, &left);
+ if (ptr == NULL)
+ goto out;
+ tmpnvp = nvlist_first_nvpair(value[ii]);
+ if (tmpnvp != NULL) {
+ tmpnvl = value[ii];
+ break;
+ }
+ ptr = nvpair_pack_nvlist_array_next(ptr, &left);
+ if (ptr == NULL)
+ goto out;
+ }
+ if (tmpnvl != NULL) {
+ nvl = tmpnvl;
+ nvp = tmpnvp;
+ continue;
+ }
+ break;
+ }
default:
PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp));
}
if (ptr == NULL)
goto fail;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
- cookie = NULL;
- nvl = nvlist_get_parent(nvl, &cookie);
- if (nvl == NULL)
- goto out;
- nvp = cookie;
- ptr = nvpair_pack_nvlist_up(ptr, &left);
- if (ptr == NULL)
- goto fail;
+ do {
+ cookie = NULL;
+ if (nvlist_in_array(nvl)) {
+ ptr = nvpair_pack_nvlist_array_next(ptr,
+ &left);
+ if (ptr == NULL)
+ goto fail;
+ }
+ nvl = nvlist_get_pararr(nvl, &cookie);
+ if (nvl == NULL)
+ goto out;
+ if (nvlist_in_array(nvl) && cookie == NULL) {
+ nvp = nvlist_first_nvpair(nvl);
+ ptr = nvlist_pack_header(nvl, ptr,
+ &left);
+ if (ptr == NULL)
+ goto fail;
+ } else if (nvpair_type((nvpair_t *)cookie) !=
+ NV_TYPE_NVLIST_ARRAY) {
+ ptr = nvpair_pack_nvlist_up(ptr, &left);
+ if (ptr == NULL)
+ goto fail;
+ nvp = cookie;
+ } else {
+ nvp = cookie;
+ }
+ } while (nvp == NULL);
+ if (nvlist_in_array(nvl) && cookie == NULL)
+ break;
}
}
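Packed nvlist arrays thus carry one nvlist header per element, separated by NVLIST_ARRAY_NEXT markers, but the public entry points are unchanged; a userland round trip is still just (a sketch, assuming the three-argument nvlist_unpack() of this API and no descriptors):

        size_t size;
        void *buf;
        nvlist_t *copy;

        buf = nvlist_pack(nvl, &size);          /* walks arrays via the loop above */
        copy = nvlist_unpack(buf, size, 0);     /* rebuilds the element chain */
        free(buf);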
@@ -741,6 +1066,7 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds,
bool *isbep, size_t *leftp)
{
struct nvlist_header nvlhdr;
+ int inarrayf;
if (*leftp < sizeof(nvlhdr))
goto failed;
@@ -762,7 +1088,8 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds,
if ((nvlhdr.nvlh_flags & ~NV_FLAG_ALL_MASK) != 0)
goto failed;
- nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK);
+ inarrayf = (nvl->nvl_flags & NV_FLAG_IN_ARRAY);
+ nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK) | inarrayf;
ptr += sizeof(nvlhdr);
if (isbep != NULL)
@@ -780,7 +1107,7 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
int flags)
{
const unsigned char *ptr;
- nvlist_t *nvl, *retnvl, *tmpnvl;
+ nvlist_t *nvl, *retnvl, *tmpnvl, *array;
nvpair_t *nvp;
size_t left;
bool isbe;
@@ -790,7 +1117,7 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
left = size;
ptr = buf;
- tmpnvl = NULL;
+ tmpnvl = array = NULL;
nvl = retnvl = nvlist_create(0);
if (nvl == NULL)
goto failed;
@@ -832,6 +1159,10 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
ptr = nvpair_unpack_descriptor(isbe, nvp, ptr, &left,
fds, nfds);
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ ptr = nvpair_unpack_descriptor_array(isbe, nvp, ptr,
+ &left, fds, nfds);
+ break;
#endif
case NV_TYPE_BINARY:
ptr = nvpair_unpack_binary(isbe, nvp, ptr, &left);
@@ -842,6 +1173,44 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
nvl = nvpair_nvlist(nvl->nvl_parent);
nvpair_free_structure(nvp);
continue;
+ case NV_TYPE_NVLIST_ARRAY_NEXT:
+ if (nvl->nvl_array_next == NULL) {
+ if (nvl->nvl_parent == NULL)
+ goto failed;
+ nvl = nvpair_nvlist(nvl->nvl_parent);
+ } else {
+ nvl = __DECONST(nvlist_t *,
+ nvlist_get_array_next(nvl));
+ ptr = nvlist_unpack_header(nvl, ptr, nfds,
+ &isbe, &left);
+ if (ptr == NULL)
+ goto failed;
+ }
+ nvpair_free_structure(nvp);
+ continue;
+ case NV_TYPE_BOOL_ARRAY:
+ ptr = nvpair_unpack_bool_array(isbe, nvp, ptr, &left);
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ ptr = nvpair_unpack_number_array(isbe, nvp, ptr, &left);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ ptr = nvpair_unpack_string_array(isbe, nvp, ptr, &left);
+ break;
+ case NV_TYPE_NVLIST_ARRAY:
+ ptr = nvpair_unpack_nvlist_array(isbe, nvp, ptr, &left,
+ &array);
+ if (ptr == NULL)
+ goto failed;
+ tmpnvl = array;
+ while (array != NULL) {
+ nvlist_set_parent(array, nvp);
+ array = __DECONST(nvlist_t *,
+ nvlist_get_array_next(array));
+ }
+ ptr = nvlist_unpack_header(tmpnvl, ptr, nfds, &isbe,
+ &left);
+ break;
default:
PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp));
}
@@ -1062,10 +1431,15 @@ NVLIST_EXISTS(bool, BOOL)
NVLIST_EXISTS(number, NUMBER)
NVLIST_EXISTS(string, STRING)
NVLIST_EXISTS(nvlist, NVLIST)
+NVLIST_EXISTS(binary, BINARY)
+NVLIST_EXISTS(bool_array, BOOL_ARRAY)
+NVLIST_EXISTS(number_array, NUMBER_ARRAY)
+NVLIST_EXISTS(string_array, STRING_ARRAY)
+NVLIST_EXISTS(nvlist_array, NVLIST_ARRAY)
#ifndef _KERNEL
NVLIST_EXISTS(descriptor, DESCRIPTOR)
+NVLIST_EXISTS(descriptor_array, DESCRIPTOR_ARRAY)
#endif
-NVLIST_EXISTS(binary, BINARY)
#undef NVLIST_EXISTS
@@ -1198,6 +1572,37 @@ NVLIST_ADD(int, descriptor);
#undef NVLIST_ADD
+#define NVLIST_ADD_ARRAY(vtype, type) \
+void \
+nvlist_add_##type##_array(nvlist_t *nvl, const char *name, vtype value, \
+ size_t nitems) \
+{ \
+ nvpair_t *nvp; \
+ \
+ if (nvlist_error(nvl) != 0) { \
+ ERRNO_SET(nvlist_error(nvl)); \
+ return; \
+ } \
+ \
+ nvp = nvpair_create_##type##_array(name, value, nitems); \
+ if (nvp == NULL) { \
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); \
+ ERRNO_SET(nvl->nvl_error); \
+ } else { \
+ (void)nvlist_move_nvpair(nvl, nvp); \
+ } \
+}
+
+NVLIST_ADD_ARRAY(const bool *, bool)
+NVLIST_ADD_ARRAY(const uint64_t *, number)
+NVLIST_ADD_ARRAY(const char * const *, string)
+NVLIST_ADD_ARRAY(const nvlist_t * const *, nvlist)
+#ifndef _KERNEL
+NVLIST_ADD_ARRAY(const int *, descriptor)
+#endif
+
+#undef NVLIST_ADD_ARRAY
+
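The generated adders copy their input (strings and nvlists deeply, via the nvpair_create_*_array() constructors), so the caller keeps ownership of what it passed in; for example:

        const bool flags[] = { true, false, true };

        nvlist_add_bool_array(nvl, "flags", flags, 3);  /* caller's array is untouched */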
bool
nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
@@ -1306,6 +1711,131 @@ nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size)
}
}
+void
+nvlist_move_bool_array(nvlist_t *nvl, const char *name, bool *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (nvlist_error(nvl) != 0) {
+ nv_free(value);
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_bool_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+void
+nvlist_move_string_array(nvlist_t *nvl, const char *name, char **value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ if (nvlist_error(nvl) != 0) {
+ if (value != NULL) {
+ for (i = 0; i < nitems; i++)
+ nv_free(value[i]);
+ nv_free(value);
+ }
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_string_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+void
+nvlist_move_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ if (nvlist_error(nvl) != 0) {
+ if (value != NULL) {
+ for (i = 0; i < nitems; i++) {
+ if (nvlist_get_pararr(value[i], NULL) == NULL)
+ nvlist_destroy(value[i]);
+ }
+ }
+ nv_free(value);
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_nvlist_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+void
+nvlist_move_number_array(nvlist_t *nvl, const char *name, uint64_t *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (nvlist_error(nvl) != 0) {
+ nv_free(value);
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_number_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+#ifndef _KERNEL
+void
+nvlist_move_descriptor_array(nvlist_t *nvl, const char *name, int *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ if (nvlist_error(nvl) != 0) {
+ if (value != NULL) {
+ for (i = 0; i < nitems; i++)
+ close(value[i]);
+ nv_free(value);
+ }
+
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_descriptor_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+#endif
+
const nvpair_t *
nvlist_get_nvpair(const nvlist_t *nvl, const char *name)
{
@@ -1347,6 +1877,29 @@ nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep)
return (nvpair_get_binary(nvp, sizep));
}
+#define NVLIST_GET_ARRAY(ftype, type, TYPE) \
+ftype \
+nvlist_get_##type##_array(const nvlist_t *nvl, const char *name, \
+ size_t *nitems) \
+{ \
+ const nvpair_t *nvp; \
+ \
+ nvp = nvlist_find(nvl, NV_TYPE_##TYPE##_ARRAY, name); \
+ if (nvp == NULL) \
+ nvlist_report_missing(NV_TYPE_##TYPE##_ARRAY, name); \
+ return (nvpair_get_##type##_array(nvp, nitems)); \
+}
+
+NVLIST_GET_ARRAY(const bool *, bool, BOOL)
+NVLIST_GET_ARRAY(const uint64_t *, number, NUMBER)
+NVLIST_GET_ARRAY(const char * const *, string, STRING)
+NVLIST_GET_ARRAY(const nvlist_t * const *, nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_GET_ARRAY(const int *, descriptor, DESCRIPTOR)
+#endif
+
+#undef NVLIST_GET_ARRAY
+
#define NVLIST_TAKE(ftype, type, TYPE) \
ftype \
nvlist_take_##type(nvlist_t *nvl, const char *name) \
@@ -1389,6 +1942,31 @@ nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep)
return (value);
}
+#define NVLIST_TAKE_ARRAY(ftype, type, TYPE) \
+ftype \
+nvlist_take_##type##_array(nvlist_t *nvl, const char *name, \
+ size_t *nitems) \
+{ \
+ nvpair_t *nvp; \
+ ftype value; \
+ \
+ nvp = nvlist_find(nvl, NV_TYPE_##TYPE##_ARRAY, name); \
+ if (nvp == NULL) \
+ nvlist_report_missing(NV_TYPE_##TYPE##_ARRAY, name); \
+ value = (ftype)(intptr_t)nvpair_get_##type##_array(nvp, nitems);\
+ nvlist_remove_nvpair(nvl, nvp); \
+ nvpair_free_structure(nvp); \
+ return (value); \
+}
+
+NVLIST_TAKE_ARRAY(bool *, bool, BOOL)
+NVLIST_TAKE_ARRAY(uint64_t *, number, NUMBER)
+NVLIST_TAKE_ARRAY(char **, string, STRING)
+NVLIST_TAKE_ARRAY(nvlist_t **, nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_TAKE_ARRAY(int *, descriptor, DESCRIPTOR)
+#endif
+
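Unlike the get accessors, the generated take accessors remove the pair and transfer ownership of the backing storage to the caller, e.g. (userland sketch; a kernel consumer would release the buffer with nv_free() instead):

        size_t nitems;
        uint64_t *nums;

        nums = nvlist_take_number_array(nvl, "numbers", &nitems);
        /* ... use nums[0 .. nitems - 1] ... */
        free(nums);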
void
nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
@@ -1420,10 +1998,15 @@ NVLIST_FREE(bool, BOOL)
NVLIST_FREE(number, NUMBER)
NVLIST_FREE(string, STRING)
NVLIST_FREE(nvlist, NVLIST)
+NVLIST_FREE(binary, BINARY)
+NVLIST_FREE(bool_array, BOOL_ARRAY)
+NVLIST_FREE(number_array, NUMBER_ARRAY)
+NVLIST_FREE(string_array, STRING_ARRAY)
+NVLIST_FREE(nvlist_array, NVLIST_ARRAY)
#ifndef _KERNEL
NVLIST_FREE(descriptor, DESCRIPTOR)
+NVLIST_FREE(descriptor_array, DESCRIPTOR_ARRAY)
#endif
-NVLIST_FREE(binary, BINARY)
#undef NVLIST_FREE
diff --git a/sys/contrib/libnv/nvlist_impl.h b/sys/contrib/libnv/nvlist_impl.h
index 18ccebf..9952db8 100644
--- a/sys/contrib/libnv/nvlist_impl.h
+++ b/sys/contrib/libnv/nvlist_impl.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
diff --git a/sys/contrib/libnv/nvpair.c b/sys/contrib/libnv/nvpair.c
index 7146767..1e3bd0e 100644
--- a/sys/contrib/libnv/nvpair.c
+++ b/sys/contrib/libnv/nvpair.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -86,6 +87,7 @@ struct nvpair {
int nvp_type;
uint64_t nvp_data;
size_t nvp_datasize;
+ size_t nvp_nitems; /* Used only for array types. */
nvlist_t *nvp_list;
TAILQ_ENTRY(nvpair) nvp_next;
};
@@ -99,6 +101,7 @@ struct nvpair_header {
uint8_t nvph_type;
uint16_t nvph_namesize;
uint64_t nvph_datasize;
+ uint64_t nvph_nitems;
} __packed;
@@ -109,6 +112,36 @@ nvpair_assert(const nvpair_t *nvp)
NVPAIR_ASSERT(nvp);
}
+static nvpair_t *
+nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t namelen;
+
+ PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST);
+
+ namelen = strlen(name);
+ if (namelen >= NV_NAME_MAX) {
+ ERRNO_SET(ENAMETOOLONG);
+ return (NULL);
+ }
+
+ nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1);
+ if (nvp != NULL) {
+ nvp->nvp_name = (char *)(nvp + 1);
+ memcpy(nvp->nvp_name, name, namelen);
+ nvp->nvp_name[namelen] = '\0';
+ nvp->nvp_type = type;
+ nvp->nvp_data = data;
+ nvp->nvp_datasize = datasize;
+ nvp->nvp_nitems = nitems;
+ nvp->nvp_magic = NVPAIR_MAGIC;
+ }
+
+ return (nvp);
+}
+
nvlist_t *
nvpair_nvlist(const nvpair_t *nvp)
{
@@ -162,6 +195,19 @@ nvpair_remove_nvlist(nvpair_t *nvp)
nvlist_set_parent(nvl, NULL);
}
+static void
+nvpair_remove_nvlist_array(nvpair_t *nvp)
+{
+ nvlist_t **nvlarray;
+ size_t count, i;
+
+ /* XXX: DECONST is bad, mkay? */
+ nvlarray = __DECONST(nvlist_t **,
+ nvpair_get_nvlist_array(nvp, &count));
+ for (i = 0; i < count; i++)
+ nvlist_set_array_next(nvlarray[i], NULL);
+}
+
void
nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl)
{
@@ -171,6 +217,8 @@ nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl)
if (nvpair_type(nvp) == NV_TYPE_NVLIST)
nvpair_remove_nvlist(nvp);
+ else if (nvpair_type(nvp) == NV_TYPE_NVLIST_ARRAY)
+ nvpair_remove_nvlist_array(nvp);
TAILQ_REMOVE(head, nvp, nvp_next);
nvp->nvp_list = NULL;
@@ -204,16 +252,36 @@ nvpair_clone(const nvpair_t *nvp)
case NV_TYPE_NVLIST:
newnvp = nvpair_create_nvlist(name, nvpair_get_nvlist(nvp));
break;
+ case NV_TYPE_BINARY:
+ data = nvpair_get_binary(nvp, &datasize);
+ newnvp = nvpair_create_binary(name, data, datasize);
+ break;
+ case NV_TYPE_BOOL_ARRAY:
+ data = nvpair_get_bool_array(nvp, &datasize);
+ newnvp = nvpair_create_bool_array(name, data, datasize);
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ data = nvpair_get_number_array(nvp, &datasize);
+ newnvp = nvpair_create_number_array(name, data, datasize);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ data = nvpair_get_string_array(nvp, &datasize);
+ newnvp = nvpair_create_string_array(name, data, datasize);
+ break;
+ case NV_TYPE_NVLIST_ARRAY:
+ data = nvpair_get_nvlist_array(nvp, &datasize);
+ newnvp = nvpair_create_nvlist_array(name, data, datasize);
+ break;
#ifndef _KERNEL
case NV_TYPE_DESCRIPTOR:
newnvp = nvpair_create_descriptor(name,
nvpair_get_descriptor(nvp));
break;
-#endif
- case NV_TYPE_BINARY:
- data = nvpair_get_binary(nvp, &datasize);
- newnvp = nvpair_create_binary(name, data, datasize);
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ data = nvpair_get_descriptor_array(nvp, &datasize);
+ newnvp = nvpair_create_descriptor_array(name, data, datasize);
break;
+#endif
default:
PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp));
}
@@ -250,6 +318,7 @@ nvpair_pack_header(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
PJDLOG_ASSERT(namesize > 0 && namesize <= UINT16_MAX);
nvphdr.nvph_namesize = namesize;
nvphdr.nvph_datasize = nvp->nvp_datasize;
+ nvphdr.nvph_nitems = nvp->nvp_nitems;
PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
memcpy(ptr, &nvphdr, sizeof(nvphdr));
ptr += sizeof(nvphdr);
@@ -336,6 +405,32 @@ nvpair_pack_nvlist_up(unsigned char *ptr, size_t *leftp)
nvphdr.nvph_type = NV_TYPE_NVLIST_UP;
nvphdr.nvph_namesize = namesize;
nvphdr.nvph_datasize = 0;
+ nvphdr.nvph_nitems = 0;
+ PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
+ memcpy(ptr, &nvphdr, sizeof(nvphdr));
+ ptr += sizeof(nvphdr);
+ *leftp -= sizeof(nvphdr);
+
+ PJDLOG_ASSERT(*leftp >= namesize);
+ memcpy(ptr, name, namesize);
+ ptr += namesize;
+ *leftp -= namesize;
+
+ return (ptr);
+}
+
+unsigned char *
+nvpair_pack_nvlist_array_next(unsigned char *ptr, size_t *leftp)
+{
+ struct nvpair_header nvphdr;
+ size_t namesize;
+ const char *name = "";
+
+ namesize = 1;
+ nvphdr.nvph_type = NV_TYPE_NVLIST_ARRAY_NEXT;
+ nvphdr.nvph_namesize = namesize;
+ nvphdr.nvph_datasize = 0;
+ nvphdr.nvph_nitems = 0;
PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
memcpy(ptr, &nvphdr, sizeof(nvphdr));
ptr += sizeof(nvphdr);
@@ -396,6 +491,106 @@ nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
return (ptr);
}
+unsigned char *
+nvpair_pack_bool_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize);
+ ptr += nvp->nvp_datasize;
+ *leftp -= nvp->nvp_datasize;
+
+ return (ptr);
+}
+
+unsigned char *
+nvpair_pack_number_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize);
+ ptr += nvp->nvp_datasize;
+ *leftp -= nvp->nvp_datasize;
+
+ return (ptr);
+}
+
+unsigned char *
+nvpair_pack_string_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+ unsigned int ii;
+ size_t size, len;
+ const char * const *array;
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ size = 0;
+ array = nvpair_get_string_array(nvp, NULL);
+ PJDLOG_ASSERT(array != NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ len = strlen(array[ii]) + 1;
+ PJDLOG_ASSERT(*leftp >= len);
+
+ memcpy(ptr, (const void *)array[ii], len);
+ size += len;
+ ptr += len;
+ *leftp -= len;
+ }
+
+ PJDLOG_ASSERT(size == nvp->nvp_datasize);
+
+ return (ptr);
+}
+
+#ifndef _KERNEL
+unsigned char *
+nvpair_pack_descriptor_array(const nvpair_t *nvp, unsigned char *ptr,
+ int64_t *fdidxp, size_t *leftp)
+{
+ int64_t value;
+ const int *array;
+ unsigned int ii;
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ array = nvpair_get_descriptor_array(nvp, NULL);
+ PJDLOG_ASSERT(array != NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ PJDLOG_ASSERT(*leftp >= sizeof(value));
+
+ value = array[ii];
+ if (value != -1) {
+ /*
+ * If there is a real descriptor here, we change its
+ * number to its position in the array of descriptors
+ * sent via the control message.
+ */
+ PJDLOG_ASSERT(fdidxp != NULL);
+
+ value = *fdidxp;
+ (*fdidxp)++;
+ }
+ memcpy(ptr, &value, sizeof(value));
+ ptr += sizeof(value);
+ *leftp -= sizeof(value);
+ }
+
+ return (ptr);
+}
+#endif
+
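Concretely, packing a descriptor array { 5, -1, 12 } with *fdidxp starting at 0 writes { 0, -1, 1 } into the buffer and leaves *fdidxp at 2: real descriptors travel as positions in the out-of-band fds[] array carried by the control message, the -1 placeholder is preserved as-is, and nvpair_unpack_descriptor_array() below maps the indices back through fds[].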
void
nvpair_init_datasize(nvpair_t *nvp)
{
@@ -430,7 +625,8 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
goto failed;
#endif
if (nvphdr.nvph_type > NV_TYPE_LAST &&
- nvphdr.nvph_type != NV_TYPE_NVLIST_UP) {
+ nvphdr.nvph_type != NV_TYPE_NVLIST_UP &&
+ nvphdr.nvph_type != NV_TYPE_NVLIST_ARRAY_NEXT) {
goto failed;
}
@@ -467,6 +663,7 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
nvp->nvp_type = nvphdr.nvph_type;
nvp->nvp_data = 0;
nvp->nvp_datasize = nvphdr.nvph_datasize;
+ nvp->nvp_nitems = nvphdr.nvph_nitems;
return (ptr);
failed:
@@ -540,6 +737,7 @@ nvpair_unpack_number(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
nvp->nvp_data = be64dec(ptr);
else
nvp->nvp_data = le64dec(ptr);
+
ptr += sizeof(uint64_t);
*leftp -= sizeof(uint64_t);
@@ -670,6 +868,234 @@ nvpair_unpack_binary(bool isbe __unused, nvpair_t *nvp,
}
const unsigned char *
+nvpair_unpack_bool_array(bool isbe __unused, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp)
+{
+ uint8_t *value;
+ size_t size;
+ unsigned int i;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY);
+
+ size = sizeof(*value) * nvp->nvp_nitems;
+ if (nvp->nvp_datasize != size || *leftp < size ||
+ nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(size);
+ if (value == NULL)
+ return (NULL);
+
+ for (i = 0; i < nvp->nvp_nitems; i++) {
+ value[i] = *(const uint8_t *)ptr;
+
+ ptr += sizeof(*value);
+ *leftp -= sizeof(*value);
+ }
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+ return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_number_array(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
+ size_t *leftp)
+{
+ uint64_t *value;
+ size_t size;
+ unsigned int i;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY);
+
+ size = sizeof(*value) * nvp->nvp_nitems;
+ if (nvp->nvp_datasize != size || *leftp < size ||
+ nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(size);
+ if (value == NULL)
+ return (NULL);
+
+ for (i = 0; i < nvp->nvp_nitems; i++) {
+ if (isbe)
+ value[i] = be64dec(ptr);
+ else
+ value[i] = le64dec(ptr);
+
+ ptr += sizeof(*value);
+ *leftp -= sizeof(*value);
+ }
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+ return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_string_array(bool isbe __unused, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp)
+{
+ ssize_t size;
+ size_t len;
+ const char *tmp;
+ char **value;
+ unsigned int ii, j;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY);
+
+ if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0 ||
+ nvp->nvp_nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = nvp->nvp_datasize;
+ tmp = (const char *)ptr;
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ if (size <= 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+ len = strnlen(tmp, size - 1) + 1;
+ size -= len;
+ if (size < 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+ tmp += len;
+ }
+ if (size != 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(sizeof(*value) * nvp->nvp_nitems);
+ if (value == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ value[ii] = nv_strdup((const char *)ptr);
+ if (value[ii] == NULL)
+ goto out;
+ len = strlen(value[ii]) + 1;
+ ptr += len;
+ *leftp -= len;
+ }
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+ return (ptr);
+out:
+ for (j = 0; j < ii; j++)
+ nv_free(value[j]);
+ nv_free(value);
+ return (NULL);
+}
+
+#ifndef _KERNEL
+const unsigned char *
+nvpair_unpack_descriptor_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds)
+{
+ int64_t idx;
+ size_t size;
+ unsigned int ii;
+ int *array;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY);
+
+ size = sizeof(idx) * nvp->nvp_nitems;
+ if (nvp->nvp_datasize != size || *leftp < size ||
+ nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ array = (int *)nv_malloc(size);
+ if (array == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ if (isbe)
+ idx = be64dec(ptr);
+ else
+ idx = le64dec(ptr);
+
+ if (idx < 0) {
+ ERRNO_SET(EINVAL);
+ nv_free(array);
+ return (NULL);
+ }
+
+ if ((size_t)idx >= nfds) {
+ ERRNO_SET(EINVAL);
+ nv_free(array);
+ return (NULL);
+ }
+
+ array[ii] = fds[idx];
+
+ ptr += sizeof(idx);
+ *leftp -= sizeof(idx);
+ }
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)array;
+
+ return (ptr);
+}
+#endif
+
+const unsigned char *
+nvpair_unpack_nvlist_array(bool isbe __unused, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, nvlist_t **firstel)
+{
+ nvlist_t **value;
+ nvpair_t *tmpnvp;
+ unsigned int ii, j;
+ size_t sizeup;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST_ARRAY);
+
+ sizeup = sizeof(struct nvpair_header) * nvp->nvp_nitems;
+ if (nvp->nvp_nitems == 0 || sizeup < nvp->nvp_nitems ||
+ sizeup > *leftp) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(nvp->nvp_nitems * sizeof(*value));
+ if (value == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ value[ii] = nvlist_create(0);
+ if (value[ii] == NULL)
+ goto fail;
+ if (ii > 0) {
+ tmpnvp = nvpair_allocv(" ", NV_TYPE_NVLIST,
+ (uint64_t)(uintptr_t)value[ii], 0, 0);
+ if (tmpnvp == NULL)
+ goto fail;
+ nvlist_set_array_next(value[ii - 1], tmpnvp);
+ }
+ }
+ nvlist_set_flags(value[nvp->nvp_nitems - 1], NV_FLAG_IN_ARRAY);
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+ *firstel = value[0];
+
+ return (ptr);
+fail:
+ ERRNO_SAVE();
+ for (j = 0; j < ii; j++)
+ nvlist_destroy(value[j]);
+ nv_free(value);
+ ERRNO_RESTORE();
+
+ return (NULL);
+}
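The unpacked elements end up chained the same way nvpair_create_nvlist_array() further down chains its clones: each element except the last owns a hidden " "-named NV_TYPE_NVLIST nvpair pointing at its successor, installed via nvlist_set_array_next(), and every element carries NV_FLAG_IN_ARRAY (set implicitly by the chaining, explicitly on the last element). For a two-element array the links are, in sketch form:

        value[0] --nvl_array_next--> nvpair(" ") --> value[1]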
+
+const unsigned char *
nvpair_unpack(bool isbe, const unsigned char *ptr, size_t *leftp,
nvpair_t **nvpp)
{
@@ -717,34 +1143,6 @@ nvpair_name(const nvpair_t *nvp)
return (nvp->nvp_name);
}
-static nvpair_t *
-nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize)
-{
- nvpair_t *nvp;
- size_t namelen;
-
- PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST);
-
- namelen = strlen(name);
- if (namelen >= NV_NAME_MAX) {
- ERRNO_SET(ENAMETOOLONG);
- return (NULL);
- }
-
- nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1);
- if (nvp != NULL) {
- nvp->nvp_name = (char *)(nvp + 1);
- memcpy(nvp->nvp_name, name, namelen);
- nvp->nvp_name[namelen] = '\0';
- nvp->nvp_type = type;
- nvp->nvp_data = data;
- nvp->nvp_datasize = datasize;
- nvp->nvp_magic = NVPAIR_MAGIC;
- }
-
- return (nvp);
-}
-
nvpair_t *
nvpair_create_stringf(const char *name, const char *valuefmt, ...)
{
@@ -778,7 +1176,7 @@ nvpair_t *
nvpair_create_null(const char *name)
{
- return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0));
+ return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0, 0));
}
nvpair_t *
@@ -786,14 +1184,14 @@ nvpair_create_bool(const char *name, bool value)
{
return (nvpair_allocv(name, NV_TYPE_BOOL, value ? 1 : 0,
- sizeof(uint8_t)));
+ sizeof(uint8_t), 0));
}
nvpair_t *
nvpair_create_number(const char *name, uint64_t value)
{
- return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value)));
+ return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value), 0));
}
nvpair_t *
@@ -814,7 +1212,7 @@ nvpair_create_string(const char *name, const char *value)
size = strlen(value) + 1;
nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)data,
- size);
+ size, 0);
if (nvp == NULL)
nv_free(data);
@@ -836,7 +1234,8 @@ nvpair_create_nvlist(const char *name, const nvlist_t *value)
if (nvl == NULL)
return (NULL);
- nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0);
+ nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0,
+ 0);
if (nvp == NULL)
nvlist_destroy(nvl);
else
@@ -861,7 +1260,7 @@ nvpair_create_descriptor(const char *name, int value)
return (NULL);
nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value,
- sizeof(int64_t));
+ sizeof(int64_t), 0);
if (nvp == NULL) {
ERRNO_SAVE();
close(value);
@@ -889,7 +1288,7 @@ nvpair_create_binary(const char *name, const void *value, size_t size)
memcpy(data, value, size);
nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)data,
- size);
+ size, 0);
if (nvp == NULL)
nv_free(data);
@@ -897,6 +1296,226 @@ nvpair_create_binary(const char *name, const void *value, size_t size)
}
nvpair_t *
+nvpair_create_bool_array(const char *name, const bool *value, size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t size;
+ void *data;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = sizeof(value[0]) * nitems;
+ data = nv_malloc(size);
+ if (data == NULL)
+ return (NULL);
+
+ memcpy(data, value, size);
+ nvp = nvpair_allocv(name, NV_TYPE_BOOL_ARRAY, (uint64_t)(uintptr_t)data,
+ size, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(data);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_create_number_array(const char *name, const uint64_t *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t size;
+ void *data;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = sizeof(value[0]) * nitems;
+ data = nv_malloc(size);
+ if (data == NULL)
+ return (NULL);
+
+ memcpy(data, value, size);
+ nvp = nvpair_allocv(name, NV_TYPE_NUMBER_ARRAY,
+ (uint64_t)(uintptr_t)data, size, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(data);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_create_string_array(const char *name, const char * const *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ unsigned int ii;
+ size_t datasize, size;
+ char **data;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = NULL;
+ datasize = 0;
+ data = nv_malloc(sizeof(value[0]) * nitems);
+ if (data == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL) {
+ ERRNO_SET(EINVAL);
+ goto fail;
+ }
+
+ size = strlen(value[ii]) + 1;
+ datasize += size;
+ data[ii] = nv_strdup(value[ii]);
+ if (data[ii] == NULL)
+ goto fail;
+ }
+ nvp = nvpair_allocv(name, NV_TYPE_STRING_ARRAY,
+ (uint64_t)(uintptr_t)data, datasize, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (; ii > 0; ii--)
+ nv_free(data[ii - 1]);
+ nv_free(data);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_create_nvlist_array(const char *name, const nvlist_t * const *value,
+ size_t nitems)
+{
+ unsigned int ii;
+ nvlist_t **nvls;
+ nvpair_t *nvp;
+ int flags;
+
+ nvp = NULL;
+ nvls = NULL;
+ ii = 0;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvls = nv_malloc(sizeof(value[0]) * nitems);
+ if (nvls == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL) {
+ ERRNO_SET(EINVAL);
+ goto fail;
+ }
+
+ nvls[ii] = nvlist_clone(value[ii]);
+ if (nvls[ii] == NULL)
+ goto fail;
+
+ if (ii > 0) {
+ nvp = nvpair_allocv(" ", NV_TYPE_NVLIST,
+ (uint64_t)(uintptr_t)nvls[ii], 0, 0);
+ if (nvp == NULL)
+ goto fail;
+ nvlist_set_array_next(nvls[ii - 1], nvp);
+ }
+ }
+ flags = nvlist_flags(nvls[nitems - 1]) | NV_FLAG_IN_ARRAY;
+ nvlist_set_flags(nvls[nitems - 1], flags);
+
+ nvp = nvpair_allocv(name, NV_TYPE_NVLIST_ARRAY,
+ (uint64_t)(uintptr_t)nvls, 0, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (; ii > 0; ii--)
+ nvlist_destroy(nvls[ii - 1]);
+
+ nv_free(nvls);
+ ERRNO_RESTORE();
+ } else {
+ for (ii = 0; ii < nitems; ii++)
+ nvlist_set_parent(nvls[ii], nvp);
+ }
+
+ return (nvp);
+}
+
+#ifndef _KERNEL
+nvpair_t *
+nvpair_create_descriptor_array(const char *name, const int *value,
+ size_t nitems)
+{
+ unsigned int ii;
+ nvpair_t *nvp;
+ int *fds;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = NULL;
+
+ fds = nv_malloc(sizeof(value[0]) * nitems);
+ if (fds == NULL)
+ return (NULL);
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == -1) {
+ fds[ii] = -1;
+ } else {
+ if (!fd_is_valid(value[ii])) {
+ ERRNO_SET(EBADF);
+ goto fail;
+ }
+
+ fds[ii] = fcntl(value[ii], F_DUPFD_CLOEXEC, 0);
+ if (fds[ii] == -1)
+ goto fail;
+ }
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR_ARRAY,
+ (uint64_t)(uintptr_t)fds, sizeof(int64_t) * nitems, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (; ii > 0; ii--) {
+ if (fds[ii - 1] != -1)
+ close(fds[ii - 1]);
+ }
+ nv_free(fds);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+#endif
+
+nvpair_t *
nvpair_move_string(const char *name, char *value)
{
nvpair_t *nvp;
@@ -907,7 +1526,7 @@ nvpair_move_string(const char *name, char *value)
}
nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)value,
- strlen(value) + 1);
+ strlen(value) + 1, 0);
if (nvp == NULL) {
ERRNO_SAVE();
nv_free(value);
@@ -934,7 +1553,7 @@ nvpair_move_nvlist(const char *name, nvlist_t *value)
}
nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)value,
- 0);
+ 0, 0);
if (nvp == NULL)
nvlist_destroy(value);
else
@@ -955,7 +1574,7 @@ nvpair_move_descriptor(const char *name, int value)
}
nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value,
- sizeof(int64_t));
+ sizeof(int64_t), 0);
if (nvp == NULL) {
ERRNO_SAVE();
close(value);
@@ -977,7 +1596,83 @@ nvpair_move_binary(const char *name, void *value, size_t size)
}
nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)value,
- size);
+ size, 0);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_move_bool_array(const char *name, bool *value, size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_BOOL_ARRAY,
+ (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_move_string_array(const char *name, char **value, size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i, size;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = 0;
+ for (i = 0; i < nitems; i++) {
+ if (value[i] == NULL) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size += strlen(value[i]) + 1;
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_STRING_ARRAY,
+ (uint64_t)(uintptr_t)value, size, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (i = 0; i < nitems; i++)
+ nv_free(value[i]);
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_move_number_array(const char *name, uint64_t *value, size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_NUMBER_ARRAY,
+ (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems);
if (nvp == NULL) {
ERRNO_SAVE();
nv_free(value);
@@ -987,6 +1682,95 @@ nvpair_move_binary(const char *name, void *value, size_t size)
return (nvp);
}
+nvpair_t *
+nvpair_move_nvlist_array(const char *name, nvlist_t **value, size_t nitems)
+{
+ unsigned int ii;
+ nvpair_t *nvp;
+ int flags;
+
+ nvp = NULL;
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL || nvlist_error(value[ii]) != 0 ||
+ nvlist_get_pararr(value[ii], NULL) != NULL) {
+ ERRNO_SET(EINVAL);
+ goto fail;
+ }
+ if (ii > 0) {
+ nvp = nvpair_allocv(" ", NV_TYPE_NVLIST,
+ (uint64_t)(uintptr_t)value[ii], 0, 0);
+ if (nvp == NULL)
+ goto fail;
+ nvlist_set_array_next(value[ii - 1], nvp);
+ }
+ }
+ flags = nvlist_flags(value[nitems - 1]) | NV_FLAG_IN_ARRAY;
+ nvlist_set_flags(value[nitems - 1], flags);
+
+ nvp = nvpair_allocv(name, NV_TYPE_NVLIST_ARRAY,
+ (uint64_t)(uintptr_t)value, 0, nitems);
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] != NULL &&
+ nvlist_get_pararr(value[ii], NULL) != NULL) {
+ nvlist_destroy(value[ii]);
+ }
+ }
+ nv_free(value);
+ ERRNO_RESTORE();
+ } else {
+ for (ii = 0; ii < nitems; ii++)
+ nvlist_set_parent(value[ii], nvp);
+ }
+
+ return (nvp);
+}
+
+#ifndef _KERNEL
+nvpair_t *
+nvpair_move_descriptor_array(const char *name, int *value, size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ nvp = NULL;
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ for (i = 0; i < nitems; i++) {
+ if (value[i] != -1 && !fd_is_valid(value[i])) {
+ ERRNO_SET(EBADF);
+ goto fail;
+ }
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR_ARRAY,
+ (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (i = 0; i < nitems; i++) {
+ if (fd_is_valid(value[i]))
+ close(value[i]);
+ }
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+#endif
+
bool
nvpair_get_bool(const nvpair_t *nvp)
{
@@ -1046,12 +1830,81 @@ nvpair_get_binary(const nvpair_t *nvp, size_t *sizep)
if (sizep != NULL)
*sizep = nvp->nvp_datasize;
+
return ((const void *)(intptr_t)nvp->nvp_data);
}
+const bool *
+nvpair_get_bool_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const bool *)(intptr_t)nvp->nvp_data);
+}
+
+const uint64_t *
+nvpair_get_number_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const uint64_t *)(intptr_t)nvp->nvp_data);
+}
+
+const char * const *
+nvpair_get_string_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const char * const *)(intptr_t)nvp->nvp_data);
+}
+
+const nvlist_t * const *
+nvpair_get_nvlist_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const nvlist_t * const *)((intptr_t)nvp->nvp_data));
+}
+
+#ifndef _KERNEL
+const int *
+nvpair_get_descriptor_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const int *)(intptr_t)nvp->nvp_data);
+}
+#endif
+
void
nvpair_free(nvpair_t *nvp)
{
+ size_t i;
NVPAIR_ASSERT(nvp);
PJDLOG_ASSERT(nvp->nvp_list == NULL);
@@ -1062,6 +1915,10 @@ nvpair_free(nvpair_t *nvp)
case NV_TYPE_DESCRIPTOR:
close((int)nvp->nvp_data);
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ for (i = 0; i < nvp->nvp_nitems; i++)
+ close(((int *)(intptr_t)nvp->nvp_data)[i]);
+ break;
#endif
case NV_TYPE_NVLIST:
nvlist_destroy((nvlist_t *)(intptr_t)nvp->nvp_data);
@@ -1072,6 +1929,23 @@ nvpair_free(nvpair_t *nvp)
case NV_TYPE_BINARY:
nv_free((void *)(intptr_t)nvp->nvp_data);
break;
+ case NV_TYPE_NVLIST_ARRAY:
+ for (i = 0; i < nvp->nvp_nitems; i++) {
+ nvlist_destroy(
+ ((nvlist_t **)(intptr_t)nvp->nvp_data)[i]);
+ }
+ nv_free(((nvlist_t **)(intptr_t)nvp->nvp_data));
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ nv_free((uint64_t *)(intptr_t)nvp->nvp_data);
+ break;
+ case NV_TYPE_BOOL_ARRAY:
+ nv_free((bool *)(intptr_t)nvp->nvp_data);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ for (i = 0; i < nvp->nvp_nitems; i++)
+ nv_free(((char **)(intptr_t)nvp->nvp_data)[i]);
+ break;
}
nv_free(nvp);
}
@@ -1106,6 +1980,16 @@ nvpair_type_string(int type)
return ("DESCRIPTOR");
case NV_TYPE_BINARY:
return ("BINARY");
+ case NV_TYPE_BOOL_ARRAY:
+ return ("BOOL ARRAY");
+ case NV_TYPE_NUMBER_ARRAY:
+ return ("NUMBER ARRAY");
+ case NV_TYPE_STRING_ARRAY:
+ return ("STRING ARRAY");
+ case NV_TYPE_NVLIST_ARRAY:
+ return ("NVLIST ARRAY");
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ return ("DESCRIPTOR ARRAY");
default:
return ("<UNKNOWN>");
}
diff --git a/sys/contrib/libnv/nvpair_impl.h b/sys/contrib/libnv/nvpair_impl.h
index fed7725..0350b1c 100644
--- a/sys/contrib/libnv/nvpair_impl.h
+++ b/sys/contrib/libnv/nvpair_impl.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -71,6 +72,15 @@ unsigned char *nvpair_pack_descriptor(const nvpair_t *nvp, unsigned char *ptr,
unsigned char *nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr,
size_t *leftp);
unsigned char *nvpair_pack_nvlist_up(unsigned char *ptr, size_t *leftp);
+unsigned char *nvpair_pack_bool_array(const nvpair_t *nvp, unsigned char *ptr,
+ size_t *leftp);
+unsigned char *nvpair_pack_number_array(const nvpair_t *nvp, unsigned char *ptr,
+ size_t *leftp);
+unsigned char *nvpair_pack_string_array(const nvpair_t *nvp, unsigned char *ptr,
+ size_t *leftp);
+unsigned char *nvpair_pack_descriptor_array(const nvpair_t *nvp,
+ unsigned char *ptr, int64_t *fdidxp, size_t *leftp);
+unsigned char *nvpair_pack_nvlist_array_next(unsigned char *ptr, size_t *leftp);
/* Unpack data functions. */
const unsigned char *nvpair_unpack_header(bool isbe, nvpair_t *nvp,
@@ -89,5 +99,15 @@ const unsigned char *nvpair_unpack_descriptor(bool isbe, nvpair_t *nvp,
const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds);
const unsigned char *nvpair_unpack_binary(bool isbe, nvpair_t *nvp,
const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_bool_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_number_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_string_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_descriptor_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds);
+const unsigned char *nvpair_unpack_nvlist_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, nvlist_t **firstel);
#endif /* !_NVPAIR_IMPL_H_ */
diff --git a/sys/dev/ata/ata-all.c b/sys/dev/ata/ata-all.c
index 52db44d..118e38e 100644
--- a/sys/dev/ata/ata-all.c
+++ b/sys/dev/ata/ata-all.c
@@ -64,18 +64,15 @@ static void ata_cam_end_transaction(device_t dev, struct ata_request *request);
static void ata_cam_request_sense(device_t dev, struct ata_request *request);
static int ata_check_ids(device_t dev, union ccb *ccb);
static void ata_conn_event(void *context, int dummy);
-static void ata_init(void);
static void ata_interrupt_locked(void *data);
static int ata_module_event_handler(module_t mod, int what, void *arg);
static void ata_periodic_poll(void *data);
static int ata_str2mode(const char *str);
-static void ata_uninit(void);
/* global vars */
MALLOC_DEFINE(M_ATA, "ata_generic", "ATA driver generic layer");
int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data) = NULL;
devclass_t ata_devclass;
-uma_zone_t ata_request_zone;
int ata_dma_check_80pin = 1;
/* sysctl vars */
@@ -650,12 +647,7 @@ ata_cam_begin_transaction(device_t dev, union ccb *ccb)
struct ata_channel *ch = device_get_softc(dev);
struct ata_request *request;
- if (!(request = ata_alloc_request())) {
- device_printf(dev, "FAILURE - out of memory in start\n");
- ccb->ccb_h.status = CAM_REQ_INVALID;
- xpt_done(ccb);
- return;
- }
+ request = &ch->request;
bzero(request, sizeof(*request));
/* setup request */
@@ -794,7 +786,6 @@ ata_cam_process_sense(device_t dev, struct ata_request *request)
ccb->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
- ata_free_request(request);
xpt_done(ccb);
/* Do error recovery if needed. */
if (fatalerr)
@@ -865,10 +856,8 @@ ata_cam_end_transaction(device_t dev, struct ata_request *request)
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)
ata_cam_request_sense(dev, request);
- else {
- ata_free_request(request);
+ else
xpt_done(ccb);
- }
/* Do error recovery if needed. */
if (fatalerr)
ata_reinit(dev);
@@ -1148,18 +1137,3 @@ static moduledata_t ata_moduledata = { "ata", ata_module_event_handler, NULL };
DECLARE_MODULE(ata, ata_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
MODULE_VERSION(ata, 1);
MODULE_DEPEND(ata, cam, 1, 1, 1);
-
-static void
-ata_init(void)
-{
- ata_request_zone = uma_zcreate("ata_request", sizeof(struct ata_request),
- NULL, NULL, NULL, NULL, 0, 0);
-}
-SYSINIT(ata_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_init, NULL);
-
-static void
-ata_uninit(void)
-{
- uma_zdestroy(ata_request_zone);
-}
-SYSUNINIT(ata_unregister, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_uninit, NULL);
diff --git a/sys/dev/ata/ata-all.h b/sys/dev/ata/ata-all.h
index 19cb7ef..cf8ed78 100644
--- a/sys/dev/ata/ata-all.h
+++ b/sys/dev/ata/ata-all.h
@@ -450,6 +450,7 @@ struct ata_channel {
struct ata_cam_device curr[16]; /* Current settings */
int requestsense; /* CCB waiting for SENSE. */
struct callout poll_callout; /* Periodic status poll. */
+ struct ata_request request;
};
/* disk bay/enclosure related */
@@ -507,14 +508,6 @@ int ata_sata_getrev(device_t dev, int target);
int ata_request2fis_h2d(struct ata_request *request, u_int8_t *fis);
void ata_pm_identify(device_t dev);
-/* macros for alloc/free of struct ata_request */
-extern uma_zone_t ata_request_zone;
-#define ata_alloc_request() uma_zalloc(ata_request_zone, M_NOWAIT | M_ZERO)
-#define ata_free_request(request) { \
- if (!(request->flags & ATA_R_DANGER2)) \
- uma_zfree(ata_request_zone, request); \
- }
-
MALLOC_DECLARE(M_ATA);
/* misc newbus defines */
diff --git a/sys/dev/ath/if_ath.c b/sys/dev/ath/if_ath.c
index 2c935a2b..26bf591 100644
--- a/sys/dev/ath/if_ath.c
+++ b/sys/dev/ath/if_ath.c
@@ -1473,7 +1473,7 @@ ath_vap_create(struct ieee80211com *ic, const char name[IFNAMSIZ], int unit,
const uint8_t bssid[IEEE80211_ADDR_LEN],
const uint8_t mac0[IEEE80211_ADDR_LEN])
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_vap *avp;
struct ieee80211vap *vap;
uint8_t mac[IEEE80211_ADDR_LEN];
@@ -1732,7 +1732,7 @@ ath_vap_delete(struct ieee80211vap *vap)
{
struct ieee80211com *ic = vap->iv_ic;
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
struct ath_vap *avp = ATH_VAP(vap);
@@ -2340,7 +2340,7 @@ ath_fatal_proc(void *arg, int pending)
static void
ath_bmiss_vap(struct ieee80211vap *vap)
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
/*
* Workaround phantom bmiss interrupts by sanity-checking
@@ -2361,8 +2361,6 @@ ath_bmiss_vap(struct ieee80211vap *vap)
ATH_UNLOCK(sc);
if ((vap->iv_flags_ext & IEEE80211_FEXT_SWBMISS) == 0) {
- struct ifnet *ifp = vap->iv_ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
u_int64_t lastrx = sc->sc_lastrx;
u_int64_t tsf = ath_hal_gettsf64(sc->sc_ah);
/* XXX should take a locked ref to iv_bss */
@@ -2851,8 +2849,8 @@ ath_stop(struct ifnet *ifp)
int
ath_reset(struct ifnet *ifp, ATH_RESET_TYPE reset_type)
{
- struct ath_softc *sc = ifp->if_softc;
struct ieee80211com *ic = ifp->if_l2com;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
HAL_STATUS status;
int i;
@@ -3045,7 +3043,7 @@ ath_reset_vap(struct ieee80211vap *vap, u_long cmd)
{
struct ieee80211com *ic = vap->iv_ic;
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
switch (cmd) {
@@ -3248,7 +3246,7 @@ static int
ath_transmit(struct ifnet *ifp, struct mbuf *m)
{
struct ieee80211com *ic = ifp->if_l2com;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ieee80211_node *ni;
struct mbuf *next;
struct ath_buf *bf;
@@ -3538,8 +3536,7 @@ ath_media_change(struct ifnet *ifp)
static void
ath_key_update_begin(struct ieee80211vap *vap)
{
- struct ifnet *ifp = vap->iv_ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_KEYCACHE, "%s:\n", __func__);
taskqueue_block(sc->sc_tq);
@@ -3548,8 +3545,7 @@ ath_key_update_begin(struct ieee80211vap *vap)
static void
ath_key_update_end(struct ieee80211vap *vap)
{
- struct ifnet *ifp = vap->iv_ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_KEYCACHE, "%s:\n", __func__);
taskqueue_unblock(sc->sc_tq);
@@ -4156,7 +4152,7 @@ static struct ieee80211_node *
ath_node_alloc(struct ieee80211vap *vap, const uint8_t mac[IEEE80211_ADDR_LEN])
{
struct ieee80211com *ic = vap->iv_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
const size_t space = sizeof(struct ath_node) + sc->sc_rc->arc_space;
struct ath_node *an;
@@ -4183,7 +4179,7 @@ static void
ath_node_cleanup(struct ieee80211_node *ni)
{
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_NODE, "%s: %6D: an %p\n", __func__,
ni->ni_macaddr, ":", ATH_NODE(ni));
@@ -4198,7 +4194,7 @@ static void
ath_node_free(struct ieee80211_node *ni)
{
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_NODE, "%s: %6D: an %p\n", __func__,
ni->ni_macaddr, ":", ATH_NODE(ni));
@@ -4210,7 +4206,7 @@ static void
ath_node_getsignal(const struct ieee80211_node *ni, int8_t *rssi, int8_t *noise)
{
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
*rssi = ic->ic_node_getrssi(ni);
@@ -4422,7 +4418,7 @@ ath_txq_update(struct ath_softc *sc, int ac)
int
ath_wme_update(struct ieee80211com *ic)
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
return !ath_txq_update(sc, WME_AC_BE) ||
!ath_txq_update(sc, WME_AC_BK) ||
@@ -5797,7 +5793,7 @@ static void
ath_scan_start(struct ieee80211com *ic)
{
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
u_int32_t rfilt;
@@ -5821,8 +5817,7 @@ ath_scan_start(struct ieee80211com *ic)
static void
ath_scan_end(struct ieee80211com *ic)
{
- struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
u_int32_t rfilt;
@@ -5862,8 +5857,7 @@ ath_scan_end(struct ieee80211com *ic)
static void
ath_update_chw(struct ieee80211com *ic)
{
- struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_STATE, "%s: called\n", __func__);
ath_set_channel(ic);
@@ -5873,8 +5867,7 @@ ath_update_chw(struct ieee80211com *ic)
static void
ath_set_channel(struct ieee80211com *ic)
{
- struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
ATH_LOCK(sc);
ath_power_set_power_state(sc, HAL_PM_AWAKE);
@@ -5916,7 +5909,7 @@ static int
ath_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
{
struct ieee80211com *ic = vap->iv_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_vap *avp = ATH_VAP(vap);
struct ath_hal *ah = sc->sc_ah;
struct ieee80211_node *ni = NULL;
@@ -6252,7 +6245,7 @@ static void
ath_setup_stationkey(struct ieee80211_node *ni)
{
struct ieee80211vap *vap = ni->ni_vap;
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
ieee80211_keyix keyix, rxkeyix;
/* XXX should take a locked ref to vap->iv_bss */
@@ -6285,7 +6278,7 @@ ath_newassoc(struct ieee80211_node *ni, int isnew)
{
struct ath_node *an = ATH_NODE(ni);
struct ieee80211vap *vap = ni->ni_vap;
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
const struct ieee80211_txparam *tp = ni->ni_txparms;
an->an_mcastrix = ath_tx_findrix(sc, tp->mcastrate);
@@ -6337,7 +6330,7 @@ static int
ath_setregdomain(struct ieee80211com *ic, struct ieee80211_regdomain *reg,
int nchans, struct ieee80211_channel chans[])
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
HAL_STATUS status;
@@ -6361,7 +6354,7 @@ static void
ath_getradiocaps(struct ieee80211com *ic,
int maxchans, int *nchans, struct ieee80211_channel chans[])
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
DPRINTF(sc, ATH_DEBUG_REGDOMAIN, "%s: use rd %u cc %d\n",
@@ -6693,8 +6686,8 @@ ath_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
#define IS_RUNNING(ifp) \
((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))
- struct ath_softc *sc = ifp->if_softc;
struct ieee80211com *ic = ifp->if_l2com;
+ struct ath_softc *sc = ic->ic_softc;
struct ifreq *ifr = (struct ifreq *)data;
const HAL_RATE_TABLE *rt;
int error = 0;
@@ -6864,7 +6857,7 @@ ath_node_powersave(struct ieee80211_node *ni, int enable)
#ifdef ATH_SW_PSQ
struct ath_node *an = ATH_NODE(ni);
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_vap *avp = ATH_VAP(ni->ni_vap);
/* XXX and no TXQ locks should be held here */
@@ -6931,7 +6924,7 @@ ath_node_set_tim(struct ieee80211_node *ni, int enable)
{
#ifdef ATH_SW_PSQ
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_node *an = ATH_NODE(ni);
struct ath_vap *avp = ATH_VAP(ni->ni_vap);
int changed = 0;
@@ -7136,7 +7129,7 @@ ath_node_recv_pspoll(struct ieee80211_node *ni, struct mbuf *m)
struct ath_node *an;
struct ath_vap *avp;
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
int tid;
/* Just paranoia */
diff --git a/sys/dev/ath/if_ath_keycache.c b/sys/dev/ath/if_ath_keycache.c
index fe99f10..b8a77e8 100644
--- a/sys/dev/ath/if_ath_keycache.c
+++ b/sys/dev/ath/if_ath_keycache.c
@@ -425,7 +425,7 @@ int
ath_key_alloc(struct ieee80211vap *vap, struct ieee80211_key *k,
ieee80211_keyix *keyix, ieee80211_keyix *rxkeyix)
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
/*
* Group key allocation must be handled specially for
@@ -493,7 +493,7 @@ ath_key_alloc(struct ieee80211vap *vap, struct ieee80211_key *k,
int
ath_key_delete(struct ieee80211vap *vap, const struct ieee80211_key *k)
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
const struct ieee80211_cipher *cip = k->wk_cipher;
u_int keyix = k->wk_keyix;
@@ -538,7 +538,7 @@ int
ath_key_set(struct ieee80211vap *vap, const struct ieee80211_key *k,
const u_int8_t mac[IEEE80211_ADDR_LEN])
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
return ath_keyset(sc, vap, k, vap->iv_bss);
}
diff --git a/sys/dev/ath/if_ath_rx.c b/sys/dev/ath/if_ath_rx.c
index 2779b7a..e391dd7 100644
--- a/sys/dev/ath/if_ath_rx.c
+++ b/sys/dev/ath/if_ath_rx.c
@@ -330,7 +330,7 @@ ath_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m,
int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf)
{
struct ieee80211vap *vap = ni->ni_vap;
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
uint64_t tsf_beacon_old, tsf_beacon;
uint64_t nexttbtt;
int64_t tsf_delta;
diff --git a/sys/dev/ath/if_ath_tdma.c b/sys/dev/ath/if_ath_tdma.c
index fd23db1..d4c9ccd 100644
--- a/sys/dev/ath/if_ath_tdma.c
+++ b/sys/dev/ath/if_ath_tdma.c
@@ -359,7 +359,7 @@ ath_tdma_update(struct ieee80211_node *ni,
#define TU_TO_TSF(_tu) (((u_int64_t)(_tu)) << 10)
struct ieee80211vap *vap = ni->ni_vap;
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
const HAL_RATE_TABLE *rt = sc->sc_currates;
u_int64_t tsf, rstamp, nextslot, nexttbtt, nexttbtt_full;
diff --git a/sys/dev/ath/if_ath_tx.c b/sys/dev/ath/if_ath_tx.c
index c15b158..916d4cb 100644
--- a/sys/dev/ath/if_ath_tx.c
+++ b/sys/dev/ath/if_ath_tx.c
@@ -2341,7 +2341,7 @@ ath_raw_xmit(struct ieee80211_node *ni, struct mbuf *m,
{
struct ieee80211com *ic = ni->ni_ic;
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_buf *bf;
struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
int error = 0;
@@ -5731,7 +5731,7 @@ int
ath_addba_request(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
int dialogtoken, int baparamset, int batimeout)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -5809,7 +5809,7 @@ int
ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
int status, int code, int batimeout)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -5856,7 +5856,7 @@ ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
void
ath_addba_stop(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -5991,7 +5991,7 @@ void
ath_bar_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
int status)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -6064,7 +6064,7 @@ void
ath_addba_response_timeout(struct ieee80211_node *ni,
struct ieee80211_tx_ampdu *tap)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
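The if_ath hunks above all apply a single conversion: softc lookups that detoured through the ifnet (ic->ic_ifp->if_softc, or vap->iv_ic->ic_ifp->if_softc) now use the ieee80211com back-pointer directly. A minimal sketch of the pattern, using a hypothetical callback name that is not part of the driver:

static void
ath_example_cb(struct ieee80211vap *vap)	/* hypothetical, for illustration */
{
	struct ieee80211com *ic = vap->iv_ic;
	struct ath_softc *sc = ic->ic_softc;	/* was: ic->ic_ifp->if_softc */

	ATH_LOCK(sc);
	/* ... per-callback driver work ... */
	ATH_UNLOCK(sc);
}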
diff --git a/sys/dev/bxe/ecore_hsi.h b/sys/dev/bxe/ecore_hsi.h
index 005bb2e..f78f4ea 100644
--- a/sys/dev/bxe/ecore_hsi.h
+++ b/sys/dev/bxe/ecore_hsi.h
@@ -2536,9 +2536,9 @@ struct shmem2_region {
#define SHMEM_EEE_SUPPORTED_MASK 0x000f0000
#define SHMEM_EEE_SUPPORTED_SHIFT 16
#define SHMEM_EEE_ADV_STATUS_MASK 0x00f00000
- #define SHMEM_EEE_100M_ADV (1<<0)
- #define SHMEM_EEE_1G_ADV (1<<1)
- #define SHMEM_EEE_10G_ADV (1<<2)
+ #define SHMEM_EEE_100M_ADV (1U<<0)
+ #define SHMEM_EEE_1G_ADV (1U<<1)
+ #define SHMEM_EEE_10G_ADV (1U<<2)
#define SHMEM_EEE_ADV_STATUS_SHIFT 20
#define SHMEM_EEE_LP_ADV_STATUS_MASK 0x0f000000
#define SHMEM_EEE_LP_ADV_STATUS_SHIFT 24
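The SHMEM_EEE_* change above is small but worth spelling out: writing flag macros as 1U<<n keeps the shift in unsigned arithmetic, so building and OR-ing masks never depends on signed-shift behaviour. An illustrative (hypothetical) pair:

#define EXAMPLE_FLAG_LOW	(1U << 0)	/* unsigned shift: well-defined */
#define EXAMPLE_FLAG_HIGH	(1U << 31)	/* legal only because of the U */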
diff --git a/sys/dev/e1000/e1000_80003es2lan.c b/sys/dev/e1000/e1000_80003es2lan.c
index 076e02b..b948bb4 100644
--- a/sys/dev/e1000/e1000_80003es2lan.c
+++ b/sys/dev/e1000/e1000_80003es2lan.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_80003es2lan.h b/sys/dev/e1000/e1000_80003es2lan.h
index 3807e46..89b1551 100644
--- a/sys/dev/e1000/e1000_80003es2lan.h
+++ b/sys/dev/e1000/e1000_80003es2lan.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82540.c b/sys/dev/e1000/e1000_82540.c
index 141b92e..68f92c6 100644
--- a/sys/dev/e1000/e1000_82540.c
+++ b/sys/dev/e1000/e1000_82540.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82541.c b/sys/dev/e1000/e1000_82541.c
index 781aa93..69fcee4 100644
--- a/sys/dev/e1000/e1000_82541.c
+++ b/sys/dev/e1000/e1000_82541.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82541.h b/sys/dev/e1000/e1000_82541.h
index 3b6b961..1eebfad 100644
--- a/sys/dev/e1000/e1000_82541.h
+++ b/sys/dev/e1000/e1000_82541.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2008, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82542.c b/sys/dev/e1000/e1000_82542.c
index 19d5402..a6b3616 100644
--- a/sys/dev/e1000/e1000_82542.c
+++ b/sys/dev/e1000/e1000_82542.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82543.c b/sys/dev/e1000/e1000_82543.c
index 1c01658..3350f17 100644
--- a/sys/dev/e1000/e1000_82543.c
+++ b/sys/dev/e1000/e1000_82543.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82543.h b/sys/dev/e1000/e1000_82543.h
index 60e5c15..0fa813b 100644
--- a/sys/dev/e1000/e1000_82543.h
+++ b/sys/dev/e1000/e1000_82543.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2008, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82571.c b/sys/dev/e1000/e1000_82571.c
index e209d43..a64ef56 100644
--- a/sys/dev/e1000/e1000_82571.c
+++ b/sys/dev/e1000/e1000_82571.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82571.h b/sys/dev/e1000/e1000_82571.h
index c76f16f..cda87a2 100644
--- a/sys/dev/e1000/e1000_82571.h
+++ b/sys/dev/e1000/e1000_82571.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82575.c b/sys/dev/e1000/e1000_82575.c
index d79db67..8981ae3 100644
--- a/sys/dev/e1000/e1000_82575.c
+++ b/sys/dev/e1000/e1000_82575.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82575.h b/sys/dev/e1000/e1000_82575.h
index 6569b98..503fdce 100644
--- a/sys/dev/e1000/e1000_82575.h
+++ b/sys/dev/e1000/e1000_82575.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_api.c b/sys/dev/e1000/e1000_api.c
index 374ffa6..5db22db 100644
--- a/sys/dev/e1000/e1000_api.c
+++ b/sys/dev/e1000/e1000_api.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_api.h b/sys/dev/e1000/e1000_api.h
index a2ffa16..e87acc8 100644
--- a/sys/dev/e1000/e1000_api.h
+++ b/sys/dev/e1000/e1000_api.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_defines.h b/sys/dev/e1000/e1000_defines.h
index 5deada2..9472ca4 100644
--- a/sys/dev/e1000/e1000_defines.h
+++ b/sys/dev/e1000/e1000_defines.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_hw.h b/sys/dev/e1000/e1000_hw.h
index faf64a3..3ec921e 100644
--- a/sys/dev/e1000/e1000_hw.h
+++ b/sys/dev/e1000/e1000_hw.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_i210.c b/sys/dev/e1000/e1000_i210.c
index f12c13f..563f11a 100644
--- a/sys/dev/e1000/e1000_i210.c
+++ b/sys/dev/e1000/e1000_i210.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_i210.h b/sys/dev/e1000/e1000_i210.h
index 2a20ca1..f940915 100644
--- a/sys/dev/e1000/e1000_i210.h
+++ b/sys/dev/e1000/e1000_i210.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_ich8lan.c b/sys/dev/e1000/e1000_ich8lan.c
index 204c39c..23e7b95 100644
--- a/sys/dev/e1000/e1000_ich8lan.c
+++ b/sys/dev/e1000/e1000_ich8lan.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_ich8lan.h b/sys/dev/e1000/e1000_ich8lan.h
index f045ebd..9cb79c0 100644
--- a/sys/dev/e1000/e1000_ich8lan.h
+++ b/sys/dev/e1000/e1000_ich8lan.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mac.c b/sys/dev/e1000/e1000_mac.c
index b888b34..1c86307 100644
--- a/sys/dev/e1000/e1000_mac.c
+++ b/sys/dev/e1000/e1000_mac.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mac.h b/sys/dev/e1000/e1000_mac.h
index 2c1bfe3..1daed9b 100644
--- a/sys/dev/e1000/e1000_mac.h
+++ b/sys/dev/e1000/e1000_mac.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_manage.c b/sys/dev/e1000/e1000_manage.c
index 8087e65..f319c8b 100644
--- a/sys/dev/e1000/e1000_manage.c
+++ b/sys/dev/e1000/e1000_manage.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_manage.h b/sys/dev/e1000/e1000_manage.h
index 51f17671..303e99e 100644
--- a/sys/dev/e1000/e1000_manage.h
+++ b/sys/dev/e1000/e1000_manage.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2012, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mbx.c b/sys/dev/e1000/e1000_mbx.c
index 55477b2..d9fb9ac 100644
--- a/sys/dev/e1000/e1000_mbx.c
+++ b/sys/dev/e1000/e1000_mbx.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mbx.h b/sys/dev/e1000/e1000_mbx.h
index d2aea5c4..fadd849 100644
--- a/sys/dev/e1000/e1000_mbx.h
+++ b/sys/dev/e1000/e1000_mbx.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_nvm.c b/sys/dev/e1000/e1000_nvm.c
index f702f71..0a1a18d 100644
--- a/sys/dev/e1000/e1000_nvm.c
+++ b/sys/dev/e1000/e1000_nvm.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_nvm.h b/sys/dev/e1000/e1000_nvm.h
index 34077b2..31f2180 100644
--- a/sys/dev/e1000/e1000_nvm.h
+++ b/sys/dev/e1000/e1000_nvm.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_osdep.c b/sys/dev/e1000/e1000_osdep.c
index 75a7b79..2987cda 100644
--- a/sys/dev/e1000/e1000_osdep.c
+++ b/sys/dev/e1000/e1000_osdep.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_osdep.h b/sys/dev/e1000/e1000_osdep.h
index 1324110..fc46f48 100644
--- a/sys/dev/e1000/e1000_osdep.h
+++ b/sys/dev/e1000/e1000_osdep.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_phy.c b/sys/dev/e1000/e1000_phy.c
index f27889c..adb6732 100644
--- a/sys/dev/e1000/e1000_phy.c
+++ b/sys/dev/e1000/e1000_phy.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_phy.h b/sys/dev/e1000/e1000_phy.h
index 0e5b2e6..d3d563f 100644
--- a/sys/dev/e1000/e1000_phy.h
+++ b/sys/dev/e1000/e1000_phy.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_regs.h b/sys/dev/e1000/e1000_regs.h
index 952a7dc..da93d75 100644
--- a/sys/dev/e1000/e1000_regs.h
+++ b/sys/dev/e1000/e1000_regs.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_vf.c b/sys/dev/e1000/e1000_vf.c
index 2cabac9..4af985b 100644
--- a/sys/dev/e1000/e1000_vf.c
+++ b/sys/dev/e1000/e1000_vf.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_vf.h b/sys/dev/e1000/e1000_vf.h
index 2a780741..e6f834e 100644
--- a/sys/dev/e1000/e1000_vf.h
+++ b/sys/dev/e1000/e1000_vf.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index 830325b..e36a3d8 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -364,8 +364,14 @@ MODULE_DEPEND(em, netmap, 1, 1, 1);
#define CSUM_TSO 0
#endif
+#define TSO_WORKAROUND 4
+
static SYSCTL_NODE(_hw, OID_AUTO, em, CTLFLAG_RD, 0, "EM driver parameters");
+static int em_disable_crc_stripping = 0;
+SYSCTL_INT(_hw_em, OID_AUTO, disable_crc_stripping, CTLFLAG_RDTUN,
+ &em_disable_crc_stripping, 0, "Disable CRC Stripping");
+
static int em_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV);
static int em_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
SYSCTL_INT(_hw_em, OID_AUTO, tx_int_delay, CTLFLAG_RDTUN, &em_tx_int_delay_dflt,
@@ -1872,13 +1878,15 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp)
struct ether_header *eh;
struct ip *ip = NULL;
struct tcphdr *tp = NULL;
- u32 txd_upper = 0, txd_lower = 0, txd_used = 0;
+ u32 txd_upper = 0, txd_lower = 0;
int ip_off, poff;
int nsegs, i, j, first, last = 0;
- int error, do_tso, tso_desc = 0, remap = 1;
+ int error;
+ bool do_tso, tso_desc, remap = TRUE;
m_head = *m_headp;
- do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0);
+ do_tso = (m_head->m_pkthdr.csum_flags & CSUM_TSO);
+ tso_desc = FALSE;
ip_off = poff = 0;
/*
@@ -1914,74 +1922,82 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp)
* for IPv6 yet.
*/
ip_off = sizeof(struct ether_header);
- m_head = m_pullup(m_head, ip_off);
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (m_head->m_len < ip_off) {
+ m_head = m_pullup(m_head, ip_off);
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
eh = mtod(m_head, struct ether_header *);
if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
ip_off = sizeof(struct ether_vlan_header);
- m_head = m_pullup(m_head, ip_off);
+ if (m_head->m_len < ip_off) {
+ m_head = m_pullup(m_head, ip_off);
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
+ }
+ }
+ if (m_head->m_len < ip_off + sizeof(struct ip)) {
+ m_head = m_pullup(m_head, ip_off + sizeof(struct ip));
if (m_head == NULL) {
*m_headp = NULL;
return (ENOBUFS);
}
}
- m_head = m_pullup(m_head, ip_off + sizeof(struct ip));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
- }
ip = (struct ip *)(mtod(m_head, char *) + ip_off);
poff = ip_off + (ip->ip_hl << 2);
- if (do_tso) {
- m_head = m_pullup(m_head, poff + sizeof(struct tcphdr));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+
+ if (do_tso || (m_head->m_pkthdr.csum_flags & CSUM_TCP)) {
+ if (m_head->m_len < poff + sizeof(struct tcphdr)) {
+ m_head = m_pullup(m_head, poff +
+ sizeof(struct tcphdr));
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
/*
* TSO workaround:
* pull 4 more bytes of data into it.
*/
- m_head = m_pullup(m_head, poff + (tp->th_off << 2) + 4);
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (m_head->m_len < poff + (tp->th_off << 2)) {
+ m_head = m_pullup(m_head, poff +
+ (tp->th_off << 2) +
+ TSO_WORKAROUND);
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
ip = (struct ip *)(mtod(m_head, char *) + ip_off);
- ip->ip_len = 0;
- ip->ip_sum = 0;
- /*
- * The pseudo TCP checksum does not include TCP payload
- * length so driver should recompute the checksum here
- * what hardware expect to see. This is adherence of
- * Microsoft's Large Send specification.
- */
tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
- tp->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr, htons(IPPROTO_TCP));
- } else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) {
- m_head = m_pullup(m_head, poff + sizeof(struct tcphdr));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (do_tso) {
+ ip->ip_len = htons(m_head->m_pkthdr.tso_segsz +
+ (ip->ip_hl << 2) +
+ (tp->th_off << 2));
+ ip->ip_sum = 0;
+ /*
+			 * The pseudo TCP checksum does not include the
+			 * TCP payload length, so the driver must
+			 * recompute here the checksum that the hardware
+			 * expects to see, per Microsoft's Large Send
+			 * specification.
+ */
+ tp->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(IPPROTO_TCP));
}
- tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
- m_head = m_pullup(m_head, poff + (tp->th_off << 2));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
- }
- ip = (struct ip *)(mtod(m_head, char *) + ip_off);
- tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
} else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) {
- m_head = m_pullup(m_head, poff + sizeof(struct udphdr));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (m_head->m_len < poff + sizeof(struct udphdr)) {
+ m_head = m_pullup(m_head, poff +
+ sizeof(struct udphdr));
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
ip = (struct ip *)(mtod(m_head, char *) + ip_off);
}
@@ -2027,7 +2043,7 @@ retry:
*m_headp = m;
/* Try it again, but only once */
- remap = 0;
+ remap = FALSE;
goto retry;
} else if (error != 0) {
adapter->no_tx_dma_setup++;
@@ -2042,13 +2058,13 @@ retry:
* it follows a TSO burst, then we need to add a
* sentinel descriptor to prevent premature writeback.
*/
- if ((do_tso == 0) && (txr->tx_tso == TRUE)) {
+ if ((!do_tso) && (txr->tx_tso == TRUE)) {
if (nsegs == 1)
tso_desc = TRUE;
txr->tx_tso = FALSE;
}
- if (nsegs > (txr->tx_avail - 2)) {
+ if (nsegs > (txr->tx_avail - EM_MAX_SCATTER)) {
txr->no_desc_avail++;
bus_dmamap_unload(txr->txtag, map);
return (ENOBUFS);
@@ -2088,23 +2104,23 @@ retry:
** If this is the last descriptor, we want to
** split it so we have a small final sentinel
*/
- if (tso_desc && (j == (nsegs -1)) && (seg_len > 8)) {
- seg_len -= 4;
+ if (tso_desc && (j == (nsegs - 1)) && (seg_len > 8)) {
+ seg_len -= TSO_WORKAROUND;
ctxd->buffer_addr = htole64(seg_addr);
ctxd->lower.data = htole32(
- adapter->txd_cmd | txd_lower | seg_len);
- ctxd->upper.data =
- htole32(txd_upper);
+ adapter->txd_cmd | txd_lower | seg_len);
+ ctxd->upper.data = htole32(txd_upper);
if (++i == adapter->num_tx_desc)
i = 0;
+
/* Now make the sentinel */
- ++txd_used; /* using an extra txd */
+ txr->tx_avail--;
ctxd = &txr->tx_base[i];
tx_buffer = &txr->tx_buffers[i];
ctxd->buffer_addr =
htole64(seg_addr + seg_len);
ctxd->lower.data = htole32(
- adapter->txd_cmd | txd_lower | 4);
+ adapter->txd_cmd | txd_lower | TSO_WORKAROUND);
ctxd->upper.data =
htole32(txd_upper);
last = i;
@@ -2114,8 +2130,7 @@ retry:
ctxd->buffer_addr = htole64(seg_addr);
ctxd->lower.data = htole32(
adapter->txd_cmd | txd_lower | seg_len);
- ctxd->upper.data =
- htole32(txd_upper);
+ ctxd->upper.data = htole32(txd_upper);
last = i;
if (++i == adapter->num_tx_desc)
i = 0;
@@ -2126,8 +2141,6 @@ retry:
txr->next_avail_desc = i;
txr->tx_avail -= nsegs;
- if (tso_desc) /* TSO used an extra for sentinel */
- txr->tx_avail -= txd_used;
tx_buffer->m_head = m_head;
/*
@@ -3030,6 +3043,11 @@ em_setup_interface(device_t dev, struct adapter *adapter)
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
if_setioctlfn(ifp, em_ioctl);
if_setgetcounterfn(ifp, em_get_counter);
+ /* TSO parameters */
+ ifp->if_hw_tsomax = EM_TSO_SIZE;
+ ifp->if_hw_tsomaxsegcount = EM_MAX_SCATTER;
+ ifp->if_hw_tsomaxsegsize = EM_TSO_SEG_SIZE;
+
#ifdef EM_MULTIQUEUE
/* Multiqueue stack interface */
if_settransmitfn(ifp, em_mq_start);
@@ -4514,7 +4532,8 @@ em_initialize_receive_unit(struct adapter *adapter)
(hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
/* Strip the CRC */
- rctl |= E1000_RCTL_SECRC;
+ if (!em_disable_crc_stripping)
+ rctl |= E1000_RCTL_SECRC;
/* Make sure VLAN Filters are off */
rctl &= ~E1000_RCTL_VFE;
@@ -4888,8 +4907,8 @@ em_enable_intr(struct adapter *adapter)
u32 ims_mask = IMS_ENABLE_MASK;
if (hw->mac.type == e1000_82574) {
- E1000_WRITE_REG(hw, EM_EIAC, EM_MSIX_MASK);
- ims_mask |= EM_MSIX_MASK;
+ E1000_WRITE_REG(hw, EM_EIAC, adapter->ims);
+ ims_mask |= adapter->ims;
}
E1000_WRITE_REG(hw, E1000_IMS, ims_mask);
}
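Two techniques recur in the em_xmit() hunks above: m_pullup() is now guarded so already-contiguous headers are not churned, and in the TSO path the driver seeds th_sum with in_pseudo(ip_src, ip_dst, htons(IPPROTO_TCP)) after setting ip_len, since the hardware adds the per-segment payload length itself. A minimal sketch of the pull-up guard, with a hypothetical helper name:

static struct tcphdr *
example_tcp_hdr(struct mbuf **m_headp, int poff)	/* hypothetical helper */
{
	struct mbuf *m = *m_headp;

	/* Pull up only when the first mbuf is too short. */
	if (m->m_len < poff + sizeof(struct tcphdr)) {
		m = m_pullup(m, poff + sizeof(struct tcphdr));
		if (m == NULL) {
			*m_headp = NULL;	/* chain was freed by m_pullup */
			return (NULL);
		}
		*m_headp = m;
	}
	return ((struct tcphdr *)(mtod(m, char *) + poff));
}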
diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h
index be18a6c..8725de3 100644
--- a/sys/dev/e1000/if_em.h
+++ b/sys/dev/e1000/if_em.h
@@ -266,7 +266,7 @@
#define HW_DEBUGOUT1(S, A) if (DEBUG_HW) printf(S "\n", A)
#define HW_DEBUGOUT2(S, A, B) if (DEBUG_HW) printf(S "\n", A, B)
-#define EM_MAX_SCATTER 32
+#define EM_MAX_SCATTER 64
#define EM_VFTA_SIZE 128
#define EM_TSO_SIZE (65535 + sizeof(struct ether_vlan_header))
#define EM_TSO_SEG_SIZE 4096 /* Max dma segment size */
diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 9eacc78..a3ea8d0 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/if_igb.h b/sys/dev/e1000/if_igb.h
index f2d0926..a4222e3 100644
--- a/sys/dev/e1000/if_igb.h
+++ b/sys/dev/e1000/if_igb.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c
index f34010e..7476be5 100644
--- a/sys/dev/e1000/if_lem.c
+++ b/sys/dev/e1000/if_lem.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2012, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -97,7 +97,7 @@
/*********************************************************************
* Legacy Em Driver version:
*********************************************************************/
-char lem_driver_version[] = "1.0.6";
+char lem_driver_version[] = "1.1.0";
/*********************************************************************
* PCI Device ID Table
@@ -2913,10 +2913,6 @@ lem_free_transmit_structures(struct adapter *adapter)
bus_dma_tag_destroy(adapter->txtag);
adapter->txtag = NULL;
}
-#if __FreeBSD_version >= 800000
- if (adapter->br != NULL)
- buf_ring_free(adapter->br, M_DEVBUF);
-#endif
}
/*********************************************************************
diff --git a/sys/dev/e1000/if_lem.h b/sys/dev/e1000/if_lem.h
index 41447d1..4c43bdd 100644
--- a/sys/dev/e1000/if_lem.h
+++ b/sys/dev/e1000/if_lem.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -296,9 +296,6 @@ struct em_int_delay_info {
/* Our adapter structure */
struct adapter {
if_t ifp;
-#if __FreeBSD_version >= 800000
- struct buf_ring *br;
-#endif
struct e1000_hw hw;
/* FreeBSD operating-system-specific structures. */
diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c
index e741d28..6cafdaf 100644
--- a/sys/dev/gpio/gpiobus.c
+++ b/sys/dev/gpio/gpiobus.c
@@ -155,12 +155,16 @@ gpiobus_attach_bus(device_t dev)
int
gpiobus_detach_bus(device_t dev)
{
+ int err;
#ifdef FDT
ofw_gpiobus_unregister_provider(dev);
#endif
+ err = bus_generic_detach(dev);
+ if (err != 0)
+ return (err);
- return (bus_generic_detach(dev));
+ return (device_delete_children(dev));
}
int
@@ -338,11 +342,14 @@ gpiobus_detach(device_t dev)
if ((err = device_get_children(dev, &devlist, &ndevs)) != 0)
return (err);
for (i = 0; i < ndevs; i++) {
- device_delete_child(dev, devlist[i]);
devi = GPIOBUS_IVAR(devlist[i]);
gpiobus_free_ivars(devi);
+ resource_list_free(&devi->rl);
+ free(devi, M_DEVBUF);
+ device_delete_child(dev, devlist[i]);
}
free(devlist, M_TEMP);
+ rman_fini(&sc->sc_intr_rman);
if (sc->sc_pins) {
for (i = 0; i < sc->sc_npins; i++) {
if (sc->sc_pins[i].name != NULL)
@@ -442,7 +449,7 @@ gpiobus_add_child(device_t dev, u_int order, const char *name, int unit)
devi = malloc(sizeof(struct gpiobus_ivar), M_DEVBUF, M_NOWAIT | M_ZERO);
if (devi == NULL) {
device_delete_child(dev, child);
- return (0);
+ return (NULL);
}
resource_list_init(&devi->rl);
device_set_ivars(child, devi);
@@ -461,8 +468,11 @@ gpiobus_hinted_child(device_t bus, const char *dname, int dunit)
child = BUS_ADD_CHILD(bus, 0, dname, dunit);
devi = GPIOBUS_IVAR(child);
resource_int_value(dname, dunit, "pins", &pins);
- if (gpiobus_parse_pins(sc, child, pins))
+ if (gpiobus_parse_pins(sc, child, pins)) {
+ resource_list_free(&devi->rl);
+ free(devi, M_DEVBUF);
device_delete_child(bus, child);
+ }
if (resource_int_value(dname, dunit, "irq", &irq) == 0) {
if (bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1) != 0)
device_printf(bus,
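The gpiobus detach hunks above settle on one teardown order: release each child's ivars and resource list while the child is still attached, delete the child, and only then tear down bus-level state such as the interrupt rman. A hedged sketch with a hypothetical method name:

static int
example_bus_detach(device_t dev)
{
	struct gpiobus_ivar *devi;
	device_t *devlist;
	int error, i, ndevs;

	if ((error = device_get_children(dev, &devlist, &ndevs)) != 0)
		return (error);
	for (i = 0; i < ndevs; i++) {
		devi = GPIOBUS_IVAR(devlist[i]);
		gpiobus_free_ivars(devi);		/* per-child state first */
		resource_list_free(&devi->rl);
		free(devi, M_DEVBUF);
		device_delete_child(dev, devlist[i]);	/* child last */
	}
	free(devlist, M_TEMP);
	return (0);
}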
diff --git a/sys/dev/gpio/gpioled.c b/sys/dev/gpio/gpioled.c
index 01710c2..e699128 100644
--- a/sys/dev/gpio/gpioled.c
+++ b/sys/dev/gpio/gpioled.c
@@ -255,3 +255,4 @@ static driver_t gpioled_driver = {
};
DRIVER_MODULE(gpioled, gpiobus, gpioled_driver, gpioled_devclass, 0, 0);
+MODULE_DEPEND(gpioled, gpiobus, 1, 1, 1);
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index a82d81d..c19f7fe 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -89,6 +89,7 @@
#include <sys/vnode.h>
#include <geom/geom.h>
+#include <geom/geom_int.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -121,9 +122,12 @@ SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
#define MD_ROOT_FSTYPE "ufs"
#endif
-#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
+#if defined(MD_ROOT)
/*
* Preloaded image gets put here.
+ */
+#if defined(MD_ROOT_SIZE)
+/*
* Applications that patch the object with the image can determine
* the size looking at the start and end markers (strings),
* so we want them contiguous.
@@ -135,6 +139,14 @@ static struct {
.start = "MFS Filesystem goes here",
.end = "MFS Filesystem had better STOP here",
};
+const int mfs_root_size = sizeof(mfs_root.start);
+#else
+extern volatile u_char __weak_symbol mfs_root;
+extern volatile u_char __weak_symbol mfs_root_end;
+__GLOBL(mfs_root);
+__GLOBL(mfs_root_end);
+#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
+#endif
#endif
static g_init_t g_md_init;
@@ -1552,6 +1564,9 @@ md_preloaded(u_char *image, size_t length, const char *name)
if (name != NULL) {
printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
MD_NAME, sc->unit, name, length, image);
+ } else {
+ printf("%s%d: Embedded image %zd bytes at %p\n",
+ MD_NAME, sc->unit, length, image);
}
}
@@ -1571,10 +1586,13 @@ g_md_init(struct g_class *mp __unused)
sx_init(&md_sx, "MD config lock");
g_topology_unlock();
md_uh = new_unrhdr(0, INT_MAX, NULL);
-#ifdef MD_ROOT_SIZE
- sx_xlock(&md_sx);
- md_preloaded(mfs_root.start, sizeof(mfs_root.start), NULL);
- sx_xunlock(&md_sx);
+#ifdef MD_ROOT
+ if (mfs_root_size != 0) {
+ sx_xlock(&md_sx);
+ md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
+ NULL);
+ sx_xunlock(&md_sx);
+ }
#endif
/* XXX: are preload_* static or do they need Giant ? */
while ((mod = preload_search_next_name(mod)) != NULL) {
@@ -1660,9 +1678,11 @@ g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
"read-only");
sbuf_printf(sb, "%s<type>%s</type>\n", indent,
type);
- if (mp->type == MD_VNODE && mp->vnode != NULL)
- sbuf_printf(sb, "%s<file>%s</file>\n",
- indent, mp->file);
+ if (mp->type == MD_VNODE && mp->vnode != NULL) {
+ sbuf_printf(sb, "%s<file>", indent);
+ g_conf_printf_escaped(sb, "%s", mp->file);
+ sbuf_printf(sb, "</file>\n");
+ }
}
}
}
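The md.c hunks above replace the compile-time MD_ROOT_SIZE requirement with a link-time one: two weak symbols bracket an optionally embedded image, and when nothing is linked in they resolve to the same address, so the computed size is zero and the preload is skipped at runtime rather than compiled out. A sketch under hypothetical symbol names:

/* Both resolve to the same address when no image is embedded. */
extern volatile u_char __weak_symbol example_img;
extern volatile u_char __weak_symbol example_img_end;
__GLOBL(example_img);
__GLOBL(example_img_end);

static void
example_maybe_preload(void)
{
	size_t size = (uintptr_t)(&example_img_end - &example_img);

	if (size != 0)
		printf("embedded image: %zu bytes at %p\n",
		    size, __DEVOLATILE(u_char *, &example_img));
}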
diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index 2aafba4..0b03931 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <dev/random/fortuna.h>
#else /* !_KERNEL */
#include <inttypes.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -124,9 +125,7 @@ static uint8_t zero_region[RANDOM_ZERO_BLOCKSIZE];
static void random_fortuna_pre_read(void);
static void random_fortuna_read(uint8_t *, u_int);
-static void random_fortuna_write(uint8_t *, u_int);
-static void random_fortuna_reseed(void);
-static int random_fortuna_seeded(void);
+static bool random_fortuna_seeded(void);
static void random_fortuna_process_event(struct harvest_event *);
static void random_fortuna_init_alg(void *);
static void random_fortuna_deinit_alg(void *);
@@ -139,8 +138,6 @@ struct random_algorithm random_alg_context = {
.ra_deinit_alg = random_fortuna_deinit_alg,
.ra_pre_read = random_fortuna_pre_read,
.ra_read = random_fortuna_read,
- .ra_write = random_fortuna_write,
- .ra_reseed = random_fortuna_reseed,
.ra_seeded = random_fortuna_seeded,
.ra_event_processor = random_fortuna_process_event,
.ra_poolcount = RANDOM_FORTUNA_NPOOLS,
@@ -420,43 +417,7 @@ random_fortuna_read(uint8_t *buf, u_int bytecount)
RANDOM_RESEED_UNLOCK();
}
-/* Internal function to hand external entropy to the PRNG. */
-void
-random_fortuna_write(uint8_t *buf, u_int count)
-{
- static u_int destination = 0;
- struct harvest_event event;
- struct randomdev_hash hash;
- uint32_t entropy_data[RANDOM_KEYSIZE_WORDS], timestamp;
- int i;
-
- /* Extra timing here is helpful to scrape scheduler timing entropy */
- randomdev_hash_init(&hash);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_iterate(&hash, buf, count);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_finish(&hash, entropy_data);
- explicit_bzero(&hash, sizeof(hash));
- for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
- event.he_somecounter = (uint32_t)get_cyclecount();
- event.he_size = sizeof(event.he_entropy);
- event.he_bits = event.he_size/8;
- event.he_source = RANDOM_CACHED;
- event.he_destination = destination++; /* Harmless cheating */
- memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
- random_fortuna_process_event(&event);
- }
- explicit_bzero(entropy_data, sizeof(entropy_data));
-}
-
-void
-random_fortuna_reseed(void)
-{
-}
-
-int
+bool
random_fortuna_seeded(void)
{
diff --git a/sys/dev/random/other_algorithm.c b/sys/dev/random/other_algorithm.c
new file mode 100644
index 0000000..740e879
--- /dev/null
+++ b/sys/dev/random/other_algorithm.c
@@ -0,0 +1,209 @@
+/*-
+ * Copyright (c) 2015 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*-
+ * This is a skeleton for folks who wish to build a loadable module
+ * containing an alternative entropy-processing algorithm for random(4).
+ *
+ * The functions below should be completed with the appropriate code,
+ * and the nearby yarrow.c and fortuna.c may be consulted for examples
+ * of working code.
+ *
+ * The author is willing to provide reasonable help to those wishing to
+ * write such a module for themselves. Please use the markm@ FreeBSD
+ * email address, and ensure that you are developing this on a suitably
+ * supported branch (This is currently 11-CURRENT, and will be no
+ * older than 11-STABLE in the future).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/random.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>
+
+#include <crypto/rijndael/rijndael-api-fst.h>
+#include <crypto/sha2/sha2.h>
+
+#include <dev/random/hash.h>
+#include <dev/random/randomdev.h>
+#include <dev/random/random_harvestq.h>
+#include <dev/random/uint128.h>
+#include <dev/random/other_algorithm.h>
+
+static void random_other_pre_read(void);
+static void random_other_read(uint8_t *, u_int);
+static bool random_other_seeded(void);
+static void random_other_process_event(struct harvest_event *);
+static void random_other_init_alg(void *);
+static void random_other_deinit_alg(void *);
+
+/*
+ * RANDOM_OTHER_NPOOLS is used when reading hardware random
+ * number sources to ensure that each pool gets one read sample
+ * per loop iteration. Yarrow has 2 such pools (FAST and SLOW),
+ * and fortuna has 32 (0-31). The RNG used prior to Yarrow and
+ * ported from Linux had just 1 pool.
+ */
+#define RANDOM_OTHER_NPOOLS 1
+
+struct random_algorithm random_alg_context = {
+ .ra_ident = "other",
+ .ra_init_alg = random_other_init_alg,
+ .ra_deinit_alg = random_other_deinit_alg,
+ .ra_pre_read = random_other_pre_read,
+ .ra_read = random_other_read,
+ .ra_seeded = random_other_seeded,
+ .ra_event_processor = random_other_process_event,
+ .ra_poolcount = RANDOM_OTHER_NPOOLS,
+};
+
+/* Use a mutex to protect your reseed variables? */
+static mtx_t other_mtx;
+
+/*
+ * void random_other_init_alg(void *unused __unused)
+ *
+ * Do algorithm-specific initialisation here.
+ */
+void
+random_other_init_alg(void *unused __unused)
+{
+
+ RANDOM_RESEED_INIT_LOCK();
+ /*
+ * Do set-up work here!
+ */
+}
+
+/*
+ * void random_other_deinit_alg(void *unused __unused)
+ *
+ * Do algorithm-specific deinitialisation here.
+ */
+static void
+random_other_deinit_alg(void *unused __unused)
+{
+
+ /*
+ * Do tear-down work here!
+ */
+ RANDOM_RESEED_DEINIT_LOCK();
+}
+
+/*
+ * void random_other_pre_read(void)
+ *
+ * Do any pre-read preparation you need to. This will be called
+ * before >=1 calls to random_other_read() corresponding to one
+ * read(2).
+ *
+ * This routine will be called periodically while the generator is
+ * still blocked and a read is being attempted, giving you an
+ * opportunity to unblock.
+ */
+static void
+random_other_pre_read(void)
+{
+
+ RANDOM_RESEED_LOCK();
+ /*
+ * Do pre-read housekeeping work here!
+ * You may use this as a chance to unblock the generator.
+ */
+ RANDOM_RESEED_UNLOCK();
+}
+
+/*
+ * void random_other_read(uint8_t *buf, u_int count)
+ *
+ * Generate <count> bytes of output into <*buf>.
+ * You may use the fact that <count> will be a multiple of
+ * RANDOM_BLOCKSIZE for optimization purposes.
+ *
+ * This function will always be called with your generator
+ * unblocked and ready. If you are not ready to generate
+ * output here, then feel free to KASSERT() or panic().
+ */
+static void
+random_other_read(uint8_t *buf, u_int count)
+{
+
+ RANDOM_RESEED_LOCK();
+ /*
+ * Do random-number generation work here!
+ */
+ RANDOM_RESEED_UNLOCK();
+}
+
+/*
+ * bool random_other_seeded(void)
+ *
+ * Return true if your generator is ready to generate
+ * output, and false otherwise.
+ */
+static bool
+random_other_seeded(void)
+{
+ bool seeded = false;
+
+ /*
+ * Find out if your generator is seeded here!
+ */
+ return (seeded);
+}
+
+/*
+ * void random_other_process_event(struct harvest_event *event)
+ *
+ * Process one stochastic event <*event> into your entropy
+ * processor.
+ *
+ * The structure of the event may change, so it is easier to
+ * just grab the whole thing into your accumulation system.
+ * You may pick-and-choose bits, but please don't complain
+ * when/if these change.
+ */
+static void
+random_other_process_event(struct harvest_event *event)
+{
+
+ RANDOM_RESEED_LOCK();
+ /*
+ * Do entropy accumulation work here!
+ * You may use this as a chance to unblock the generator.
+ */
+ RANDOM_RESEED_UNLOCK();
+}
diff --git a/sys/dev/random/other_algorithm.h b/sys/dev/random/other_algorithm.h
new file mode 100644
index 0000000..8ca2bb8
--- /dev/null
+++ b/sys/dev/random/other_algorithm.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2015 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * This is a skeleton for folks who wish to build a loadable module
+ * containing an alternative entropy-processing algorithm for random(4).
+ *
+ * The functions below should be completed with the appropriate code,
+ * and the nearby yarrow.c and fortuna.c may be consulted for examples
+ * of working code.
+ *
+ * The author is willing to provide reasonable help to those wishing to
+ * write such a module for themselves. Please use the markm@ FreeBSD
+ * email address, and ensure that you are developing this on a suitably
+ * supported branch (This is currently 11-CURRENT, and will be no
+ * older than 11-STABLE in the future).
+ */
+
+#ifndef SYS_DEV_RANDOM_OTHER_H_INCLUDED
+#define SYS_DEV_RANDOM_OTHER_H_INCLUDED
+
+#ifdef _KERNEL
+typedef struct mtx mtx_t;
+#define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&other_mtx, "reseed mutex", NULL, MTX_DEF)
+#define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&other_mtx)
+#define RANDOM_RESEED_LOCK(x) mtx_lock(&other_mtx)
+#define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&other_mtx)
+#define RANDOM_RESEED_ASSERT_LOCK_OWNED(x) mtx_assert(&other_mtx, MA_OWNED)
+#else
+#define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&other_mtx, mtx_plain)
+#define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&other_mtx)
+#define RANDOM_RESEED_LOCK(x) mtx_lock(&other_mtx)
+#define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&other_mtx)
+#define RANDOM_RESEED_ASSERT_LOCK_OWNED(x)
+#endif
+
+#endif /* SYS_DEV_RANDOM_OTHER_H_INCLUDED */
diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c
index 34a809b..255136c 100644
--- a/sys/dev/random/random_harvestq.c
+++ b/sys/dev/random/random_harvestq.c
@@ -47,12 +47,21 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/unistd.h>
+#if defined(RANDOM_LOADABLE)
+#include <sys/lock.h>
+#include <sys/sx.h>
+#endif
+
+#include <machine/atomic.h>
#include <machine/cpu.h>
#include <dev/random/randomdev.h>
#include <dev/random/random_harvestq.h>
static void random_kthread(void);
+static void random_sources_feed(void);
+
+static u_int read_rate;
/* List for the dynamic sysctls */
static struct sysctl_ctx_list random_clist;
@@ -66,7 +75,7 @@ static struct sysctl_ctx_list random_clist;
#define RANDOM_RING_MAX 1024
#define RANDOM_ACCUM_MAX 8
-/* 1 to let the kernel thread run, 0 to terminate */
+/* 1 to let the kernel thread run, 0 to terminate, -1 to mark completion */
volatile int random_kthread_control;
/*
@@ -123,13 +132,18 @@ static struct kproc_desc random_proc_kp = {
&harvest_context.hc_kthread_proc,
};
-
/* Pass the given event straight through to Fortuna/Yarrow/Whatever. */
static __inline void
random_harvestq_fast_process_event(struct harvest_event *event)
{
- if (random_alg_context.ra_event_processor)
- random_alg_context.ra_event_processor(event);
+#if defined(RANDOM_LOADABLE)
+ RANDOM_CONFIG_S_LOCK();
+ if (p_random_alg_context)
+#endif
+ p_random_alg_context->ra_event_processor(event);
+#if defined(RANDOM_LOADABLE)
+ RANDOM_CONFIG_S_UNLOCK();
+#endif
}
static void
@@ -163,12 +177,58 @@ random_kthread(void)
/* XXX: FIX!! This is a *great* place to pass hardware/live entropy to random(9) */
tsleep_sbt(&harvest_context.hc_kthread_proc, 0, "-", SBT_1S/10, 0, C_PREL(1));
}
+ random_kthread_control = -1;
wakeup(&harvest_context.hc_kthread_proc);
kproc_exit(0);
/* NOTREACHED */
}
+/* This happens well after SI_SUB_RANDOM */
SYSINIT(random_device_h_proc, SI_SUB_CREATE_INIT, SI_ORDER_ANY, kproc_start, &random_proc_kp);
+/*
+ * Run through all fast sources reading entropy for the given
+ * number of rounds, which should be a multiple of the number
+ * of entropy accumulation pools in use; 2 for Yarrow and 32
+ * for Fortuna.
+ */
+static void
+random_sources_feed(void)
+{
+ uint32_t entropy[HARVESTSIZE];
+ struct random_sources *rrs;
+ u_int i, n, local_read_rate;
+
+ /*
+ * Step over all of live entropy sources, and feed their output
+ * to the system-wide RNG.
+ */
+#if defined(RANDOM_LOADABLE)
+ RANDOM_CONFIG_S_LOCK();
+ if (p_random_alg_context) {
+ /* It's an indenting error. Yeah, Yeah. */
+#endif
+ local_read_rate = atomic_readandclear_32(&read_rate);
+ LIST_FOREACH(rrs, &source_list, rrs_entries) {
+ for (i = 0; i < p_random_alg_context->ra_poolcount*(local_read_rate + 1); i++) {
+ n = rrs->rrs_source->rs_read(entropy, sizeof(entropy));
+ KASSERT((n > 0 && n <= sizeof(entropy)), ("very bad return from rs_read (= %d) in %s", n, __func__));
+ random_harvest_direct(entropy, n, (n*8)/2, rrs->rrs_source->rs_source);
+ }
+ }
+ explicit_bzero(entropy, sizeof(entropy));
+#if defined(RANDOM_LOADABLE)
+ }
+ RANDOM_CONFIG_S_UNLOCK();
+#endif
+}
+
+void
+read_rate_increment(u_int chunk)
+{
+
+ atomic_add_32(&read_rate, chunk);
+}
+
/* ARGSUSED */
RANDOM_CHECK_UINT(harvestmask, 0, RANDOM_HARVEST_EVERYTHING_MASK);
@@ -317,7 +377,8 @@ random_harvestq_deinit(void *unused __unused)
/* Command the hash/reseed thread to end and wait for it to finish */
random_kthread_control = 0;
- tsleep(&harvest_context.hc_kthread_proc, 0, "harvqterm", 0);
+ while (random_kthread_control >= 0)
+ tsleep(&harvest_context.hc_kthread_proc, 0, "harvqterm", hz/5);
sysctl_ctx_free(&random_clist);
}
SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_SECOND, random_harvestq_deinit, NULL);
@@ -412,3 +473,5 @@ random_harvest_direct(const void *entropy, u_int size, u_int bits, enum random_e
random_harvestq_fast_process_event(&event);
explicit_bzero(&event, sizeof(event));
}
+
+MODULE_VERSION(random_harvestq, 1);
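The random_harvestq.c hunks above also close a shutdown race with a three-state handshake: the controller writes 0, the kthread acknowledges with -1 before exiting, and the controller polls with a timeout instead of sleeping once and possibly missing the wakeup. A standalone sketch of the protocol, with hypothetical names:

static volatile int example_control = 1;	/* 1 run, 0 stop, -1 done */

static void
example_kthread(void)
{
	while (example_control > 0) {
		/* ... periodic harvesting work ... */
		tsleep_sbt(&example_control, 0, "-", SBT_1S / 10, 0,
		    C_PREL(1));
	}
	example_control = -1;		/* acknowledge termination */
	wakeup(&example_control);
	kproc_exit(0);
}

static void
example_shutdown(void)
{
	example_control = 0;
	while (example_control >= 0)	/* poll until acknowledged */
		tsleep(&example_control, 0, "exmterm", hz / 5);
}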
diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h
index f1de86f..421b592 100644
--- a/sys/dev/random/random_harvestq.h
+++ b/sys/dev/random/random_harvestq.h
@@ -43,6 +43,8 @@ struct harvest_event {
uint8_t he_source; /* origin of the entropy */
} __packed;
+void read_rate_increment(u_int);
+
#define RANDOM_HARVESTQ_BOOT_ENTROPY_FILE "/boot/entropy"
#define RANDOM_HARVEST_INIT_LOCK(x) mtx_init(&harvest_context.hc_mtx, "entropy harvest mutex", NULL, MTX_SPIN)
diff --git a/sys/dev/random/random_infra.c b/sys/dev/random/random_infra.c
new file mode 100644
index 0000000..d31b84b
--- /dev/null
+++ b/sys/dev/random/random_infra.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2015 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/random.h>
+#include <sys/sysctl.h>
+
+#if defined(RANDOM_LOADABLE)
+#include <sys/lock.h>
+#include <sys/sx.h>
+#endif
+
+#include <dev/random/randomdev.h>
+
+/* Set up the sysctl root node for the entropy device */
+SYSCTL_NODE(_kern, OID_AUTO, random, CTLFLAG_RW, 0, "Cryptographically Secure Random Number Generator");
+
+MALLOC_DEFINE(M_ENTROPY, "entropy", "Entropy harvesting buffers and data structures");
+
+struct sources_head source_list = LIST_HEAD_INITIALIZER(source_list);
+
+#if defined(RANDOM_LOADABLE)
+struct random_algorithm *p_random_alg_context = NULL;
+#else /* !defined(RANDOM_LOADABLE) */
+struct random_algorithm *p_random_alg_context = &random_alg_context;
+#endif /* defined(RANDOM_LOADABLE) */
+
+#if defined(RANDOM_LOADABLE)
+
+struct random_readers {
+ int (*read_random_uio)(struct uio *, bool);
+ u_int (*read_random)(void *, u_int);
+} random_reader_context = {
+ (int (*)(struct uio *, bool))nullop,
+ (u_int (*)(void *, u_int))nullop,
+};
+
+struct sx randomdev_config_lock;
+
+static void
+random_infra_sysinit(void *dummy __unused)
+{
+
+ RANDOM_CONFIG_INIT_LOCK();
+}
+SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_FIRST, random_infra_sysinit, NULL);
+
+void
+random_infra_init(int (*p_random_read_uio)(struct uio *, bool), u_int (*p_random_read)(void *, u_int))
+{
+
+ RANDOM_CONFIG_X_LOCK();
+ random_reader_context.read_random_uio = p_random_read_uio;
+ random_reader_context.read_random = p_random_read;
+ RANDOM_CONFIG_X_UNLOCK();
+}
+
+void
+random_infra_uninit(void)
+{
+
+ RANDOM_CONFIG_X_LOCK();
+ random_reader_context.read_random_uio = (int (*)(struct uio *, bool))nullop;
+ random_reader_context.read_random = (u_int (*)(void *, u_int))nullop;
+ RANDOM_CONFIG_X_UNLOCK();
+}
+
+static void
+random_infra_sysuninit(void *dummy __unused)
+{
+
+ RANDOM_CONFIG_DEINIT_LOCK();
+}
+SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_FIRST, random_infra_sysuninit, NULL);
+
+int
+read_random_uio(struct uio *uio, bool nonblock)
+{
+ int retval;
+
+ RANDOM_CONFIG_S_LOCK();
+ retval = random_reader_context.read_random_uio(uio, nonblock);
+ RANDOM_CONFIG_S_UNLOCK();
+ return (retval);
+}
+
+u_int
+read_random(void *buf, u_int len)
+{
+ u_int retval;
+
+ RANDOM_CONFIG_S_LOCK();
+ retval = random_reader_context.read_random(buf, len);
+ RANDOM_CONFIG_S_UNLOCK();
+ return (retval);
+}
+
+#endif /* defined(RANDOM_LOADABLE) */
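For illustration only, a minimal sketch of how a loadable algorithm would use these hooks; the example_* names are invented, not part of this patch (the in-tree caller is random_alg_context_ra_init_alg()/..._deinit_alg() in randomdev.c below).

    /* Hypothetical loadable-algorithm glue; example_* names are invented. */
    static int example_read_uio(struct uio *, bool);
    static u_int example_read(void *, u_int);

    static void
    example_alg_init(void *unused __unused)
    {
        /* Publish our readers; consumers keep calling read_random*(). */
        random_infra_init(example_read_uio, example_read);
    }

    static void
    example_alg_deinit(void *unused __unused)
    {
        /* Revert the readers to the nullop stubs. */
        random_infra_uninit();
    }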
diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c
index 5c20c5d..f20a462 100644
--- a/sys/dev/random/randomdev.c
+++ b/sys/dev/random/randomdev.c
@@ -56,14 +56,18 @@ __FBSDID("$FreeBSD$");
#include <dev/random/randomdev.h>
#include <dev/random/random_harvestq.h>
-#include "opt_random.h"
+#define RANDOM_UNIT 0
-#if defined(RANDOM_DUMMY) && defined(RANDOM_YARROW)
-#error "Cannot define both RANDOM_DUMMY and RANDOM_YARROW"
+#if defined(RANDOM_LOADABLE)
+#define READ_RANDOM_UIO _read_random_uio
+#define READ_RANDOM _read_random
+static int READ_RANDOM_UIO(struct uio *, bool);
+static u_int READ_RANDOM(void *, u_int);
+#else
+#define READ_RANDOM_UIO read_random_uio
+#define READ_RANDOM read_random
#endif
-#define RANDOM_UNIT 0
-
/* Return the smallest number >= x that is a multiple of m */
#define CEIL_TO_MULTIPLE(x, m) ((((x) + (m) - 1)/(m))*(m))
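For example, CEIL_TO_MULTIPLE(5, 4) = ((5 + 3)/4)*4 = 8, while CEIL_TO_MULTIPLE(8, 4) = 8 already.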
@@ -84,68 +88,31 @@ static struct cdevsw random_cdevsw = {
/* For use with make_dev(9)/destroy_dev(9). */
static struct cdev *random_dev;
-/* Set up the sysctl root node for the entropy device */
-SYSCTL_NODE(_kern, OID_AUTO, random, CTLFLAG_RW, 0, "Cryptographically Secure Random Number Generator");
-
-MALLOC_DEFINE(M_ENTROPY, "entropy", "Entropy harvesting buffers and data structures");
-
-#if defined(RANDOM_DUMMY)
-
-/*-
- * Dummy "always block" pseudo algorithm, used when there is no real
- * random(4) driver to provide a CSPRNG.
- */
-
-static u_int
-dummy_random_zero(void)
-{
-
- return (0);
-}
-
-static void
-dummy_random(void)
-{
-}
-
-struct random_algorithm random_alg_context = {
- .ra_ident = "Dummy",
- .ra_init_alg = NULL,
- .ra_deinit_alg = NULL,
- .ra_pre_read = dummy_random,
- .ra_read = (random_alg_read_t *)dummy_random_zero,
- .ra_write = (random_alg_write_t *)dummy_random_zero,
- .ra_reseed = dummy_random,
- .ra_seeded = (random_alg_seeded_t *)dummy_random_zero,
- .ra_event_processor = NULL,
- .ra_poolcount = 0,
-};
-
-#else /* !defined(RANDOM_DUMMY) */
-
-LIST_HEAD(sources_head, random_sources);
-static struct sources_head source_list = LIST_HEAD_INITIALIZER(source_list);
-static u_int read_rate;
-
static void
random_alg_context_ra_init_alg(void *data)
{
- random_alg_context.ra_init_alg(data);
+ p_random_alg_context = &random_alg_context;
+ p_random_alg_context->ra_init_alg(data);
+#if defined(RANDOM_LOADABLE)
+ random_infra_init(READ_RANDOM_UIO, READ_RANDOM);
+#endif
}
static void
random_alg_context_ra_deinit_alg(void *data)
{
- random_alg_context.ra_deinit_alg(data);
+#if defined(RANDOM_LOADABLE)
+ random_infra_uninit();
+#endif
+ p_random_alg_context->ra_deinit_alg(data);
+ p_random_alg_context = NULL;
}
SYSINIT(random_device, SI_SUB_RANDOM, SI_ORDER_THIRD, random_alg_context_ra_init_alg, NULL);
SYSUNINIT(random_device, SI_SUB_RANDOM, SI_ORDER_THIRD, random_alg_context_ra_deinit_alg, NULL);
-#endif /* defined(RANDOM_DUMMY) */
-
static struct selinfo rsel;
/*
@@ -156,28 +123,28 @@ static int
randomdev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
- return (read_random_uio(uio, (flags & O_NONBLOCK) != 0));
+ return (READ_RANDOM_UIO(uio, (flags & O_NONBLOCK) != 0));
}
int
-read_random_uio(struct uio *uio, bool nonblock)
+READ_RANDOM_UIO(struct uio *uio, bool nonblock)
{
uint8_t *random_buf;
int error, spamcount;
ssize_t read_len, total_read, c;
random_buf = malloc(PAGE_SIZE, M_ENTROPY, M_WAITOK);
- random_alg_context.ra_pre_read();
+ p_random_alg_context->ra_pre_read();
error = 0;
spamcount = 0;
/* (Un)Blocking logic */
- while (!random_alg_context.ra_seeded()) {
+ while (!p_random_alg_context->ra_seeded()) {
if (nonblock) {
error = EWOULDBLOCK;
break;
}
/* keep tapping away at the pre-read until we seed/unblock. */
- random_alg_context.ra_pre_read();
+ p_random_alg_context->ra_pre_read();
/* Only bother the console every 10 seconds or so */
if (spamcount == 0)
printf("random: %s unblock wait\n", __func__);
@@ -187,10 +154,7 @@ read_random_uio(struct uio *uio, bool nonblock)
break;
}
if (error == 0) {
-#if !defined(RANDOM_DUMMY)
- /* XXX: FIX!! Next line as an atomic operation? */
- read_rate += (uio->uio_resid + sizeof(uint32_t))/sizeof(uint32_t);
-#endif
+ read_rate_increment((uio->uio_resid + sizeof(uint32_t))/sizeof(uint32_t));
total_read = 0;
while (uio->uio_resid && !error) {
read_len = uio->uio_resid;
@@ -203,7 +167,7 @@ read_random_uio(struct uio *uio, bool nonblock)
read_len = CEIL_TO_MULTIPLE(read_len, RANDOM_BLOCKSIZE);
/* Work in chunks page-sized or less */
read_len = MIN(read_len, PAGE_SIZE);
- random_alg_context.ra_read(random_buf, read_len);
+ p_random_alg_context->ra_read(random_buf, read_len);
c = MIN(uio->uio_resid, read_len);
error = uiomove(random_buf, c, uio);
total_read += c;
@@ -224,19 +188,16 @@ read_random_uio(struct uio *uio, bool nonblock)
* RANDOM_BLOCKSIZE bytes.
*/
u_int
-read_random(void *random_buf, u_int len)
+READ_RANDOM(void *random_buf, u_int len)
{
u_int read_len;
uint8_t local_buf[len + RANDOM_BLOCKSIZE];
KASSERT(random_buf != NULL, ("No suitable random buffer in %s", __func__));
- random_alg_context.ra_pre_read();
+ p_random_alg_context->ra_pre_read();
/* (Un)Blocking logic; if not seeded, return nothing. */
- if (random_alg_context.ra_seeded()) {
-#if !defined(RANDOM_DUMMY)
- /* XXX: FIX!! Next line as an atomic operation? */
- read_rate += (len + sizeof(uint32_t))/sizeof(uint32_t);
-#endif
+ if (p_random_alg_context->ra_seeded()) {
+ read_rate_increment((len + sizeof(uint32_t))/sizeof(uint32_t));
if (len > 0) {
/*
* Belt-and-braces.
@@ -244,7 +205,7 @@ read_random(void *random_buf, u_int len)
* which is what the underlying generator is expecting.
*/
read_len = CEIL_TO_MULTIPLE(len, RANDOM_BLOCKSIZE);
- random_alg_context.ra_read(local_buf, read_len);
+ p_random_alg_context->ra_read(local_buf, read_len);
memcpy(random_buf, local_buf, len);
}
} else
@@ -252,6 +213,37 @@ read_random(void *random_buf, u_int len)
return (len);
}
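A hypothetical caller, as a minimal sketch (the buffer name and size are illustrative): per the comment above, an unseeded generator returns nothing, so the return value must be checked.

    uint8_t key[32];

    if (read_random(key, sizeof(key)) != sizeof(key)) {
        /* Generator not seeded yet; handle the shortfall. */
    }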
+static __inline void
+randomdev_accumulate(uint8_t *buf, u_int count)
+{
+ static u_int destination = 0;
+ static struct harvest_event event;
+ static struct randomdev_hash hash;
+ static uint32_t entropy_data[RANDOM_KEYSIZE_WORDS];
+ uint32_t timestamp;
+ int i;
+
+ /* Extra timing here is helpful to scrape scheduler jitter entropy */
+ randomdev_hash_init(&hash);
+ timestamp = (uint32_t)get_cyclecount();
+ randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
+ randomdev_hash_iterate(&hash, buf, count);
+ timestamp = (uint32_t)get_cyclecount();
+ randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
+ randomdev_hash_finish(&hash, entropy_data);
+ explicit_bzero(&hash, sizeof(hash));
+ for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
+ event.he_somecounter = (uint32_t)get_cyclecount();
+ event.he_size = sizeof(event.he_entropy);
+ event.he_bits = event.he_size/8;
+ event.he_source = RANDOM_CACHED;
+ event.he_destination = destination++; /* Harmless cheating */
+ memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
+ p_random_alg_context->ra_event_processor(&event);
+ }
+ explicit_bzero(entropy_data, sizeof(entropy_data));
+}
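Design note: with ra_write and ra_reseed dropped from struct random_algorithm (see the randomdev.h hunk below), writes to /dev/random are no longer handed straight to the algorithm. randomdev_accumulate() hashes the user data bracketed by cyclecounter timestamps and injects the digest as ordinary RANDOM_CACHED harvest events, so untrusted input is accumulated like any other source rather than trusted directly.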
+
/* ARGSUSED */
static int
randomdev_write(struct cdev *dev __unused, struct uio *uio, int flags __unused)
@@ -267,7 +259,7 @@ randomdev_write(struct cdev *dev __unused, struct uio *uio, int flags __unused)
error = uiomove(random_buf, c, uio);
if (error)
break;
- random_alg_context.ra_write(random_buf, c);
+ randomdev_accumulate(random_buf, c);
tsleep(&random_alg_context, 0, "randwr", hz/10);
}
if (nbytes != uio->uio_resid && (error == ERESTART || error == EINTR))
@@ -283,7 +275,7 @@ randomdev_poll(struct cdev *dev __unused, int events, struct thread *td __unused
{
if (events & (POLLIN | POLLRDNORM)) {
- if (random_alg_context.ra_seeded())
+ if (p_random_alg_context->ra_seeded())
events &= (POLLIN | POLLRDNORM);
else
selrecord(td, &rsel);
@@ -325,9 +317,6 @@ randomdev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr __unused,
void
random_source_register(struct random_source *rsource)
{
-#if defined(RANDOM_DUMMY)
- (void)rsource;
-#else /* !defined(RANDOM_DUMMY) */
struct random_sources *rrs;
KASSERT(rsource != NULL, ("invalid input to %s", __func__));
@@ -337,15 +326,11 @@ random_source_register(struct random_source *rsource)
printf("random: registering fast source %s\n", rsource->rs_ident);
LIST_INSERT_HEAD(&source_list, rrs, rrs_entries);
-#endif /* defined(RANDOM_DUMMY) */
}
void
random_source_deregister(struct random_source *rsource)
{
-#if defined(RANDOM_DUMMY)
- (void)rsource;
-#else /* !defined(RANDOM_DUMMY) */
struct random_sources *rrs = NULL;
KASSERT(rsource != NULL, ("invalid input to %s", __func__));
@@ -356,41 +341,6 @@ random_source_deregister(struct random_source *rsource)
}
if (rrs != NULL)
free(rrs, M_ENTROPY);
-#endif /* defined(RANDOM_DUMMY) */
-}
-
-#if !defined(RANDOM_DUMMY)
-/*
- * Run through all fast sources reading entropy for the given
- * number of rounds, which should be a multiple of the number
- * of entropy accumulation pools in use; 2 for Yarrow and 32
- * for Fortuna.
- *
- * BEWARE!!!
- * This function runs inside the RNG thread! Don't do anything silly!
- */
-void
-random_sources_feed(void)
-{
- uint32_t entropy[HARVESTSIZE];
- struct random_sources *rrs;
- u_int i, n, local_read_rate;
-
- /*
- * Step over all of live entropy sources, and feed their output
- * to the system-wide RNG.
- */
- /* XXX: FIX!! Next lines as an atomic operation? */
- local_read_rate = read_rate;
- read_rate = RANDOM_ALG_READ_RATE_MINIMUM;
- LIST_FOREACH(rrs, &source_list, rrs_entries) {
- for (i = 0; i < random_alg_context.ra_poolcount*local_read_rate; i++) {
- n = rrs->rrs_source->rs_read(entropy, sizeof(entropy));
- KASSERT((n > 0 && n <= sizeof(entropy)), ("very bad return from rs_read (= %d) in %s", n, __func__));
- random_harvest_direct(entropy, n, (n*8)/2, rrs->rrs_source->rs_source);
- }
- }
- explicit_bzero(entropy, sizeof(entropy));
}
static int
@@ -414,7 +364,6 @@ random_source_handler(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern_random, OID_AUTO, random_sources, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, random_source_handler, "A",
"List of active fast entropy sources.");
-#endif /* !defined(RANDOM_DUMMY) */
/* ARGSUSED */
static int
@@ -449,3 +398,5 @@ static moduledata_t randomdev_mod = {
DECLARE_MODULE(random_device, randomdev_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_VERSION(random_device, 1);
+MODULE_DEPEND(random_device, crypto, 1, 1, 1);
+MODULE_DEPEND(random_device, random_harvestq, 1, 1, 1);
diff --git a/sys/dev/random/randomdev.h b/sys/dev/random/randomdev.h
index 799efb1..0f3b359 100644
--- a/sys/dev/random/randomdev.h
+++ b/sys/dev/random/randomdev.h
@@ -55,16 +55,15 @@ random_check_uint_##name(SYSCTL_HANDLER_ARGS) \
MALLOC_DECLARE(M_ENTROPY);
-#define RANDOM_ALG_READ_RATE_MINIMUM 32
-
#endif /* _KERNEL */
struct harvest_event;
+typedef void random_alg_init_t(void *);
+typedef void random_alg_deinit_t(void *);
typedef void random_alg_pre_read_t(void);
typedef void random_alg_read_t(uint8_t *, u_int);
-typedef void random_alg_write_t(uint8_t *, u_int);
-typedef int random_alg_seeded_t(void);
+typedef bool random_alg_seeded_t(void);
typedef void random_alg_reseed_t(void);
typedef void random_alg_eventprocessor_t(struct harvest_event *);
@@ -81,13 +80,11 @@ struct random_algorithm {
void (*ra_deinit_alg)(void *);
random_alg_pre_read_t *ra_pre_read;
random_alg_read_t *ra_read;
- random_alg_write_t *ra_write;
- random_alg_reseed_t *ra_reseed;
random_alg_seeded_t *ra_seeded;
random_alg_eventprocessor_t *ra_event_processor;
};
-extern struct random_algorithm random_alg_context;
+extern struct random_algorithm random_alg_context, *p_random_alg_context;
#ifdef _KERNEL
@@ -97,22 +94,33 @@ extern struct random_algorithm random_alg_context;
* upon request.
*/
struct random_source {
- const char *rs_ident;
- enum random_entropy_source rs_source;
- random_source_read_t *rs_read;
+ const char *rs_ident;
+ enum random_entropy_source rs_source;
+ random_source_read_t *rs_read;
};
-#if !defined(RANDOM_DUMMY)
struct random_sources {
- LIST_ENTRY(random_sources) rrs_entries;
- struct random_source *rrs_source;
+ LIST_ENTRY(random_sources) rrs_entries;
+ struct random_source *rrs_source;
};
-#endif /* !defined(RANDOM_DUMMY) */
+
+LIST_HEAD(sources_head, random_sources);
+extern struct sources_head source_list;
void random_source_register(struct random_source *);
void random_source_deregister(struct random_source *);
-void random_sources_feed(void);
+#if defined(RANDOM_LOADABLE)
+extern struct sx randomdev_config_lock;
+#define RANDOM_CONFIG_INIT_LOCK(x) sx_init(&randomdev_config_lock, "configuration change lock")
+#define RANDOM_CONFIG_X_LOCK(x) sx_xlock(&randomdev_config_lock)
+#define RANDOM_CONFIG_X_UNLOCK(x) sx_xunlock(&randomdev_config_lock)
+#define RANDOM_CONFIG_S_LOCK(x) sx_slock(&randomdev_config_lock)
+#define RANDOM_CONFIG_S_UNLOCK(x) sx_sunlock(&randomdev_config_lock)
+#define RANDOM_CONFIG_DEINIT_LOCK(x) sx_destroy(&randomdev_config_lock)
+void random_infra_init(int (*)(struct uio *, bool), u_int (*)(void *, u_int));
+void random_infra_uninit(void);
+#endif
#endif /* _KERNEL */
diff --git a/sys/dev/random/randomdev_none.c b/sys/dev/random/randomdev_none.c
deleted file mode 100644
index ee5cbf2..0000000
--- a/sys/dev/random/randomdev_none.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*-
- * Copyright (c) 2015 Mark R V Murray
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer
- * in this position and unchanged.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/malloc.h>
-#include <sys/random.h>
-#include <sys/systm.h>
-
-#include <dev/random/randomdev.h>
-
-#include "opt_random.h"
-
-#if defined(RANDOM_DUMMY) || defined(RANDOM_YARROW)
-#error "Cannot define any of RANDOM_DUMMY and RANDOM_YARROW without 'device random'"
-#endif
-
-/*-
- * Dummy "not even here" device. Stub out all routines that the kernel would need.
- */
-
-/* ARGSUSED */
-u_int
-read_random(void *random_buf __unused, u_int len __unused)
-{
-
- return (0);
-}
-
-/* ARGSUSED */
-void
-random_harvest_direct(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused)
-{
-}
-
-/* ARGSUSED */
-void
-random_harvest_queue(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused)
-{
-}
-
-/* ARGSUSED */
-void
-random_harvest_fast(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused)
-{
-}
diff --git a/sys/dev/random/unit_test.c b/sys/dev/random/unit_test.c
index 7ae5716..fac4c8d 100644
--- a/sys/dev/random/unit_test.c
+++ b/sys/dev/random/unit_test.c
@@ -46,6 +46,7 @@ Where <alg> is YARROW or FORTUNA.
#include <sys/types.h>
#include <inttypes.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <threads.h>
@@ -172,35 +173,6 @@ RunHarvester(void *arg __unused)
}
static int
-WriteCSPRNG(void *threadid)
-{
- uint8_t *buf;
- int i;
-
- printf("Thread #1 starts\n");
-
- for (i = 0; ; i++) {
- if (stopseeding)
- break;
- buf = malloc(4096);
- if (i % 1000 == 0)
- printf("Thread write 1 - %d\n", i);
- if (buf != NULL) {
- printf("Thread 1 writing.\n");
- random_alg_context.ra_write(buf, i);
- free(buf);
- }
- usleep(1000000);
- }
-
- printf("Thread #1 ends\n");
-
- thrd_exit(0);
-
- return (0);
-}
-
-static int
ReadCSPRNG(void *threadid)
{
size_t tid, zsize;
@@ -271,7 +243,7 @@ main(int argc, char *argv[])
for (t = 0; t < NUM_THREADS; t++) {
printf("In main: creating thread %ld\n", t);
- rc = thrd_create(&threads[t], (t == 0 ? RunHarvester : (t == 1 ? WriteCSPRNG : ReadCSPRNG)), NULL);
+ rc = thrd_create(&threads[t], (t == 0 ? RunHarvester : ReadCSPRNG), NULL);
if (rc != thrd_success) {
printf("ERROR; return code from thrd_create() is %d\n", rc);
exit(-1);
diff --git a/sys/dev/random/yarrow.c b/sys/dev/random/yarrow.c
index d6ebd46..2ef15a4 100644
--- a/sys/dev/random/yarrow.c
+++ b/sys/dev/random/yarrow.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <dev/random/yarrow.h>
#else /* !_KERNEL */
#include <inttypes.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
@@ -92,7 +93,7 @@ static struct yarrow_state {
u_int ysp_thresh; /* pool reseed threshold */
struct randomdev_hash ysp_hash; /* accumulated entropy */
} ys_pool[RANDOM_YARROW_NPOOLS];/* pool[0] is fast, pool[1] is slow */
- int ys_seeded;
+ bool ys_seeded;
/* Reseed lock */
mtx_t ys_mtx;
} yarrow_state;
@@ -108,9 +109,7 @@ RANDOM_CHECK_UINT(slowoverthresh, 1, 5);
static void random_yarrow_pre_read(void);
static void random_yarrow_read(uint8_t *, u_int);
-static void random_yarrow_write(uint8_t *, u_int);
-static void random_yarrow_reseed(void);
-static int random_yarrow_seeded(void);
+static bool random_yarrow_seeded(void);
static void random_yarrow_process_event(struct harvest_event *);
static void random_yarrow_init_alg(void *);
static void random_yarrow_deinit_alg(void *);
@@ -123,8 +122,6 @@ struct random_algorithm random_alg_context = {
.ra_deinit_alg = random_yarrow_deinit_alg,
.ra_pre_read = random_yarrow_pre_read,
.ra_read = random_yarrow_read,
- .ra_write = random_yarrow_write,
- .ra_reseed = random_yarrow_reseed,
.ra_seeded = random_yarrow_seeded,
.ra_event_processor = random_yarrow_process_event,
.ra_poolcount = RANDOM_YARROW_NPOOLS,
@@ -141,7 +138,7 @@ random_yarrow_init_alg(void *unused __unused)
RANDOM_RESEED_INIT_LOCK();
/* Start unseeded, therefore blocked. */
- yarrow_state.ys_seeded = 0;
+ yarrow_state.ys_seeded = false;
#ifdef _KERNEL
/*
* Yarrow parameters. Do not adjust these unless you have
@@ -266,12 +263,14 @@ random_yarrow_reseed_internal(u_int fastslow)
RANDOM_RESEED_ASSERT_LOCK_OWNED();
#ifdef RANDOM_DEBUG
/* WARNING! This is dangerously tedious to do with mutexes held! */
- printf("random: %s %s seeded = %d\n", __func__, (fastslow == RANDOM_YARROW_FAST ? "RANDOM_YARROW_FAST" : "RANDOM_YARROW_SLOW"), yarrow_state.ys_seeded);
- printf("random: %s - fast - thresh %d,1 - ", __func__, yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_thresh);
+ printf("random: %s ", __func__);
+ printf("type/pool = %s ", fastslow == RANDOM_YARROW_FAST ? "RANDOM_YARROW_FAST" : "RANDOM_YARROW_SLOW");
+ printf("seeded = %s\n", yarrow_state.ys_seeded ? "true" : "false");
+ printf("random: fast - thresh %d,1 - ", yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_thresh);
for (i = RANDOM_START; i < ENTROPYSOURCE; i++)
printf(" %d", yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_source_bits[i]);
printf("\n");
- printf("random: %s - slow - thresh %d,%d - ", __func__, yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_thresh, yarrow_state.ys_slowoverthresh);
+ printf("random: slow - thresh %d,%d - ", yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_thresh, yarrow_state.ys_slowoverthresh);
for (i = RANDOM_START; i < ENTROPYSOURCE; i++)
printf(" %d", yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_source_bits[i]);
printf("\n");
@@ -338,7 +337,7 @@ random_yarrow_reseed_internal(u_int fastslow)
#endif
/* Unblock the device if it was blocked due to being unseeded */
if (!yarrow_state.ys_seeded) {
- yarrow_state.ys_seeded = 1;
+ yarrow_state.ys_seeded = true;
randomdev_unblock();
}
}
@@ -395,47 +394,7 @@ random_yarrow_read(uint8_t *buf, u_int bytecount)
RANDOM_RESEED_UNLOCK();
}
-/* Internal function to hand external entropy to the PRNG. */
-void
-random_yarrow_write(uint8_t *buf, u_int count)
-{
- static u_int destination = 0;
- static struct harvest_event event;
- struct randomdev_hash hash;
- uint32_t entropy_data[RANDOM_KEYSIZE_WORDS], timestamp;
- int i;
-
- /* Extra timing here is helpful to scrape scheduler timing entropy */
- randomdev_hash_init(&hash);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_iterate(&hash, buf, count);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_finish(&hash, entropy_data);
- explicit_bzero(&hash, sizeof(hash));
- for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
- event.he_somecounter = (uint32_t)get_cyclecount();
- event.he_size = sizeof(event.he_entropy);
- event.he_bits = event.he_size/8;
- event.he_source = RANDOM_CACHED;
- event.he_destination = destination++; /* Harmless cheating */
- memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
- random_yarrow_process_event(&event);
- }
- explicit_bzero(entropy_data, sizeof(entropy_data));
-}
-
-void
-random_yarrow_reseed(void)
-{
-
- RANDOM_RESEED_LOCK();
- random_yarrow_reseed_internal(RANDOM_YARROW_SLOW);
- RANDOM_RESEED_UNLOCK();
-}
-
-int
+bool
random_yarrow_seeded(void)
{
diff --git a/sys/dev/usb/controller/dwc_otg.c b/sys/dev/usb/controller/dwc_otg.c
index bd3e51b..e018ab5 100644
--- a/sys/dev/usb/controller/dwc_otg.c
+++ b/sys/dev/usb/controller/dwc_otg.c
@@ -1,7 +1,7 @@
/* $FreeBSD$ */
/*-
* Copyright (c) 2015 Daisuke Aoyama. All rights reserved.
- * Copyright (c) 2012 Hans Petter Selasky. All rights reserved.
+ * Copyright (c) 2012-2015 Hans Petter Selasky. All rights reserved.
* Copyright (c) 2010-2011 Aleksandr Rybalko. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -597,14 +597,18 @@ dwc_otg_clear_hcint(struct dwc_otg_softc *sc, uint8_t x)
}
static uint8_t
-dwc_otg_host_check_fifo_empty(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+dwc_otg_host_check_tx_fifo_empty(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
uint32_t temp;
temp = DWC_OTG_READ_4(sc, DOTG_GINTSTS);
- if (td->ep_type == UE_INTERRUPT ||
- td->ep_type == UE_ISOCHRONOUS) {
+ if (td->ep_type == UE_ISOCHRONOUS) {
+ /*
+ * NOTE: USB INTERRUPT transactions are executed like
+ * USB CONTROL transactions! See the setup standard
+ * chain function for more information.
+ */
if (!(temp & GINTSTS_PTXFEMP)) {
DPRINTF("Periodic TX FIFO is not empty\n");
if (!(sc->sc_irq_mask & GINTMSK_PTXFEMPMSK)) {
@@ -631,8 +635,10 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc,
struct dwc_otg_td *td, uint8_t is_out)
{
uint8_t x;
+ uint8_t y;
+ uint8_t z;
- if (td->channel < DWC_OTG_MAX_CHANNELS)
+ if (td->channel[0] < DWC_OTG_MAX_CHANNELS)
return (0); /* already allocated */
/* check if device is suspended */
@@ -641,20 +647,42 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc,
/* compute needed TX FIFO size */
if (is_out != 0) {
- if (dwc_otg_host_check_fifo_empty(sc, td) != 0)
+ if (dwc_otg_host_check_tx_fifo_empty(sc, td) != 0)
return (1); /* busy - cannot transfer data */
}
-
- for (x = 0; x != sc->sc_host_ch_max; x++) {
+ z = td->max_packet_count;
+ for (x = y = 0; x != sc->sc_host_ch_max; x++) {
/* check if channel is allocated */
if (sc->sc_chan_state[x].allocated != 0)
continue;
/* check if channel is still enabled */
if (sc->sc_chan_state[x].wait_halted != 0)
continue;
+ /* store channel number */
+ td->channel[y++] = x;
+ /* check if we got all channels */
+ if (y == z)
+ break;
+ }
+ if (y != z) {
+ /* reset channel variable */
+ td->channel[0] = DWC_OTG_MAX_CHANNELS;
+ td->channel[1] = DWC_OTG_MAX_CHANNELS;
+ td->channel[2] = DWC_OTG_MAX_CHANNELS;
+ /* wait a bit */
+ dwc_otg_enable_sof_irq(sc);
+ return (1); /* busy - not enough channels */
+ }
+
+ for (y = 0; y != z; y++) {
+ x = td->channel[y];
+ /* set allocated */
sc->sc_chan_state[x].allocated = 1;
+ /* set wait halted */
+ sc->sc_chan_state[x].wait_halted = 1;
+
/* clear interrupts */
dwc_otg_clear_hcint(sc, x);
@@ -663,29 +691,22 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc,
/* set active channel */
sc->sc_active_rx_ep |= (1 << x);
-
- /* set channel */
- td->channel = x;
-
- return (0); /* allocated */
}
- /* wait a bit */
- dwc_otg_enable_sof_irq(sc);
- return (1); /* busy */
+ return (0); /* allocated */
}
static void
-dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+dwc_otg_host_channel_free_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td, uint8_t index)
{
uint32_t hcchar;
uint8_t x;
- if (td->channel >= DWC_OTG_MAX_CHANNELS)
+ if (td->channel[index] >= DWC_OTG_MAX_CHANNELS)
return; /* already freed */
/* free channel */
- x = td->channel;
- td->channel = DWC_OTG_MAX_CHANNELS;
+ x = td->channel[index];
+ td->channel[index] = DWC_OTG_MAX_CHANNELS;
DPRINTF("CH=%d\n", x);
@@ -704,26 +725,42 @@ dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
/* clear active channel */
sc->sc_active_rx_ep &= ~(1 << x);
+ /* check if already halted */
+ if (sc->sc_chan_state[x].wait_halted == 0)
+ return;
+
/* disable host channel */
hcchar = DWC_OTG_READ_4(sc, DOTG_HCCHAR(x));
if (hcchar & HCCHAR_CHENA) {
DPRINTF("Halting channel %d\n", x);
DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(x),
hcchar | HCCHAR_CHDIS);
- sc->sc_chan_state[x].wait_halted = 1;
/* don't write HCCHAR until the channel is halted */
+ } else {
+ sc->sc_chan_state[x].wait_halted = 0;
}
}
static void
+dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+{
+ uint8_t x;
+ for (x = 0; x != td->max_packet_count; x++)
+ dwc_otg_host_channel_free_sub(sc, td, x);
+}
+
+static void
dwc_otg_host_dump_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
+ uint8_t x;
/* dump any pending messages */
- if (sc->sc_last_rx_status != 0) {
- if (td->channel < DWC_OTG_MAX_CHANNELS &&
- td->channel == GRXSTSRD_CHNUM_GET(sc->sc_last_rx_status)) {
- dwc_otg_common_rx_ack(sc);
- }
+ if (sc->sc_last_rx_status == 0)
+ return;
+ for (x = 0; x != td->max_packet_count; x++) {
+ if (td->channel[x] >= DWC_OTG_MAX_CHANNELS ||
+ td->channel[x] != GRXSTSRD_CHNUM_GET(sc->sc_last_rx_status))
+ continue;
+ dwc_otg_common_rx_ack(sc);
}
}
@@ -737,13 +774,13 @@ dwc_otg_host_setup_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
dwc_otg_host_dump_rx(sc, td);
- if (td->channel < DWC_OTG_MAX_CHANNELS) {
- hcint = sc->sc_chan_state[td->channel].hcint;
+ if (td->channel[0] < DWC_OTG_MAX_CHANNELS) {
+ hcint = sc->sc_chan_state[td->channel[0]].hcint;
DPRINTF("CH=%d ST=%d HCINT=0x%08x HCCHAR=0x%08x HCTSIZ=0x%08x\n",
- td->channel, td->state, hcint,
- DWC_OTG_READ_4(sc, DOTG_HCCHAR(td->channel)),
- DWC_OTG_READ_4(sc, DOTG_HCTSIZ(td->channel)));
+ td->channel[0], td->state, hcint,
+ DWC_OTG_READ_4(sc, DOTG_HCCHAR(td->channel[0])),
+ DWC_OTG_READ_4(sc, DOTG_HCTSIZ(td->channel[0])));
} else {
hcint = 0;
goto check_state;
@@ -753,12 +790,12 @@ dwc_otg_host_setup_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
HCINT_ACK | HCINT_NYET)) {
/* give success bits priority over failure bits */
} else if (hcint & HCINT_STALL) {
- DPRINTF("CH=%d STALL\n", td->channel);
+ DPRINTF("CH=%d STALL\n", td->channel[0]);
td->error_stall = 1;
td->error_any = 1;
goto complete;
} else if (hcint & HCINT_ERRORS) {
- DPRINTF("CH=%d ERROR\n", td->channel);
+ DPRINTF("CH=%d ERROR\n", td->channel[0]);
td->errcnt++;
if (td->hcsplt != 0 || td->errcnt >= 3) {
td->error_any = 1;
@@ -863,23 +900,23 @@ send_pkt:
usbd_copy_out(td->pc, 0, &req, sizeof(req));
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel),
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel[0]),
(sizeof(req) << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_SETUP << HCTSIZ_PID_SHIFT));
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel), td->hcsplt);
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel[0]), td->hcsplt);
hcchar = td->hcchar;
hcchar &= ~(HCCHAR_EPDIR_IN | HCCHAR_EPTYPE_MASK);
hcchar |= UE_CONTROL << HCCHAR_EPTYPE_SHIFT;
/* must enable channel before writing data to FIFO */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel), hcchar);
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel[0]), hcchar);
/* transfer data into FIFO */
bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl,
- DOTG_DFIFO(td->channel), (uint32_t *)&req, sizeof(req) / 4);
+ DOTG_DFIFO(td->channel[0]), (uint32_t *)&req, sizeof(req) / 4);
/* wait until next slot before trying complete split */
td->tt_complete_slot = sc->sc_last_frame_num + 1;
@@ -916,17 +953,17 @@ send_cpkt:
td->hcsplt |= HCSPLT_COMPSPLT;
td->state = DWC_CHAN_ST_WAIT_C_ANE;
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel),
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel[0]),
(HCTSIZ_PID_SETUP << HCTSIZ_PID_SHIFT));
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel), td->hcsplt);
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel[0]), td->hcsplt);
hcchar = td->hcchar;
hcchar &= ~(HCCHAR_EPDIR_IN | HCCHAR_EPTYPE_MASK);
hcchar |= UE_CONTROL << HCCHAR_EPTYPE_SHIFT;
/* must enable channel before writing data to FIFO */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel), hcchar);
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel[0]), hcchar);
busy:
return (1); /* busy */
@@ -1060,50 +1097,51 @@ dwc_otg_host_rate_check_interrupt(struct dwc_otg_softc *sc, struct dwc_otg_td *t
static uint8_t
dwc_otg_host_rate_check(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
+ uint8_t frame_num = (uint8_t)sc->sc_last_frame_num;
+
if (td->ep_type == UE_ISOCHRONOUS) {
/* non TT isochronous traffic */
- if ((td->tmr_val != 0) ||
- (sc->sc_last_frame_num & (td->tmr_res - 1))) {
+ if (frame_num & (td->tmr_res - 1))
goto busy;
- }
- td->tmr_val = 1; /* executed */
+ if ((frame_num ^ td->tmr_val) & td->tmr_res)
+ goto busy;
+ td->tmr_val = td->tmr_res + sc->sc_last_frame_num;
td->toggle = 0;
-
+ return (0);
} else if (td->ep_type == UE_INTERRUPT) {
if (!td->tt_scheduled)
goto busy;
td->tt_scheduled = 0;
+ return (0);
} else if (td->did_nak != 0) {
- uint8_t frame_num = (uint8_t)sc->sc_last_frame_num;
/* check if we should pause sending queries for 125us */
if (td->tmr_res == frame_num) {
/* wait a bit */
dwc_otg_enable_sof_irq(sc);
goto busy;
}
- /* query for data one more time */
- td->tmr_res = frame_num;
- td->did_nak = 0;
} else if (td->set_toggle) {
td->set_toggle = 0;
td->toggle = 1;
}
+ /* query for data one more time */
+ td->tmr_res = frame_num;
+ td->did_nak = 0;
return (0);
busy:
return (1);
}
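The check frame_num & (td->tmr_res - 1) relies on td->tmr_res being a power of two for isochronous endpoints; that holds because the setup code below assigns td->tmr_res = 1 in one branch and 1 << usbd_xfer_get_fps_shift(xfer) in the other, both powers of two.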
static uint8_t
-dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td,
+ uint8_t channel)
{
uint32_t count;
- uint8_t channel;
/* check endpoint status */
if (sc->sc_last_rx_status == 0)
goto busy;
- channel = td->channel;
if (channel >= DWC_OTG_MAX_CHANNELS)
goto busy;
@@ -1128,21 +1166,22 @@ dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
/* get the packet byte count */
count = GRXSTSRD_BCNT_GET(sc->sc_last_rx_status);
- /* check for isochronous transfer or high-speed bandwidth endpoint */
- if (td->ep_type == UE_ISOCHRONOUS || td->max_packet_count > 1) {
- if ((sc->sc_last_rx_status & GRXSTSRD_DPID_MASK) != GRXSTSRD_DPID_DATA0) {
+ /* check for ISOCHRONOUS endpoint */
+ if (td->ep_type == UE_ISOCHRONOUS) {
+ if ((sc->sc_last_rx_status & GRXSTSRD_DPID_MASK) !=
+ GRXSTSRD_DPID_DATA0) {
+ /* more data to be received */
td->tt_xactpos = HCSPLT_XACTPOS_MIDDLE;
} else {
+ /* all data received */
td->tt_xactpos = HCSPLT_XACTPOS_BEGIN;
-
/* verify the packet byte count */
- if (count < td->max_packet_size) {
+ if (count != td->remainder) {
/* we have a short packet */
td->short_pkt = 1;
td->got_short = 1;
}
}
- td->toggle = 0;
} else {
/* verify the packet byte count */
if (count != td->max_packet_size) {
@@ -1194,15 +1233,17 @@ complete:
static uint8_t
dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
- uint32_t hcint;
+ uint32_t hcint = 0;
uint32_t hcchar;
uint8_t delta;
uint8_t channel;
+ uint8_t x;
- channel = td->channel;
-
- if (channel < DWC_OTG_MAX_CHANNELS) {
- hcint = sc->sc_chan_state[channel].hcint;
+ for (x = 0; x != td->max_packet_count; x++) {
+ channel = td->channel[x];
+ if (channel >= DWC_OTG_MAX_CHANNELS)
+ continue;
+ hcint |= sc->sc_chan_state[channel].hcint;
DPRINTF("CH=%d ST=%d HCINT=0x%08x HCCHAR=0x%08x HCTSIZ=0x%08x\n",
channel, td->state, hcint,
@@ -1230,19 +1271,17 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
}
/* check channels for data, if any */
- if (dwc_otg_host_data_rx_sub(sc, td))
+ if (dwc_otg_host_data_rx_sub(sc, td, channel))
goto complete;
/* refresh interrupt status */
- hcint = sc->sc_chan_state[channel].hcint;
+ hcint |= sc->sc_chan_state[channel].hcint;
if (hcint & (HCINT_ERRORS | HCINT_RETRY |
HCINT_ACK | HCINT_NYET)) {
if (!(hcint & HCINT_ERRORS))
td->errcnt = 0;
}
- } else {
- hcint = 0;
}
switch (td->state) {
@@ -1269,6 +1308,8 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
td->toggle ^= 1;
goto receive_pkt;
}
+ } else if (td->ep_type == UE_ISOCHRONOUS) {
+ goto complete;
}
td->did_nak = 1;
td->tt_scheduled = 0;
@@ -1292,12 +1333,12 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
if (td->ep_type == UE_ISOCHRONOUS) {
/* check if we are complete */
- if ((td->remainder == 0) ||
- (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN)) {
+ if (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN) {
goto complete;
+ } else {
+ /* get more packets */
+ goto busy;
}
- /* get another packet */
- goto receive_pkt;
} else {
/* check if we are complete */
if ((td->remainder == 0) || (td->got_short != 0)) {
@@ -1365,8 +1406,7 @@ receive_pkt:
}
/* complete split */
td->hcsplt |= HCSPLT_COMPSPLT;
- } else if (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN &&
- dwc_otg_host_rate_check(sc, td)) {
+ } else if (dwc_otg_host_rate_check(sc, td)) {
td->state = DWC_CHAN_ST_WAIT_C_PKT;
goto busy;
}
@@ -1377,8 +1417,6 @@ receive_pkt:
goto busy;
}
- channel = td->channel;
-
/* set toggle, if any */
if (td->set_toggle) {
td->set_toggle = 0;
@@ -1387,28 +1425,31 @@ receive_pkt:
td->state = DWC_CHAN_ST_WAIT_ANE;
- /* receive one packet */
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
- (td->max_packet_size << HCTSIZ_XFERSIZE_SHIFT) |
- (1 << HCTSIZ_PKTCNT_SHIFT) |
- (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
- (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
+ for (x = 0; x != td->max_packet_count; x++) {
+ channel = td->channel[x];
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
+ /* receive one packet */
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
+ (td->max_packet_size << HCTSIZ_XFERSIZE_SHIFT) |
+ (1 << HCTSIZ_PKTCNT_SHIFT) |
+ (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
+ (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
- hcchar = td->hcchar;
- hcchar |= HCCHAR_EPDIR_IN;
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
- /* receive complete split ASAP */
- if ((sc->sc_last_frame_num & 1) != 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
- hcchar |= HCCHAR_ODDFRM;
- else
- hcchar &= ~HCCHAR_ODDFRM;
+ hcchar = td->hcchar;
+ hcchar |= HCCHAR_EPDIR_IN;
- /* must enable channel before data can be received */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
+ /* receive complete split ASAP */
+ if ((sc->sc_last_frame_num & 1) != 0 &&
+ td->ep_type == UE_ISOCHRONOUS)
+ hcchar |= HCCHAR_ODDFRM;
+ else
+ hcchar &= ~HCCHAR_ODDFRM;
+ /* must enable channel before data can be received */
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
+ }
/* wait until next slot before trying complete split */
td->tt_complete_slot = sc->sc_last_frame_num + 1;
@@ -1437,7 +1478,7 @@ receive_spkt:
goto busy;
}
- channel = td->channel;
+ channel = td->channel[0];
td->hcsplt &= ~HCSPLT_COMPSPLT;
td->state = DWC_CHAN_ST_WAIT_S_ANE;
@@ -1450,7 +1491,7 @@ receive_spkt:
/* send after next SOF event */
if ((sc->sc_last_frame_num & 1) == 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
+ td->ep_type == UE_ISOCHRONOUS)
td->hcchar |= HCCHAR_ODDFRM;
else
td->hcchar &= ~HCCHAR_ODDFRM;
@@ -1605,10 +1646,12 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
uint32_t hcchar;
uint8_t delta;
uint8_t channel;
+ uint8_t x;
dwc_otg_host_dump_rx(sc, td);
- channel = td->channel;
+ /* check that last channel is complete */
+ channel = td->channel[td->npkt];
if (channel < DWC_OTG_MAX_CHANNELS) {
hcint = sc->sc_chan_state[channel].hcint;
@@ -1658,7 +1701,11 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
td->offset += td->tx_bytes;
td->remainder -= td->tx_bytes;
td->toggle ^= 1;
- td->did_nak = 0;
+ /* check if next response will be a NAK */
+ if (hcint & HCINT_NYET)
+ td->did_nak = 1;
+ else
+ td->did_nak = 0;
td->tt_scheduled = 0;
/* check remainder */
@@ -1715,33 +1762,13 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
goto send_cpkt;
case DWC_CHAN_ST_TX_WAIT_ISOC:
-
- /* Check if isochronous OUT traffic is complete */
+ /* Check if ISOCHRONOUS OUT traffic is complete */
if ((hcint & HCINT_HCH_DONE_MASK) == 0)
break;
td->offset += td->tx_bytes;
td->remainder -= td->tx_bytes;
-
- if (td->hcsplt != 0 || td->remainder == 0)
- goto complete;
-
- /* check for next packet */
- if (td->max_packet_count > 1)
- td->tt_xactpos++;
-
- /* free existing channel, if any */
- dwc_otg_host_channel_free(sc, td);
-
- td->state = DWC_CHAN_ST_TX_PKT_ISOC;
-
- /* FALLTHROUGH */
-
- case DWC_CHAN_ST_TX_PKT_ISOC:
- if (dwc_otg_host_channel_alloc(sc, td, 1))
- break;
- channel = td->channel;
- goto send_isoc_pkt;
+ goto complete;
default:
break;
}
@@ -1775,8 +1802,6 @@ send_pkt:
goto busy;
}
- channel = td->channel;
-
/* set toggle, if any */
if (td->set_toggle) {
td->set_toggle = 0;
@@ -1784,8 +1809,7 @@ send_pkt:
}
if (td->ep_type == UE_ISOCHRONOUS) {
-send_isoc_pkt:
- /* Isochronous OUT transfers don't have any ACKs */
+ /* ISOCHRONOUS OUT transfers don't have any ACKs */
td->state = DWC_CHAN_ST_TX_WAIT_ISOC;
td->hcsplt &= ~HCSPLT_COMPSPLT;
if (td->hcsplt != 0) {
@@ -1799,123 +1823,110 @@ send_isoc_pkt:
/* Update transaction position */
td->hcsplt &= ~HCSPLT_XACTPOS_MASK;
td->hcsplt |= (HCSPLT_XACTPOS_ALL << HCSPLT_XACTPOS_SHIFT);
- } else {
- /* send one packet at a time */
- count = td->max_packet_size;
- if (td->remainder < count) {
- /* we have a short packet */
- td->short_pkt = 1;
- count = td->remainder;
- }
}
} else if (td->hcsplt != 0) {
-
td->hcsplt &= ~HCSPLT_COMPSPLT;
-
/* Wait for ACK/NAK/ERR from TT */
td->state = DWC_CHAN_ST_WAIT_S_ANE;
-
- /* send one packet at a time */
- count = td->max_packet_size;
- if (td->remainder < count) {
- /* we have a short packet */
- td->short_pkt = 1;
- count = td->remainder;
- }
} else {
/* Wait for ACK/NAK/STALL from device */
td->state = DWC_CHAN_ST_WAIT_ANE;
+ }
+
+ td->tx_bytes = 0;
+
+ for (x = 0; x != td->max_packet_count; x++) {
+ uint32_t rem_bytes;
+
+ channel = td->channel[x];
/* send one packet at a time */
count = td->max_packet_size;
- if (td->remainder < count) {
+ rem_bytes = td->remainder - td->tx_bytes;
+ if (rem_bytes < count) {
/* we have a short packet */
td->short_pkt = 1;
- count = td->remainder;
- }
- }
-
- /* check for High-Speed multi-packets */
- if ((td->hcsplt == 0) && (td->max_packet_count > 1)) {
- if (td->npkt == 0) {
- if (td->remainder >= (3 * td->max_packet_size))
- td->npkt = 3;
- else if (td->remainder >= (2 * td->max_packet_size))
- td->npkt = 2;
- else
- td->npkt = 1;
-
- if (td->npkt > td->max_packet_count)
- td->npkt = td->max_packet_count;
-
- td->tt_xactpos = 1; /* overload */
+ count = rem_bytes;
}
- if (td->tt_xactpos == td->npkt) {
- if (td->npkt == 1) {
+ if (count == rem_bytes) {
+ /* last packet */
+ switch (x) {
+ case 0:
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
- (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT));
- } else if (td->npkt == 2) {
+ (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
+ (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
+ break;
+ case 1:
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT));
- } else {
+ break;
+ default:
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_DATA2 << HCTSIZ_PID_SHIFT));
+ break;
}
- td->npkt = 0;
- } else {
+ } else if (td->ep_type == UE_ISOCHRONOUS &&
+ td->max_packet_count > 1) {
+ /* ISOCHRONOUS multi packet */
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_MDATA << HCTSIZ_PID_SHIFT));
+ } else {
+ /* TODO: HCTSIZ_DOPNG */
+ /* standard BULK/INTERRUPT/CONTROL packet */
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
+ (count << HCTSIZ_XFERSIZE_SHIFT) |
+ (1 << HCTSIZ_PKTCNT_SHIFT) |
+ (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
+ (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
}
- } else {
- /* TODO: HCTSIZ_DOPNG */
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
- (count << HCTSIZ_XFERSIZE_SHIFT) |
- (1 << HCTSIZ_PKTCNT_SHIFT) |
- (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
- (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
- }
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
+ hcchar = td->hcchar;
+ hcchar &= ~HCCHAR_EPDIR_IN;
- hcchar = td->hcchar;
- hcchar &= ~HCCHAR_EPDIR_IN;
+ /* send after next SOF event */
+ if ((sc->sc_last_frame_num & 1) == 0 &&
+ td->ep_type == UE_ISOCHRONOUS)
+ hcchar |= HCCHAR_ODDFRM;
+ else
+ hcchar &= ~HCCHAR_ODDFRM;
- /* send after next SOF event */
- if ((sc->sc_last_frame_num & 1) == 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
- hcchar |= HCCHAR_ODDFRM;
- else
- hcchar &= ~HCCHAR_ODDFRM;
+ /* must enable before writing data to FIFO */
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
- /* must enable before writing data to FIFO */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
+ if (count != 0) {
+ /* clear topmost word before copy */
+ sc->sc_tx_bounce_buffer[(count - 1) / 4] = 0;
- if (count != 0) {
+ /* copy out data */
+ usbd_copy_out(td->pc, td->offset + td->tx_bytes,
+ sc->sc_tx_bounce_buffer, count);
- /* clear topmost word before copy */
- sc->sc_tx_bounce_buffer[(count - 1) / 4] = 0;
+ /* transfer data into FIFO */
+ bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl,
+ DOTG_DFIFO(channel),
+ sc->sc_tx_bounce_buffer, (count + 3) / 4);
+ }
- /* copy out data */
- usbd_copy_out(td->pc, td->offset,
- sc->sc_tx_bounce_buffer, count);
+ /* store number of bytes transmitted */
+ td->tx_bytes += count;
- /* transfer data into FIFO */
- bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl,
- DOTG_DFIFO(channel),
- sc->sc_tx_bounce_buffer, (count + 3) / 4);
+ /* store last packet index */
+ td->npkt = x;
+
+ /* check for last packet */
+ if (count == rem_bytes)
+ break;
}
-
- /* store number of bytes transmitted */
- td->tx_bytes = count;
goto busy;
send_cpkt:
@@ -1941,7 +1952,7 @@ send_cpkt:
goto busy;
}
- channel = td->channel;
+ channel = td->channel[0];
td->hcsplt |= HCSPLT_COMPSPLT;
td->state = DWC_CHAN_ST_WAIT_C_ANE;
@@ -1956,7 +1967,7 @@ send_cpkt:
/* receive complete split ASAP */
if ((sc->sc_last_frame_num & 1) != 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
+ td->ep_type == UE_ISOCHRONOUS)
hcchar |= HCCHAR_ODDFRM;
else
hcchar &= ~HCCHAR_ODDFRM;
@@ -2383,9 +2394,6 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc)
if ((td->hcchar & HCCHAR_EPDIR_IN) != 0)
continue;
- /* execute more frames */
- td->tmr_val = 0;
-
sc->sc_needsof = 1;
if (td->hcsplt == 0 || td->tt_scheduled != 0)
@@ -2417,9 +2425,6 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc)
if ((td->hcchar & HCCHAR_EPDIR_IN) == 0)
continue;
- /* execute more frames */
- td->tmr_val = 0;
-
sc->sc_needsof = 1;
if (td->hcsplt == 0 || td->tt_scheduled != 0)
@@ -2513,10 +2518,10 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc)
TAILQ_CONCAT(&head, &sc->sc_bus.intr_q.head, wait_entry);
TAILQ_CONCAT(&sc->sc_bus.intr_q.head, &head, wait_entry);
- /* put non-TT BULK transfers last */
+ /* put non-TT non-ISOCHRONOUS transfers last */
TAILQ_FOREACH_SAFE(xfer, &sc->sc_bus.intr_q.head, wait_entry, xfer_next) {
td = xfer->td_transfer_cache;
- if (td == NULL || td->hcsplt != 0 || td->ep_type != UE_BULK)
+ if (td == NULL || td->hcsplt != 0 || td->ep_type == UE_ISOCHRONOUS)
continue;
TAILQ_REMOVE(&sc->sc_bus.intr_q.head, xfer, wait_entry);
TAILQ_INSERT_TAIL(&head, xfer, wait_entry);
@@ -2551,11 +2556,19 @@ static void
dwc_otg_interrupt_poll_locked(struct dwc_otg_softc *sc)
{
struct usb_xfer *xfer;
- uint32_t count = 0;
+ uint32_t count;
uint32_t temp;
uint8_t got_rx_status;
uint8_t x;
+ if (sc->sc_flags.status_device_mode == 0) {
+ /*
+ * Update host transfer schedule, so that new
+ * transfers can be issued:
+ */
+ dwc_otg_update_host_transfer_schedule_locked(sc);
+ }
+ count = 0;
repeat:
if (++count == 16) {
/* give other interrupts a chance */
@@ -2659,12 +2672,6 @@ repeat:
sc->sc_irq_mask &= ~GINTMSK_RXFLVLMSK;
DWC_OTG_WRITE_4(sc, DOTG_GINTMSK, sc->sc_irq_mask);
}
-
- if (sc->sc_flags.status_device_mode == 0 && sc->sc_xfer_complete == 0) {
- /* update host transfer schedule, so that new transfers can be issued */
- if (dwc_otg_update_host_transfer_schedule_locked(sc))
- goto repeat;
- }
}
static void
@@ -2944,12 +2951,6 @@ dwc_otg_interrupt(void *arg)
/* complete FIFOs, if any */
dwc_otg_interrupt_complete_locked(sc);
-
- if (sc->sc_flags.status_device_mode == 0) {
- /* update host transfer schedule, so that new transfers can be issued */
- if (dwc_otg_update_host_transfer_schedule_locked(sc))
- dwc_otg_interrupt_poll_locked(sc);
- }
}
USB_BUS_SPIN_UNLOCK(&sc->sc_bus);
USB_BUS_UNLOCK(&sc->sc_bus);
@@ -2982,7 +2983,9 @@ dwc_otg_setup_standard_chain_sub(struct dwc_otg_std_temp *temp)
td->set_toggle = 0;
td->got_short = 0;
td->did_nak = 0;
- td->channel = DWC_OTG_MAX_CHANNELS;
+ td->channel[0] = DWC_OTG_MAX_CHANNELS;
+ td->channel[1] = DWC_OTG_MAX_CHANNELS;
+ td->channel[2] = DWC_OTG_MAX_CHANNELS;
td->state = 0;
td->errcnt = 0;
td->tt_scheduled = 0;
@@ -3247,8 +3250,10 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer)
td->tmr_val = sc->sc_tmr_val + ival;
td->tmr_res = ival;
} else if (td->ep_type == UE_ISOCHRONOUS) {
- td->tmr_val = 0;
td->tmr_res = 1;
+ td->tmr_val = sc->sc_last_frame_num;
+ if (td->hcchar & HCCHAR_EPDIR_IN)
+ td->tmr_val++;
} else {
td->tmr_val = 0;
td->tmr_res = (uint8_t)sc->sc_last_frame_num;
@@ -3258,10 +3263,8 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer)
hcsplt = 0;
if (td->ep_type == UE_INTERRUPT) {
uint32_t ival;
-#if 0
hcchar |= ((xfer->max_packet_count & 3)
<< HCCHAR_MC_SHIFT);
-#endif
ival = xfer->interval / DWC_OTG_HOST_TIMER_RATE;
if (ival == 0)
ival = 1;
@@ -3272,8 +3275,11 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer)
} else if (td->ep_type == UE_ISOCHRONOUS) {
hcchar |= ((xfer->max_packet_count & 3)
<< HCCHAR_MC_SHIFT);
- td->tmr_val = 0;
td->tmr_res = 1 << usbd_xfer_get_fps_shift(xfer);
+ td->tmr_val = sc->sc_last_frame_num;
+ if (td->hcchar & HCCHAR_EPDIR_IN)
+ td->tmr_val += td->tmr_res;
+
} else {
td->tmr_val = 0;
td->tmr_res = (uint8_t)sc->sc_last_frame_num;
@@ -3330,6 +3336,19 @@ dwc_otg_start_standard_chain(struct usb_xfer *xfer)
dwc_otg_xfer_do_fifo(sc, xfer);
if (dwc_otg_xfer_do_complete_locked(sc, xfer))
goto done;
+ } else {
+ struct dwc_otg_td *td = xfer->td_transfer_cache;
+ if (td->ep_type == UE_ISOCHRONOUS &&
+ (td->hcchar & HCCHAR_EPDIR_IN) == 0) {
+ /*
+ * Need to start ISOCHRONOUS OUT transfer ASAP
+ * because execution is delayed by one 125us
+ * microframe:
+ */
+ dwc_otg_xfer_do_fifo(sc, xfer);
+ if (dwc_otg_xfer_do_complete_locked(sc, xfer))
+ goto done;
+ }
}
/* put transfer on interrupt queue */
@@ -3950,11 +3969,6 @@ dwc_otg_do_poll(struct usb_bus *bus)
USB_BUS_SPIN_LOCK(&sc->sc_bus);
dwc_otg_interrupt_poll_locked(sc);
dwc_otg_interrupt_complete_locked(sc);
- if (sc->sc_flags.status_device_mode == 0) {
- /* update host transfer schedule, so that new transfers can be issued */
- if (dwc_otg_update_host_transfer_schedule_locked(sc))
- dwc_otg_interrupt_poll_locked(sc);
- }
USB_BUS_SPIN_UNLOCK(&sc->sc_bus);
USB_BUS_UNLOCK(&sc->sc_bus);
}
@@ -4728,6 +4742,9 @@ dwc_otg_xfer_setup(struct usb_setup_params *parm)
/* init TD */
td->max_packet_size = xfer->max_packet_size;
td->max_packet_count = xfer->max_packet_count;
+ /* range check */
+ if (td->max_packet_count == 0 || td->max_packet_count > 3)
+ td->max_packet_count = 1;
td->ep_no = ep_no;
td->ep_type = ep_type;
td->obj_next = last_obj;
@@ -4766,12 +4783,13 @@ dwc_otg_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
return;
}
} else {
- if (udev->speed == USB_SPEED_HIGH) {
- if ((UGETW(edesc->wMaxPacketSize) >> 11) & 3) {
- /* high bandwidth endpoint - not tested */
- DPRINTF("High Bandwidth Endpoint - not tested\n");
- return;
- }
+ if (udev->speed == USB_SPEED_HIGH &&
+ (edesc->wMaxPacketSize[1] & 0x18) != 0 &&
+ (edesc->bmAttributes & UE_XFERTYPE) != UE_ISOCHRONOUS) {
+ /* not supported */
+ DPRINTFN(-1, "Non-isochronous high bandwidth "
+ "endpoint not supported\n");
+ return;
}
}
if ((edesc->bmAttributes & UE_XFERTYPE) == UE_ISOCHRONOUS)
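A worked example of the new high-bandwidth test: in the USB 2.0 wMaxPacketSize layout, bits 12:11 encode the number of additional transactions per microframe, and (wMaxPacketSize[1] & 0x18) selects exactly those bits. For wMaxPacketSize = 0x1400, wMaxPacketSize[1] = 0x14 and 0x14 & 0x18 = 0x10, i.e. two additional transactions: three packets of 0x400 bytes per microframe, matching the 1..3 range enforced on td->max_packet_count in dwc_otg_xfer_setup() above.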
diff --git a/sys/dev/usb/controller/dwc_otg.h b/sys/dev/usb/controller/dwc_otg.h
index 39c9529..f5e9887 100644
--- a/sys/dev/usb/controller/dwc_otg.h
+++ b/sys/dev/usb/controller/dwc_otg.h
@@ -69,7 +69,7 @@ struct dwc_otg_td {
uint8_t tmr_val;
uint8_t ep_no;
uint8_t ep_type;
- uint8_t channel;
+ uint8_t channel[3];
uint8_t tt_index; /* TT data */
uint8_t tt_start_slot; /* TT data */
uint8_t tt_complete_slot; /* TT data */
@@ -80,8 +80,7 @@ struct dwc_otg_td {
#define DWC_CHAN_ST_WAIT_S_ANE 2
#define DWC_CHAN_ST_WAIT_C_ANE 3
#define DWC_CHAN_ST_WAIT_C_PKT 4
-#define DWC_CHAN_ST_TX_PKT_ISOC 5
-#define DWC_CHAN_ST_TX_WAIT_ISOC 6
+#define DWC_CHAN_ST_TX_WAIT_ISOC 5
uint8_t error_any:1;
uint8_t error_stall:1;
uint8_t alt_next:1;
diff --git a/sys/dev/usb/controller/usb_controller.c b/sys/dev/usb/controller/usb_controller.c
index 92ea6c5..9f7ce24 100644
--- a/sys/dev/usb/controller/usb_controller.c
+++ b/sys/dev/usb/controller/usb_controller.c
@@ -231,7 +231,8 @@ usb_detach(device_t dev)
/* Get rid of USB callback processes */
usb_proc_free(USB_BUS_GIANT_PROC(bus));
- usb_proc_free(USB_BUS_NON_GIANT_PROC(bus));
+ usb_proc_free(USB_BUS_NON_GIANT_ISOC_PROC(bus));
+ usb_proc_free(USB_BUS_NON_GIANT_BULK_PROC(bus));
/* Get rid of USB explore process */
@@ -395,7 +396,8 @@ usb_bus_explore(struct usb_proc_msg *pm)
*/
usb_proc_rewakeup(USB_BUS_CONTROL_XFER_PROC(bus));
usb_proc_rewakeup(USB_BUS_GIANT_PROC(bus));
- usb_proc_rewakeup(USB_BUS_NON_GIANT_PROC(bus));
+ usb_proc_rewakeup(USB_BUS_NON_GIANT_ISOC_PROC(bus));
+ usb_proc_rewakeup(USB_BUS_NON_GIANT_BULK_PROC(bus));
#endif
USB_BUS_UNLOCK(bus);
@@ -860,9 +862,13 @@ usb_attach_sub(device_t dev, struct usb_bus *bus)
&bus->bus_mtx, device_get_nameunit(dev), USB_PRI_MED)) {
device_printf(dev, "WARNING: Creation of USB Giant "
"callback process failed.\n");
- } else if (usb_proc_create(USB_BUS_NON_GIANT_PROC(bus),
+ } else if (usb_proc_create(USB_BUS_NON_GIANT_ISOC_PROC(bus),
+ &bus->bus_mtx, device_get_nameunit(dev), USB_PRI_HIGHEST)) {
+ device_printf(dev, "WARNING: Creation of USB non-Giant ISOC "
+ "callback process failed.\n");
+ } else if (usb_proc_create(USB_BUS_NON_GIANT_BULK_PROC(bus),
&bus->bus_mtx, device_get_nameunit(dev), USB_PRI_HIGH)) {
- device_printf(dev, "WARNING: Creation of USB non-Giant "
+ device_printf(dev, "WARNING: Creation of USB non-Giant BULK "
"callback process failed.\n");
} else if (usb_proc_create(USB_BUS_EXPLORE_PROC(bus),
&bus->bus_mtx, device_get_nameunit(dev), USB_PRI_MED)) {
diff --git a/sys/dev/usb/usb_bus.h b/sys/dev/usb/usb_bus.h
index bdd1681..3ceeb1e 100644
--- a/sys/dev/usb/usb_bus.h
+++ b/sys/dev/usb/usb_bus.h
@@ -57,19 +57,26 @@ struct usb_bus {
struct root_hold_token *bus_roothold;
#endif
+/* convenience macros */
+#define USB_BUS_TT_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus)
+#define USB_BUS_CS_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus)
+
#if USB_HAVE_PER_BUS_PROCESS
#define USB_BUS_GIANT_PROC(bus) (&(bus)->giant_callback_proc)
-#define USB_BUS_NON_GIANT_PROC(bus) (&(bus)->non_giant_callback_proc)
+#define USB_BUS_NON_GIANT_ISOC_PROC(bus) (&(bus)->non_giant_isoc_callback_proc)
+#define USB_BUS_NON_GIANT_BULK_PROC(bus) (&(bus)->non_giant_bulk_callback_proc)
#define USB_BUS_EXPLORE_PROC(bus) (&(bus)->explore_proc)
#define USB_BUS_CONTROL_XFER_PROC(bus) (&(bus)->control_xfer_proc)
-
/*
- * There are two callback processes. One for Giant locked
- * callbacks. One for non-Giant locked callbacks. This should
- * avoid congestion and reduce response time in most cases.
+ * There are three callback processes. One for Giant locked
+ * callbacks. One for non-Giant locked non-periodic callbacks
+ * and one for non-Giant locked periodic callbacks. This
+ * should avoid congestion and reduce response time in most
+ * cases.
*/
struct usb_process giant_callback_proc;
- struct usb_process non_giant_callback_proc;
+ struct usb_process non_giant_isoc_callback_proc;
+ struct usb_process non_giant_bulk_callback_proc;
/* Explore process */
struct usb_process explore_proc;
diff --git a/sys/dev/usb/usb_device.c b/sys/dev/usb/usb_device.c
index 5ffc07f..13e2c14 100644
--- a/sys/dev/usb/usb_device.c
+++ b/sys/dev/usb/usb_device.c
@@ -2181,7 +2181,7 @@ usb_free_device(struct usb_device *udev, uint8_t flag)
* anywhere:
*/
USB_BUS_LOCK(udev->bus);
- usb_proc_mwait(USB_BUS_NON_GIANT_PROC(udev->bus),
+ usb_proc_mwait(USB_BUS_CS_PROC(udev->bus),
&udev->cs_msg[0], &udev->cs_msg[1]);
USB_BUS_UNLOCK(udev->bus);
diff --git a/sys/dev/usb/usb_hub.c b/sys/dev/usb/usb_hub.c
index 2f1459c..a54fa2e 100644
--- a/sys/dev/usb/usb_hub.c
+++ b/sys/dev/usb/usb_hub.c
@@ -346,7 +346,7 @@ uhub_tt_buffer_reset_async_locked(struct usb_device *child, struct usb_endpoint
}
up->req_reset_tt = req;
/* get reset transfer started */
- usb_proc_msignal(USB_BUS_NON_GIANT_PROC(udev->bus),
+ usb_proc_msignal(USB_BUS_TT_PROC(udev->bus),
&hub->tt_msg[0], &hub->tt_msg[1]);
}
#endif
@@ -1579,7 +1579,7 @@ uhub_detach(device_t dev)
#if USB_HAVE_TT_SUPPORT
/* Make sure our TT messages are not queued anywhere */
USB_BUS_LOCK(bus);
- usb_proc_mwait(USB_BUS_NON_GIANT_PROC(bus),
+ usb_proc_mwait(USB_BUS_TT_PROC(bus),
&hub->tt_msg[0], &hub->tt_msg[1]);
USB_BUS_UNLOCK(bus);
#endif
diff --git a/sys/dev/usb/usb_pf.c b/sys/dev/usb/usb_pf.c
index 468eafb..82ad8e4 100644
--- a/sys/dev/usb/usb_pf.c
+++ b/sys/dev/usb/usb_pf.c
@@ -221,7 +221,13 @@ usbpf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
ubus = ifp->if_softc;
unit = ifp->if_dunit;
+ /*
+ * Lock USB before clearing the "ifp" pointer, to avoid
+ * clearing the pointer in the middle of a TAP operation:
+ */
+ USB_BUS_LOCK(ubus);
ubus->ifp = NULL;
+ USB_BUS_UNLOCK(ubus);
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
diff --git a/sys/dev/usb/usb_process.h b/sys/dev/usb/usb_process.h
index c12cdc4..dd20afd 100644
--- a/sys/dev/usb/usb_process.h
+++ b/sys/dev/usb/usb_process.h
@@ -34,6 +34,7 @@
#endif
/* defines */
+#define USB_PRI_HIGHEST PI_SWI(SWI_TTY)
#define USB_PRI_HIGH PI_SWI(SWI_NET)
#define USB_PRI_MED PI_SWI(SWI_CAMBIO)
diff --git a/sys/dev/usb/usb_transfer.c b/sys/dev/usb/usb_transfer.c
index 5650790..783a96c 100644
--- a/sys/dev/usb/usb_transfer.c
+++ b/sys/dev/usb/usb_transfer.c
@@ -872,6 +872,19 @@ done:
}
}
+static uint8_t
+usbd_transfer_setup_has_bulk(const struct usb_config *setup_start,
+ uint16_t n_setup)
+{
+ while (n_setup--) {
+ uint8_t type = setup_start[n_setup].type;
+ if (type == UE_BULK || type == UE_BULK_INTR ||
+ type == UE_TYPE_ANY)
+ return (1);
+ }
+ return (0);
+}
+
/*------------------------------------------------------------------------*
* usbd_transfer_setup - setup an array of USB transfers
*
@@ -1013,9 +1026,12 @@ usbd_transfer_setup(struct usb_device *udev,
else if (xfer_mtx == &Giant)
info->done_p =
USB_BUS_GIANT_PROC(udev->bus);
+ else if (usbd_transfer_setup_has_bulk(setup_start, n_setup))
+ info->done_p =
+ USB_BUS_NON_GIANT_BULK_PROC(udev->bus);
else
info->done_p =
- USB_BUS_NON_GIANT_PROC(udev->bus);
+ USB_BUS_NON_GIANT_ISOC_PROC(udev->bus);
}
/* reset sizes */
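
[Illustrative sketch, not part of the patch.] The hunk above picks the
completion ("done") process for a whole transfer array: if any entry is
bulk-like (UE_BULK, UE_BULK_INTR, or the UE_TYPE_ANY wildcard), callbacks
run in the new non-Giant BULK process at USB_PRI_HIGH; purely periodic
arrays keep the ISOC process at USB_PRI_HIGHEST. Assuming a hypothetical
driver configuration:

	/* "example_config" is made up; the macros are from this patch. */
	static const struct usb_config example_config[2] = {
		[0] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY,
			.direction = UE_DIR_IN },
		[1] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY,
			.direction = UE_DIR_OUT },
	};
	/* usbd_transfer_setup_has_bulk(example_config, 2) returns 1, so
	 * info->done_p = USB_BUS_NON_GIANT_BULK_PROC(udev->bus). An
	 * array with only isochronous/interrupt entries would instead
	 * be completed in the USB_PRI_HIGHEST ISOC process. */
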
@@ -2280,10 +2296,8 @@ usbd_callback_ss_done_defer(struct usb_xfer *xfer)
* will have a Lock Order Reversal, LOR, if we try to
* proceed !
*/
- if (usb_proc_msignal(info->done_p,
- &info->done_m[0], &info->done_m[1])) {
- /* ignore */
- }
+ (void) usb_proc_msignal(info->done_p,
+ &info->done_m[0], &info->done_m[1]);
} else {
/* clear second recurse flag */
pq->recurse_2 = 0;
@@ -2307,23 +2321,26 @@ usbd_callback_wrapper(struct usb_xfer_queue *pq)
struct usb_xfer_root *info = xfer->xroot;
USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED);
- if (!mtx_owned(info->xfer_mtx) && !SCHEDULER_STOPPED()) {
+ if ((pq->recurse_3 != 0 || mtx_owned(info->xfer_mtx) == 0) &&
+ SCHEDULER_STOPPED() == 0) {
/*
* Cases that end up here:
*
* 5) HW interrupt done callback or other source.
+ * 6) HW completed transfer during callback
*/
- DPRINTFN(3, "case 5\n");
+ DPRINTFN(3, "case 5 and 6\n");
/*
* We have to postpone the callback due to the fact we
* will have a Lock Order Reversal, LOR, if we try to
- * proceed !
+ * proceed!
+ *
+ * Postponing the callback also ensures that other USB
+ * transfer queues get a chance.
*/
- if (usb_proc_msignal(info->done_p,
- &info->done_m[0], &info->done_m[1])) {
- /* ignore */
- }
+ (void) usb_proc_msignal(info->done_p,
+ &info->done_m[0], &info->done_m[1]);
return;
}
/*
@@ -2381,8 +2398,11 @@ usbd_callback_wrapper(struct usb_xfer_queue *pq)
}
#if USB_HAVE_PF
- if (xfer->usb_state != USB_ST_SETUP)
+ if (xfer->usb_state != USB_ST_SETUP) {
+ USB_BUS_LOCK(info->bus);
usbpf_xfertap(xfer, USBPF_XFERTAP_DONE);
+ USB_BUS_UNLOCK(info->bus);
+ }
#endif
/* call processing routine */
(xfer->callback) (xfer, xfer->error);
@@ -2694,7 +2714,7 @@ usbd_pipe_start(struct usb_xfer_queue *pq)
} else if (udev->ctrl_xfer[1]) {
info = udev->ctrl_xfer[1]->xroot;
usb_proc_msignal(
- USB_BUS_NON_GIANT_PROC(info->bus),
+ USB_BUS_CS_PROC(info->bus),
&udev->cs_msg[0], &udev->cs_msg[1]);
} else {
/* should not happen */
@@ -3019,9 +3039,11 @@ usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer)
if (!pq->recurse_1) {
- do {
+ /* clear third recurse flag */
+ pq->recurse_3 = 0;
- /* set both recurse flags */
+ do {
+ /* set two first recurse flags */
pq->recurse_1 = 1;
pq->recurse_2 = 1;
@@ -3040,6 +3062,12 @@ usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer)
(pq->command) (pq);
DPRINTFN(6, "cb %p (leave)\n", pq->curr);
+ /*
+ * Set third recurse flag to indicate
+ * recursion happened:
+ */
+ pq->recurse_3 = 1;
+
} while (!pq->recurse_2);
/* clear first recurse flag */
@@ -3315,7 +3343,8 @@ usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max)
USB_BUS_CONTROL_XFER_PROC(udev->bus)->up_msleep = 0;
USB_BUS_EXPLORE_PROC(udev->bus)->up_msleep = 0;
USB_BUS_GIANT_PROC(udev->bus)->up_msleep = 0;
- USB_BUS_NON_GIANT_PROC(udev->bus)->up_msleep = 0;
+ USB_BUS_NON_GIANT_ISOC_PROC(udev->bus)->up_msleep = 0;
+ USB_BUS_NON_GIANT_BULK_PROC(udev->bus)->up_msleep = 0;
/* poll USB hardware */
(udev->bus->methods->xfer_poll) (udev->bus);
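
A note on the new recurse_3 bit: usb_command_wrapper() clears it on the
outermost entry and sets it after every queued command runs, so
usbd_callback_wrapper() can detect that the hardware completed another
transfer while a callback was still executing and postpone that
completion to the done process instead of nesting it. Paraphrased
control flow (not verbatim from the patch):

	if (!pq->recurse_1) {			/* outermost entry */
		pq->recurse_3 = 0;		/* no recursion seen yet */
		do {
			pq->recurse_1 = 1;
			pq->recurse_2 = 1;
			(pq->command)(pq);	/* may complete more xfers */
			pq->recurse_3 = 1;	/* a command ran */
		} while (!pq->recurse_2);	/* cleared => go again */
		pq->recurse_1 = 0;
	}
	/* usbd_callback_wrapper() then defers via usb_proc_msignal()
	 * when (pq->recurse_3 != 0 || !mtx_owned(info->xfer_mtx)). */
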
diff --git a/sys/dev/usb/usbdi.h b/sys/dev/usb/usbdi.h
index 09b0ca7..ecd5a81 100644
--- a/sys/dev/usb/usbdi.h
+++ b/sys/dev/usb/usbdi.h
@@ -128,6 +128,8 @@ struct usb_xfer_queue {
void (*command) (struct usb_xfer_queue *pq);
uint8_t recurse_1:1;
uint8_t recurse_2:1;
+ uint8_t recurse_3:1;
+ uint8_t reserved:5;
};
/*
diff --git a/sys/dev/vt/hw/efifb/efifb.c b/sys/dev/vt/hw/efifb/efifb.c
index ec029c8..4184f77 100644
--- a/sys/dev/vt/hw/efifb/efifb.c
+++ b/sys/dev/vt/hw/efifb/efifb.c
@@ -96,7 +96,6 @@ vt_efifb_probe(struct vt_device *vd)
static int
vt_efifb_init(struct vt_device *vd)
{
- int depth, d;
struct fb_info *info;
struct efi_fb *efifb;
caddr_t kmdp;
@@ -116,16 +115,13 @@ vt_efifb_init(struct vt_device *vd)
info->fb_height = efifb->fb_height;
info->fb_width = efifb->fb_width;
- depth = fls(efifb->fb_mask_red);
- d = fls(efifb->fb_mask_green);
- depth = d > depth ? d : depth;
- d = fls(efifb->fb_mask_blue);
- depth = d > depth ? d : depth;
- d = fls(efifb->fb_mask_reserved);
- depth = d > depth ? d : depth;
- info->fb_depth = depth;
+ info->fb_depth = fls(efifb->fb_mask_red | efifb->fb_mask_green |
+ efifb->fb_mask_blue | efifb->fb_mask_reserved);
+ /* Round to a multiple of the bits in a byte. */
+ info->fb_bpp = (info->fb_depth + NBBY - 1) & ~(NBBY - 1);
- info->fb_stride = efifb->fb_stride * (depth / 8);
+ /* Stride in bytes, not pixels */
+ info->fb_stride = efifb->fb_stride * (info->fb_bpp / NBBY);
vt_generate_cons_palette(info->fb_cmap, COLOR_FORMAT_RGB,
efifb->fb_mask_red, ffs(efifb->fb_mask_red) - 1,
@@ -137,16 +133,6 @@ vt_efifb_init(struct vt_device *vd)
info->fb_vbase = (intptr_t)pmap_mapdev_attr(info->fb_pbase,
info->fb_size, VM_MEMATTR_WRITE_COMBINING);
- /* Get pixel storage size. */
- info->fb_bpp = info->fb_stride / info->fb_width * 8;
-
- /*
- * Early FB driver work with static window buffer, so reduce to minimal
- * size, buffer or screen.
- */
- info->fb_width = MIN(info->fb_width, VT_FB_DEFAULT_WIDTH);
- info->fb_height = MIN(info->fb_height, VT_FB_DEFAULT_HEIGHT);
-
vt_fb_init(vd);
return (CN_INTERNAL);
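
Worked example (mask values assumed, not from the patch): for a typical
8:8:8:8 EFI mode the masks are 0x00ff0000 (red), 0x0000ff00 (green),
0x000000ff (blue) and 0xff000000 (reserved); OR-ing them gives
0xffffffff, fls() returns 32, so fb_depth = 32, fb_bpp rounds to 32 and
the stride becomes fb_stride * 4 bytes. A 5:6:5 mode ORs to 0x0000ffff,
giving depth 16, bpp 16 and 2 bytes per pixel:

	uint32_t all = mask_red | mask_green | mask_blue | mask_reserved;
	info->fb_depth = fls(all);			/* e.g. 32 */
	info->fb_bpp = (info->fb_depth + NBBY - 1) & ~(NBBY - 1);
	info->fb_stride = efifb->fb_stride * (info->fb_bpp / NBBY);
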
diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c
index 0b7ebe4..4661f35 100644
--- a/sys/dev/vt/hw/vga/vt_vga.c
+++ b/sys/dev/vt/hw/vga/vt_vga.c
@@ -883,9 +883,9 @@ vga_bitblt_text_txtmode(struct vt_device *vd, const struct vt_window *vw,
/* Convert colors to VGA attributes. */
attr = bg << 4 | fg;
- MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 0,
+ MEM_WRITE1(sc, (row * 80 + col) * 2 + 0,
ch);
- MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 1,
+ MEM_WRITE1(sc, (row * 80 + col) * 2 + 1,
attr);
}
}
@@ -1226,8 +1226,6 @@ vga_init(struct vt_device *vd)
# error "Architecture not yet supported!"
#endif
- bus_space_map(sc->vga_fb_tag, VGA_MEM_BASE, VGA_MEM_SIZE, 0,
- &sc->vga_fb_handle);
bus_space_map(sc->vga_reg_tag, VGA_REG_BASE, VGA_REG_SIZE, 0,
&sc->vga_reg_handle);
@@ -1236,9 +1234,13 @@ vga_init(struct vt_device *vd)
vd->vd_flags |= VDF_TEXTMODE;
vd->vd_width = 80;
vd->vd_height = 25;
+ bus_space_map(sc->vga_fb_tag, VGA_TXT_BASE, VGA_TXT_SIZE, 0,
+ &sc->vga_fb_handle);
} else {
vd->vd_width = VT_VGA_WIDTH;
vd->vd_height = VT_VGA_HEIGHT;
+ bus_space_map(sc->vga_fb_tag, VGA_MEM_BASE, VGA_MEM_SIZE, 0,
+ &sc->vga_fb_handle);
}
if (vga_initialize(vd, textmode) != 0)
return (CN_DEAD);
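
With the text framebuffer now mapped at VGA_TXT_BASE (0xB8000), a
character cell is addressed relative to the start of that mapping
rather than at offset 0x18000 into the 0xA0000 graphics window. For
80x25 text mode the cell for (row, col) lives at:

	offset = (row * 80 + col) * 2;
	MEM_WRITE1(sc, offset + 0, ch);			/* code point */
	MEM_WRITE1(sc, offset + 1, bg << 4 | fg);	/* attribute */
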
diff --git a/sys/dev/vt/hw/vga/vt_vga_reg.h b/sys/dev/vt/hw/vga/vt_vga_reg.h
index 5bfb8ce..cf33a37 100644
--- a/sys/dev/vt/hw/vga/vt_vga_reg.h
+++ b/sys/dev/vt/hw/vga/vt_vga_reg.h
@@ -49,6 +49,8 @@
#define VGA_MEM_BASE 0xA0000
#define VGA_MEM_SIZE 0x10000
+#define VGA_TXT_BASE 0xB8000
+#define VGA_TXT_SIZE 0x08000
#define VGA_REG_BASE 0x3c0
#define VGA_REG_SIZE 0x10+0x0c
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index 99da892..702df42 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -264,8 +264,9 @@ vt_update_static(void *dummy)
if (!vty_enabled(VTY_VT))
return;
if (main_vd->vd_driver != NULL)
- printf("VT: running with driver \"%s\".\n",
- main_vd->vd_driver->vd_name);
+ printf("VT(%s): %s %ux%u\n", main_vd->vd_driver->vd_name,
+ (main_vd->vd_flags & VDF_TEXTMODE) ? "text" : "resolution",
+ main_vd->vd_width, main_vd->vd_height);
else
printf("VT: init without driver.\n");
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index 2f972b8..302c017 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -280,8 +280,6 @@ struct netfront_info {
struct callout xn_stat_ch;
u_long rx_pfn_array[NET_RX_RING_SIZE];
- multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
- mmu_update_t rx_mmu[NET_RX_RING_SIZE];
struct ifmedia sc_media;
bool xn_resume;
@@ -882,13 +880,6 @@ refill:
gnttab_grant_foreign_transfer_ref(ref,
otherend_id, pfn);
sc->rx_pfn_array[nr_flips] = pfn;
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Remove this page before passing
- * back to Xen.
- */
- MULTI_update_va_mapping(&sc->rx_mcl[i],
- vaddr, 0, 0);
- }
nr_flips++;
} else {
gnttab_grant_foreign_access_ref(ref,
@@ -918,25 +909,6 @@ refill:
reservation.extent_order = 0;
reservation.address_bits = 0;
reservation.domid = DOMID_SELF;
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* After all PTEs have been zapped, flush the TLB. */
- sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
- UVMF_TLB_FLUSH|UVMF_ALL;
-
- /* Give away a batch of pages. */
- sc->rx_mcl[i].op = __HYPERVISOR_memory_op;
- sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
- sc->rx_mcl[i].args[1] = (u_long)&reservation;
- /* Zap PTEs and give away pages in one big multicall. */
- (void)HYPERVISOR_multicall(sc->rx_mcl, i+1);
-
- if (__predict_false(sc->rx_mcl[i].result != i ||
- HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation) != i))
- panic("%s: unable to reduce memory "
- "reservation\n", __func__);
- }
} else {
wmb();
}
@@ -961,7 +933,6 @@ xn_rxeof(struct netfront_info *np)
struct netif_rx_response *rx = &rinfo.rx;
struct netif_extra_info *extras = rinfo.extras;
RING_IDX i, rp;
- multicall_entry_t *mcl;
struct mbuf *m;
struct mbufq rxq, errq;
int err, pages_flipped = 0, work_to_do;
@@ -1022,19 +993,6 @@ xn_rxeof(struct netfront_info *np)
#ifdef notyet
balloon_update_driver_allowance(-pages_flipped);
#endif
- /* Do all the remapping work, and M->P updates, in one big
- * hypercall.
- */
- if (!!xen_feature(XENFEAT_auto_translated_physmap)) {
- mcl = np->rx_mcl + pages_flipped;
- mcl->op = __HYPERVISOR_mmu_update;
- mcl->args[0] = (u_long)np->rx_mmu;
- mcl->args[1] = pages_flipped;
- mcl->args[2] = 0;
- mcl->args[3] = DOMID_SELF;
- (void)HYPERVISOR_multicall(np->rx_mcl,
- pages_flipped + 1);
- }
}
mbufq_drain(&errq);
@@ -1273,8 +1231,6 @@ xennet_get_responses(struct netfront_info *np,
int *pages_flipped_p)
{
int pages_flipped = *pages_flipped_p;
- struct mmu_update *mmu;
- struct multicall_entry *mcl;
struct netif_rx_response *rx = &rinfo->rx;
struct netif_extra_info *extras = rinfo->extras;
struct mbuf *m, *m0, *m_prev;
@@ -1346,22 +1302,6 @@ xennet_get_responses(struct netfront_info *np,
goto next;
}
- if (!xen_feature( XENFEAT_auto_translated_physmap)) {
- /* Remap the page. */
- void *vaddr = mtod(m, void *);
- uint32_t pfn;
-
- mcl = np->rx_mcl + pages_flipped;
- mmu = np->rx_mmu + pages_flipped;
-
- MULTI_update_va_mapping(mcl, (u_long)vaddr,
- (((vm_paddr_t)mfn) << PAGE_SHIFT) | PG_RW |
- PG_V | PG_M | PG_A, 0);
- pfn = (uintptr_t)m->m_ext.ext_arg1;
- mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) |
- MMU_MACHPHYS_UPDATE;
- mmu->val = pfn;
- }
pages_flipped++;
} else {
ret = gnttab_end_foreign_access_ref(ref);
diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c
index d1ade4a..c0e05b9 100644
--- a/sys/fs/nfsserver/nfs_nfsdstate.c
+++ b/sys/fs/nfsserver/nfs_nfsdstate.c
@@ -401,9 +401,12 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp,
}
/* For NFSv4.1, mark that we found a confirmed clientid. */
- if ((nd->nd_flag & ND_NFSV41) != 0)
+ if ((nd->nd_flag & ND_NFSV41) != 0) {
+ clientidp->lval[0] = clp->lc_clientid.lval[0];
+ clientidp->lval[1] = clp->lc_clientid.lval[1];
+ confirmp->lval[0] = 0; /* Ignored by client */
confirmp->lval[1] = 1;
- else {
+ } else {
/*
* id and verifier match, so update the net address info
* and get rid of any existing callback authentication
diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh
index 1cbc32b..521c7a2 100644
--- a/sys/kern/genassym.sh
+++ b/sys/kern/genassym.sh
@@ -10,7 +10,7 @@ usage()
work()
{
- ${NM:='nm'} "$1" | ${AWK:='awk'} '
+ ${NM:='nm'} ${NMFLAGS} "$1" | ${AWK:='awk'} '
/ C .*sign$/ {
sign = substr($1, length($1) - 3, 4)
sub("^0*", "", sign)
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 3310d1d..d84c26f 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -981,6 +981,10 @@ proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
switch (idtype) {
case P_ALL:
+ if (p->p_procdesc != NULL) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
break;
case P_PID:
if (p->p_pid != (pid_t)id) {
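
This makes wildcard waits skip children that have a process descriptor;
such children are managed through the descriptor instead. A minimal
sketch, assuming the usual procdesc(4) interfaces and omitting error
handling:

	#include <sys/procdesc.h>
	#include <poll.h>
	#include <unistd.h>

	int pd;
	pid_t pid = pdfork(&pd, 0);
	if (pid == 0)
		_exit(0);			/* child */
	/* A wait(-1)/P_ALL in the parent no longer reaps this child;
	 * poll the descriptor for exit and close it to release it. */
	struct pollfd pfd = { .fd = pd, .events = POLLHUP };
	poll(&pfd, 1, INFTIM);
	close(pd);
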
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
index d09f0b6..432e38a 100644
--- a/sys/kern/kern_tc.c
+++ b/sys/kern/kern_tc.c
@@ -133,6 +133,8 @@ SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
sysctl_kern_timecounter_adjprecision, "I",
"Allowed time interval deviation in percents");
+static int tc_chosen; /* Non-zero if a specific tc was chosen via sysctl. */
+
static void tc_windup(void);
static void cpu_tick_calibrate(int);
@@ -1197,10 +1199,13 @@ tc_init(struct timecounter *tc)
"quality", CTLFLAG_RD, &(tc->tc_quality), 0,
"goodness of time counter");
/*
- * Never automatically use a timecounter with negative quality.
+ * Do not automatically switch if the current tc was specifically
+ * chosen. Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
- * worse since this timecounter may not be monotonous.
+ * worse since this timecounter may not be monotonic.
*/
+ if (tc_chosen)
+ return;
if (tc->tc_quality < 0)
return;
if (tc->tc_quality < timecounter->tc_quality)
@@ -1433,9 +1438,12 @@ sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
strlcpy(newname, tc->tc_name, sizeof(newname));
error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
- if (error != 0 || req->newptr == NULL ||
- strcmp(newname, tc->tc_name) == 0)
+ if (error != 0 || req->newptr == NULL)
return (error);
+ /* Record that the tc in use now was specifically chosen. */
+ tc_chosen = 1;
+ if (strcmp(newname, tc->tc_name) == 0)
+ return (0);
for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
if (strcmp(newname, newtc->tc_name) != 0)
continue;
@@ -1464,7 +1472,7 @@ SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
"Timecounter hardware selected");
-/* Report or change the active timecounter hardware. */
+/* Report the available timecounter hardware. */
static int
sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
{
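
Example of the new behaviour (counter names vary by machine): once an
operator writes the sysctl, tc_chosen is latched and a later tc_init()
of a higher-quality timecounter no longer displaces the choice;
re-writing the name already in use now also latches it and returns
success instead of falling through:

	# sysctl kern.timecounter.hardware=HPET
	kern.timecounter.hardware: TSC-low -> HPET
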
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index bd52356..21009a9 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -296,6 +296,9 @@ SUBDIR= \
${_qlxgbe} \
ral \
${_ralfw} \
+ ${_random_fortuna} \
+ ${_random_yarrow} \
+ ${_random_other} \
rc4 \
${_rdma} \
${_rdrand_rng} \
@@ -398,6 +401,9 @@ _autofs= autofs
.if exists(${.CURDIR}/../opencrypto)
_crypto= crypto
_cryptodev= cryptodev
+_random_fortuna=random_fortuna
+_random_yarrow= random_yarrow
+_random_other= random_other
.endif
.endif
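
With the three algorithm modules in place, a kernel built with
"options RANDOM_LOADABLE" (matching the -DRANDOM_LOADABLE in their
Makefiles below) selects its random(4) algorithm at run time, e.g.:

	# kldload random_fortuna
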
diff --git a/sys/modules/am335x_dmtpps/Makefile b/sys/modules/am335x_dmtpps/Makefile
new file mode 100644
index 0000000..4d9deac
--- /dev/null
+++ b/sys/modules/am335x_dmtpps/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../arm/ti/am335x
+
+KMOD= am335x_dmtpps
+SRCS= am335x_dmtpps.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile
index e97ec38..c74f000 100644
--- a/sys/modules/ctl/Makefile
+++ b/sys/modules/ctl/Makefile
@@ -11,7 +11,7 @@ SRCS+= ctl_backend_ramdisk.c
SRCS+= ctl_cmd_table.c
SRCS+= ctl_frontend.c
SRCS+= ctl_frontend_cam_sim.c
-SRCS+= ctl_frontend_internal.c
+SRCS+= ctl_frontend_ioctl.c
SRCS+= ctl_frontend_iscsi.c
SRCS+= ctl_scsi_all.c
SRCS+= ctl_tpc.c
diff --git a/sys/modules/gpio/gpiobus/Makefile b/sys/modules/gpio/gpiobus/Makefile
index e868cba..2a3f86d 100644
--- a/sys/modules/gpio/gpiobus/Makefile
+++ b/sys/modules/gpio/gpiobus/Makefile
@@ -32,8 +32,9 @@
.PATH: ${.CURDIR}/../../../dev/gpio/
KMOD= gpiobus
-SRCS= gpiobus.c
-SRCS+= device_if.h bus_if.h gpio_if.h gpiobus_if.h opt_platform.h
+SRCS= gpiobus.c gpioc.c
+SRCS+= gpio_if.c gpio_if.h gpiobus_if.c gpiobus_if.h
+SRCS+= device_if.h bus_if.h opt_platform.h
CFLAGS+= -I. -I${.CURDIR}/../../../dev/gpio/
diff --git a/sys/modules/random_fortuna/Makefile b/sys/modules/random_fortuna/Makefile
new file mode 100644
index 0000000..d28ae4d
--- /dev/null
+++ b/sys/modules/random_fortuna/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/random
+
+KMOD = random_fortuna
+SRCS = randomdev.c hash.c fortuna.c
+SRCS += opt_param.h bus_if.h device_if.h
+SRCS += opt_ddb.h
+CFLAGS += -DRANDOM_LOADABLE
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/random_other/Makefile b/sys/modules/random_other/Makefile
new file mode 100644
index 0000000..6ce586b
--- /dev/null
+++ b/sys/modules/random_other/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/random
+
+KMOD = random_other
+SRCS = randomdev.c hash.c other_algorithm.c
+SRCS += opt_param.h bus_if.h device_if.h
+SRCS += opt_ddb.h
+CFLAGS += -DRANDOM_LOADABLE
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/random_yarrow/Makefile b/sys/modules/random_yarrow/Makefile
new file mode 100644
index 0000000..1750af4
--- /dev/null
+++ b/sys/modules/random_yarrow/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/random
+
+KMOD = random_yarrow
+SRCS = randomdev.c hash.c yarrow.c
+SRCS += opt_param.h bus_if.h device_if.h
+SRCS += opt_ddb.h
+CFLAGS += -DRANDOM_LOADABLE
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c
index 64aafb1..1af4ffc 100644
--- a/sys/net/ieee8023ad_lacp.c
+++ b/sys/net/ieee8023ad_lacp.c
@@ -522,7 +522,7 @@ lacp_port_create(struct lagg_port *lgp)
int error;
boolean_t active = TRUE; /* XXX should be configurable */
- boolean_t fast = FALSE; /* XXX should be configurable */
+ boolean_t fast = FALSE; /* Configurable via ioctl */
link_init_sdl(ifp, (struct sockaddr *)&sdl, IFT_ETHER);
sdl.sdl_alen = ETHER_ADDR_LEN;
diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h
index e814f83..8f0f51a 100644
--- a/sys/net/ieee8023ad_lacp.h
+++ b/sys/net/ieee8023ad_lacp.h
@@ -251,6 +251,7 @@ struct lacp_softc {
u_int32_t lsc_tx_test;
} lsc_debug;
u_int32_t lsc_strict_mode;
+ boolean_t lsc_fast_timeout; /* if set, fast timeout */
};
#define LACP_TYPE_ACTORINFO 1
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index dcd005a..b623493 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -1257,6 +1257,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
if (lsc->lsc_strict_mode != 0)
ro->ro_opts |= LAGG_OPT_LACP_STRICT;
+ if (lsc->lsc_fast_timeout != 0)
+ ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT;
ro->ro_active = sc->sc_active;
} else {
@@ -1292,6 +1294,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
case -LAGG_OPT_LACP_RXTEST:
case LAGG_OPT_LACP_STRICT:
case -LAGG_OPT_LACP_STRICT:
+ case LAGG_OPT_LACP_TIMEOUT:
+ case -LAGG_OPT_LACP_TIMEOUT:
valid = lacp = 1;
break;
default:
@@ -1320,6 +1324,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
sc->sc_opts &= ~ro->ro_opts;
} else {
struct lacp_softc *lsc;
+ struct lacp_port *lp;
lsc = (struct lacp_softc *)sc->sc_psc;
@@ -1342,6 +1347,20 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
case -LAGG_OPT_LACP_STRICT:
lsc->lsc_strict_mode = 0;
break;
+ case LAGG_OPT_LACP_TIMEOUT:
+ LACP_LOCK(lsc);
+ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
+ lp->lp_state |= LACP_STATE_TIMEOUT;
+ LACP_UNLOCK(lsc);
+ lsc->lsc_fast_timeout = 1;
+ break;
+ case -LAGG_OPT_LACP_TIMEOUT:
+ LACP_LOCK(lsc);
+ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
+ lp->lp_state &= ~LACP_STATE_TIMEOUT;
+ LACP_UNLOCK(lsc);
+ lsc->lsc_fast_timeout = 0;
+ break;
}
}
LAGG_WUNLOCK(sc);
diff --git a/sys/net/if_lagg.h b/sys/net/if_lagg.h
index a45fa16..bb5ea23 100644
--- a/sys/net/if_lagg.h
+++ b/sys/net/if_lagg.h
@@ -150,6 +150,7 @@ struct lagg_reqopts {
#define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */
#define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */
#define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */
+#define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */
u_int ro_count; /* number of ports */
u_int ro_active; /* active port count */
u_int ro_flapping; /* number of flapping */
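
LAGG_OPT_LACP_TIMEOUT drives the new lsc_fast_timeout handling in
if_lagg.c above: setting it turns on LACP_STATE_TIMEOUT on every port
(fast, 1-second LACPDU interval), and passing the negated constant
clears it again (slow, 30-second interval). A hedged sketch of flipping
it from userland with the existing lagg options ioctl; the socket "s"
is assumed and the field names are taken from if_lagg.h:

	struct lagg_reqopts ro;

	memset(&ro, 0, sizeof(ro));
	strlcpy(ro.ro_ifname, "lagg0", sizeof(ro.ro_ifname));
	ro.ro_opts = LAGG_OPT_LACP_TIMEOUT;	/* or -LAGG_OPT_LACP_TIMEOUT */
	if (ioctl(s, SIOCSLAGGOPTS, &ro) == -1)
		err(1, "SIOCSLAGGOPTS");
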
diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c
index f90925a..263c197 100644
--- a/sys/netinet/if_ether.c
+++ b/sys/netinet/if_ether.c
@@ -130,6 +130,13 @@ static void arptimer(void *);
static void in_arpinput(struct mbuf *);
#endif
+static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
+ struct ifnet *ifp, int bridged, struct llentry *la);
+static void arp_update_lle(struct arphdr *ah, struct ifnet *ifp,
+ struct llentry *la);
+static void arp_mark_lle_reachable(struct llentry *la);
+
+
static const struct netisr_handler arp_nh = {
.nh_name = "arp",
.nh_handler = arpintr,
@@ -302,57 +309,37 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip,
}
/*
- * Resolve an IP address into an ethernet address.
- * On input:
- * ifp is the interface we use
- * is_gw != if @dst represents gateway to some destination
- * m is the mbuf. May be NULL if we don't have a packet.
- * dst is the next hop,
- * desten is where we want the address.
- * flags returns lle entry flags.
+ * Resolve an IP address into an ethernet address - heavy version.
+ * Used internally by arpresolve().
+ * We have already checked that we can't use the existing lle without
+ * modification, so we have to acquire the LLE_EXCLUSIVE lle lock.
*
* On success, desten and flags are filled in and the function returns 0;
* If the packet must be held pending resolution, we return EWOULDBLOCK
* On other errors, we return the corresponding error code.
* Note that m_freem() handles NULL.
*/
-int
-arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
+static int
+arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m,
const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
{
- struct llentry *la = 0;
- u_int flags = 0;
+ struct llentry *la = NULL;
struct mbuf *curr = NULL;
struct mbuf *next = NULL;
- int create, error, renew;
+ int error, renew;
if (pflags != NULL)
*pflags = 0;
- create = 0;
- if (m != NULL) {
- if (m->m_flags & M_BCAST) {
- /* broadcast */
- (void)memcpy(desten,
- ifp->if_broadcastaddr, ifp->if_addrlen);
- return (0);
- }
- if (m->m_flags & M_MCAST) {
- /* multicast */
- ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
- return (0);
- }
+ if (create == 0) {
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
+ IF_AFDATA_RUNLOCK(ifp);
}
-retry:
- IF_AFDATA_RLOCK(ifp);
- la = lla_lookup(LLTABLE(ifp), flags, dst);
- IF_AFDATA_RUNLOCK(ifp);
- if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
- && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
+ if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
create = 1;
- flags |= LLE_EXCLUSIVE;
IF_AFDATA_WLOCK(ifp);
- la = lla_create(LLTABLE(ifp), flags, dst);
+ la = lla_create(LLTABLE(ifp), 0, dst);
IF_AFDATA_WUNLOCK(ifp);
}
if (la == NULL) {
@@ -382,10 +369,7 @@ retry:
if (pflags != NULL)
*pflags = la->la_flags;
- if (flags & LLE_EXCLUSIVE)
- LLE_WUNLOCK(la);
- else
- LLE_RUNLOCK(la);
+ LLE_WUNLOCK(la);
if (renew == 1)
arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
@@ -393,20 +377,7 @@ retry:
return (0);
}
- if (la->la_flags & LLE_STATIC) { /* should not happen! */
- log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
- inet_ntoa(SIN(dst)->sin_addr));
- m_freem(m);
- error = EINVAL;
- goto done;
- }
-
renew = (la->la_asked == 0 || la->la_expire != time_uptime);
- if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
- flags |= LLE_EXCLUSIVE;
- LLE_RUNLOCK(la);
- goto retry;
- }
/*
* There is an arptab entry, but no ethernet address
* response yet. Add the mbuf to the list, dropping
@@ -431,11 +402,6 @@ retry:
} else
la->la_hold = m;
la->la_numheld++;
- if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
- flags &= ~LLE_EXCLUSIVE;
- LLE_DOWNGRADE(la);
- }
-
}
/*
* Return EWOULDBLOCK if we have tried less than arp_maxtries. It
@@ -462,15 +428,88 @@ retry:
arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
return (error);
}
-done:
- if (flags & LLE_EXCLUSIVE)
- LLE_WUNLOCK(la);
- else
- LLE_RUNLOCK(la);
+
+ LLE_WUNLOCK(la);
return (error);
}
/*
+ * Resolve an IP address into an ethernet address.
+ * On input:
+ * ifp is the interface we use
+ * is_gw != 0 if @dst represents a gateway to some destination
+ * m is the mbuf. May be NULL if we don't have a packet.
+ * dst is the next hop,
+ * desten is the storage for the LL address.
+ * flags returns lle entry flags.
+ *
+ * On success, desten and flags are filled in and the function returns 0;
+ * If the packet must be held pending resolution, we return EWOULDBLOCK
+ * On other errors, we return the corresponding error code.
+ * Note that m_freem() handles NULL.
+ */
+int
+arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
+ const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
+{
+ struct llentry *la = 0;
+ int renew;
+
+ if (pflags != NULL)
+ *pflags = 0;
+
+ if (m != NULL) {
+ if (m->m_flags & M_BCAST) {
+ /* broadcast */
+ (void)memcpy(desten,
+ ifp->if_broadcastaddr, ifp->if_addrlen);
+ return (0);
+ }
+ if (m->m_flags & M_MCAST) {
+ /* multicast */
+ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
+ return (0);
+ }
+ }
+
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), 0, dst);
+ IF_AFDATA_RUNLOCK(ifp);
+
+ if (la == NULL)
+ return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags));
+
+ if ((la->la_flags & LLE_VALID) &&
+ ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
+ bcopy(&la->ll_addr, desten, ifp->if_addrlen);
+ renew = 0;
+ /*
+ * If entry has an expiry time and it is approaching,
+ * see if we need to send an ARP request within this
+ * arpt_down interval.
+ */
+ if (!(la->la_flags & LLE_STATIC) &&
+ time_uptime + la->la_preempt > la->la_expire) {
+ renew = 1;
+ la->la_preempt--;
+ }
+
+ if (pflags != NULL)
+ *pflags = la->la_flags;
+
+ LLE_RUNLOCK(la);
+
+ if (renew == 1)
+ arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
+
+ return (0);
+ }
+ LLE_RUNLOCK(la);
+
+ return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags));
+}
+
+/*
* Common length and type checks are done here,
* then the protocol-specific routine is called.
*/
@@ -576,10 +615,10 @@ in_arpinput(struct mbuf *m)
struct sockaddr sa;
struct in_addr isaddr, itaddr, myaddr;
u_int8_t *enaddr = NULL;
- int op, flags;
+ int op;
int req_len;
int bridged = 0, is_bridge = 0;
- int carped, create;
+ int carped;
struct sockaddr_in sin;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
@@ -708,6 +747,16 @@ match:
"%s!\n", inet_ntoa(isaddr));
goto drop;
}
+
+ if (ifp->if_addrlen != ah->ar_hln) {
+ LLE_WUNLOCK(la);
+ ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
+ "i/f %d (ignored)\n", ifp->if_addrlen,
+ (u_char *) ar_sha(ah), ":", ah->ar_hln,
+ ifp->if_addrlen);
+ goto drop;
+ }
+
/*
* Warn if another host is using the same IP address, but only if the
* IP address isn't 0.0.0.0, which is used for DHCP only, in which
@@ -730,100 +779,22 @@ match:
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
sin.sin_addr = isaddr;
- create = (itaddr.s_addr == myaddr.s_addr) ? 1 : 0;
- flags = LLE_EXCLUSIVE;
- IF_AFDATA_LOCK(ifp);
- if (create != 0)
- la = lla_create(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
- else
- la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
- IF_AFDATA_UNLOCK(ifp);
- if (la != NULL) {
- /* the following is not an error when doing bridging */
- if (!bridged && la->lle_tbl->llt_ifp != ifp) {
- if (log_arp_wrong_iface)
- ARP_LOG(LOG_WARNING, "%s is on %s "
- "but got reply from %*D on %s\n",
- inet_ntoa(isaddr),
- la->lle_tbl->llt_ifp->if_xname,
- ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
- ifp->if_xname);
- LLE_WUNLOCK(la);
- goto reply;
- }
- if ((la->la_flags & LLE_VALID) &&
- bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
- if (la->la_flags & LLE_STATIC) {
- LLE_WUNLOCK(la);
- if (log_arp_permanent_modify)
- ARP_LOG(LOG_ERR,
- "%*D attempts to modify "
- "permanent entry for %s on %s\n",
- ifp->if_addrlen,
- (u_char *)ar_sha(ah), ":",
- inet_ntoa(isaddr), ifp->if_xname);
- goto reply;
- }
- if (log_arp_movements) {
- ARP_LOG(LOG_INFO, "%s moved from %*D "
- "to %*D on %s\n",
- inet_ntoa(isaddr),
- ifp->if_addrlen,
- (u_char *)&la->ll_addr, ":",
- ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
- ifp->if_xname);
- }
- }
-
- if (ifp->if_addrlen != ah->ar_hln) {
- LLE_WUNLOCK(la);
- ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
- "i/f %d (ignored)\n", ifp->if_addrlen,
- (u_char *) ar_sha(ah), ":", ah->ar_hln,
- ifp->if_addrlen);
- goto drop;
- }
- (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
- la->la_flags |= LLE_VALID;
-
- EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
-
- if (!(la->la_flags & LLE_STATIC)) {
- int canceled;
-
- LLE_ADDREF(la);
- la->la_expire = time_uptime + V_arpt_keep;
- canceled = callout_reset(&la->lle_timer,
- hz * V_arpt_keep, arptimer, la);
- if (canceled)
- LLE_REMREF(la);
- }
- la->la_asked = 0;
- la->la_preempt = V_arp_maxtries;
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, (struct sockaddr *)&sin);
+ IF_AFDATA_RUNLOCK(ifp);
+ if (la != NULL)
+ arp_check_update_lle(ah, isaddr, ifp, bridged, la);
+ else if (itaddr.s_addr == myaddr.s_addr) {
/*
- * The packets are all freed within the call to the output
- * routine.
- *
- * NB: The lock MUST be released before the call to the
- * output routine.
+ * Reply to our address, but no lle exists yet.
+ * Do we really have to create an entry?
*/
- if (la->la_hold != NULL) {
- struct mbuf *m_hold, *m_hold_next;
-
- m_hold = la->la_hold;
- la->la_hold = NULL;
- la->la_numheld = 0;
- lltable_fill_sa_entry(la, (struct sockaddr *)&sa);
- LLE_WUNLOCK(la);
- for (; m_hold != NULL; m_hold = m_hold_next) {
- m_hold_next = m_hold->m_nextpkt;
- m_hold->m_nextpkt = NULL;
- /* Avoid confusing lower layers. */
- m_clrprotoflags(m_hold);
- (*ifp->if_output)(ifp, m_hold, &sa, NULL);
- }
- } else
- LLE_WUNLOCK(la);
+ IF_AFDATA_WLOCK(ifp);
+ la = lla_create(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
+ arp_update_lle(ah, ifp, la);
+ IF_AFDATA_WUNLOCK(ifp);
+ arp_mark_lle_reachable(la);
+ LLE_WUNLOCK(la);
}
reply:
if (op != ARPOP_REQUEST)
@@ -934,6 +905,140 @@ drop:
}
#endif
+/*
+ * Checks received arp data against existing @la.
+ * Updates lle state/performs notification if necessary.
+ */
+static void
+arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
+ int bridged, struct llentry *la)
+{
+ struct sockaddr sa;
+ struct mbuf *m_hold, *m_hold_next;
+
+ LLE_WLOCK_ASSERT(la);
+
+ /* the following is not an error when doing bridging */
+ if (!bridged && la->lle_tbl->llt_ifp != ifp) {
+ if (log_arp_wrong_iface)
+ ARP_LOG(LOG_WARNING, "%s is on %s "
+ "but got reply from %*D on %s\n",
+ inet_ntoa(isaddr),
+ la->lle_tbl->llt_ifp->if_xname,
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ LLE_WUNLOCK(la);
+ return;
+ }
+ if ((la->la_flags & LLE_VALID) &&
+ bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
+ if (la->la_flags & LLE_STATIC) {
+ LLE_WUNLOCK(la);
+ if (log_arp_permanent_modify)
+ ARP_LOG(LOG_ERR,
+ "%*D attempts to modify "
+ "permanent entry for %s on %s\n",
+ ifp->if_addrlen,
+ (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_xname);
+ return;
+ }
+ if (log_arp_movements) {
+ ARP_LOG(LOG_INFO, "%s moved from %*D "
+ "to %*D on %s\n",
+ inet_ntoa(isaddr),
+ ifp->if_addrlen,
+ (u_char *)&la->ll_addr, ":",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ }
+ }
+
+ /* Check if something has changed */
+ if (memcmp(&la->ll_addr, ar_sha(ah), ifp->if_addrlen) != 0 ||
+ (la->la_flags & LLE_VALID) == 0) {
+ /* Perform real LLE update */
+ /* use afdata WLOCK to update fields */
+ LLE_ADDREF(la);
+ LLE_WUNLOCK(la);
+ IF_AFDATA_WLOCK(ifp);
+ LLE_WLOCK(la);
+
+ /*
+ * Since we dropped the LLE lock, another thread might have
+ * deleted this lle. Check and return.
+ */
+ if ((la->la_flags & LLE_DELETED) != 0) {
+ IF_AFDATA_WUNLOCK(ifp);
+ LLE_FREE_LOCKED(la);
+ return;
+ }
+
+ /* Update data */
+ arp_update_lle(ah, ifp, la);
+
+ IF_AFDATA_WUNLOCK(ifp);
+ LLE_REMREF(la);
+ }
+
+ arp_mark_lle_reachable(la);
+
+ /*
+ * The packets are all freed within the call to the output
+ * routine.
+ *
+ * NB: The lock MUST be released before the call to the
+ * output routine.
+ */
+ if (la->la_hold != NULL) {
+ m_hold = la->la_hold;
+ la->la_hold = NULL;
+ la->la_numheld = 0;
+ lltable_fill_sa_entry(la, &sa);
+ LLE_WUNLOCK(la);
+ for (; m_hold != NULL; m_hold = m_hold_next) {
+ m_hold_next = m_hold->m_nextpkt;
+ m_hold->m_nextpkt = NULL;
+ /* Avoid confusing lower layers. */
+ m_clrprotoflags(m_hold);
+ (*ifp->if_output)(ifp, m_hold, &sa, NULL);
+ }
+ } else
+ LLE_WUNLOCK(la);
+}
+
+/*
+ * Updates @la fields used by fast path code.
+ */
+static void
+arp_update_lle(struct arphdr *ah, struct ifnet *ifp, struct llentry *la)
+{
+
+ memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
+ la->la_flags |= LLE_VALID;
+}
+
+static void
+arp_mark_lle_reachable(struct llentry *la)
+{
+ int canceled;
+
+ LLE_WLOCK_ASSERT(la);
+
+ EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
+
+ if (!(la->la_flags & LLE_STATIC)) {
+ LLE_ADDREF(la);
+ la->la_expire = time_uptime + V_arpt_keep;
+ canceled = callout_reset(&la->lle_timer,
+ hz * V_arpt_keep, arptimer, la);
+ if (canceled)
+ LLE_REMREF(la);
+ }
+ la->la_asked = 0;
+ la->la_preempt = V_arp_maxtries;
+}
+
void
arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
{
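
The net effect of the split: the common case now completes under a read
lock only, and the write-locked path lives entirely in
arpresolve_full(). Condensed (a paraphrase; expired() stands in for the
LLE_STATIC/la_expire test):

	la = lla_lookup(LLTABLE(ifp), 0, dst);		/* read lock */
	if (la == NULL)		/* miss: create under the write lock */
		return (arpresolve_full(ifp, is_gw, 1, m, dst, desten,
		    pflags));
	if ((la->la_flags & LLE_VALID) && !expired(la)) {
		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
		LLE_RUNLOCK(la);
		return (0);				/* fast path */
	}
	LLE_RUNLOCK(la);
	return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags));
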
diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c
index 6c8589e..3e72585 100644
--- a/sys/netinet/sctp_timer.c
+++ b/sys/netinet/sctp_timer.c
@@ -1492,6 +1492,8 @@ sctp_pathmtu_timer(struct sctp_inpcb *inp,
#endif
if (mtu > next_mtu) {
net->mtu = next_mtu;
+ } else {
+ net->mtu = mtu;
}
}
}
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
index f1d26cc..7ee525a 100644
--- a/sys/ofed/drivers/infiniband/core/cma.c
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -72,6 +72,11 @@ static int def_prec2sl = 3;
module_param_named(def_prec2sl, def_prec2sl, int, 0644);
MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
+static int unify_tcp_port_space = 1;
+module_param(unify_tcp_port_space, int, 0644);
+MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
+ "space allocation (default=1)");
+
static int debug_level = 0;
#define cma_pr(level, priv, format, arg...) \
printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg)
@@ -957,6 +962,8 @@ static void cma_release_port(struct rdma_id_private *id_priv)
kfree(bind_list);
}
mutex_unlock(&lock);
+ if (id_priv->sock)
+ sock_release(id_priv->sock);
}
static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
@@ -2449,6 +2456,42 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+ int ret;
+ int size;
+ struct socket *sock;
+
+ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret)
+ return ret;
+#ifdef __linux__
+ ret = sock->ops->bind(sock,
+ (struct sockaddr *) &id_priv->id.route.addr.src_addr,
+ ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+#else
+ ret = -sobind(sock,
+ (struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ curthread);
+#endif
+ if (ret) {
+ sock_release(sock);
+ return ret;
+ }
+
+ size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
+ ret = sock_getname(sock,
+ (struct sockaddr *) &id_priv->id.route.addr.src_addr,
+ &size, 0);
+ if (ret) {
+ sock_release(sock);
+ return ret;
+ }
+
+ id_priv->sock = sock;
+ return 0;
+}
+
static int cma_get_port(struct rdma_id_private *id_priv)
{
struct idr *ps;
@@ -2460,6 +2503,11 @@ static int cma_get_port(struct rdma_id_private *id_priv)
break;
case RDMA_PS_TCP:
ps = &tcp_ps;
+ if (unify_tcp_port_space) {
+ ret = cma_get_tcp_port(id_priv);
+ if (ret)
+ goto out;
+ }
break;
case RDMA_PS_UDP:
ps = &udp_ps;
@@ -2480,7 +2528,7 @@ static int cma_get_port(struct rdma_id_private *id_priv)
else
ret = cma_use_port(ps, id_priv);
mutex_unlock(&lock);
-
+out:
return ret;
}
diff --git a/sys/powerpc/powerpc/trap.c b/sys/powerpc/powerpc/trap.c
index 57008e9..d2e5eaa 100644
--- a/sys/powerpc/powerpc/trap.c
+++ b/sys/powerpc/powerpc/trap.c
@@ -413,8 +413,8 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user)
case EXC_DTMISS:
printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar);
#ifdef AIM
- printf(" dsisr = 0x%" PRIxPTR "\n",
- frame->cpu.aim.dsisr);
+ printf(" dsisr = 0x%lx\n",
+ (u_long)frame->cpu.aim.dsisr);
#endif
break;
case EXC_ISE:
@@ -438,7 +438,7 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user)
frame->cpu.booke.esr);
#endif
printf(" srr0 = 0x%" PRIxPTR "\n", frame->srr0);
- printf(" srr1 = 0x%" PRIxPTR "\n", frame->srr1);
+ printf(" srr1 = 0x%lx\n", (u_long)frame->srr1);
printf(" lr = 0x%" PRIxPTR "\n", frame->lr);
printf(" curthread = %p\n", curthread);
if (curthread != NULL)
diff --git a/sys/sys/ata.h b/sys/sys/ata.h
index 863f0e8..272b46a 100644
--- a/sys/sys/ata.h
+++ b/sys/sys/ata.h
@@ -399,6 +399,7 @@ struct ata_params {
#define ATA_IDLE_CMD 0xe3 /* idle */
#define ATA_READ_BUFFER 0xe4 /* read buffer */
#define ATA_READ_PM 0xe4 /* read portmultiplier */
+#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */
#define ATA_SLEEP 0xe6 /* sleep */
#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */
#define ATA_WRITE_PM 0xe8 /* write portmultiplier */
diff --git a/sys/sys/nv.h b/sys/sys/nv.h
index fa5d138..a985b6d 100644
--- a/sys/sys/nv.h
+++ b/sys/sys/nv.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -59,6 +60,11 @@ typedef struct nvlist nvlist_t;
#define NV_TYPE_NVLIST 5
#define NV_TYPE_DESCRIPTOR 6
#define NV_TYPE_BINARY 7
+#define NV_TYPE_BOOL_ARRAY 8
+#define NV_TYPE_NUMBER_ARRAY 9
+#define NV_TYPE_STRING_ARRAY 10
+#define NV_TYPE_NVLIST_ARRAY 11
+#define NV_TYPE_DESCRIPTOR_ARRAY 12
/*
* Perform case-insensitive lookups of provided names.
@@ -101,6 +107,11 @@ const char *nvlist_next(const nvlist_t *nvl, int *typep, void **cookiep);
const nvlist_t *nvlist_get_parent(const nvlist_t *nvl, void **cookiep);
+const nvlist_t *nvlist_get_array_next(const nvlist_t *nvl);
+bool nvlist_in_array(const nvlist_t *nvl);
+
+const nvlist_t *nvlist_get_pararr(const nvlist_t *nvl, void **cookiep);
+
/*
* The nvlist_exists functions check if the given name (optionally of the given
* type) exists on nvlist.
@@ -114,10 +125,15 @@ bool nvlist_exists_bool(const nvlist_t *nvl, const char *name);
bool nvlist_exists_number(const nvlist_t *nvl, const char *name);
bool nvlist_exists_string(const nvlist_t *nvl, const char *name);
bool nvlist_exists_nvlist(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_binary(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_bool_array(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_number_array(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_string_array(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_nvlist_array(const nvlist_t *nvl, const char *name);
#ifndef _KERNEL
bool nvlist_exists_descriptor(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_descriptor_array(const nvlist_t *nvl, const char *name);
#endif
-bool nvlist_exists_binary(const nvlist_t *nvl, const char *name);
/*
* The nvlist_add functions add the given name/value pair.
@@ -134,10 +150,15 @@ void nvlist_add_stringf(nvlist_t *nvl, const char *name, const char *valuefmt, .
void nvlist_add_stringv(nvlist_t *nvl, const char *name, const char *valuefmt, va_list valueap) __printflike(3, 0);
#endif
void nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *value);
+void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_t size);
+void nvlist_add_bool_array(nvlist_t *nvl, const char *name, const bool *value, size_t nitems);
+void nvlist_add_number_array(nvlist_t *nvl, const char *name, const uint64_t *value, size_t nitems);
+void nvlist_add_string_array(nvlist_t *nvl, const char *name, const char * const *value, size_t nitems);
+void nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, const nvlist_t * const *value, size_t nitems);
#ifndef _KERNEL
void nvlist_add_descriptor(nvlist_t *nvl, const char *name, int value);
+void nvlist_add_descriptor_array(nvlist_t *nvl, const char *name, const int *value, size_t nitems);
#endif
-void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_t size);
/*
* The nvlist_move functions add the given name/value pair.
@@ -146,10 +167,15 @@ void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_
void nvlist_move_string(nvlist_t *nvl, const char *name, char *value);
void nvlist_move_nvlist(nvlist_t *nvl, const char *name, nvlist_t *value);
+void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size);
+void nvlist_move_bool_array(nvlist_t *nvl, const char *name, bool *value, size_t nitems);
+void nvlist_move_string_array(nvlist_t *nvl, const char *name, char **value, size_t nitems);
+void nvlist_move_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **value, size_t nitems);
+void nvlist_move_number_array(nvlist_t *nvl, const char *name, uint64_t *value, size_t nitems);
#ifndef _KERNEL
void nvlist_move_descriptor(nvlist_t *nvl, const char *name, int value);
+void nvlist_move_descriptor_array(nvlist_t *nvl, const char *name, int *value, size_t nitems);
#endif
-void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size);
/*
* The nvlist_get functions returns value associated with the given name.
@@ -157,14 +183,19 @@ void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t siz
* not be freed by the caller.
*/
-bool nvlist_get_bool(const nvlist_t *nvl, const char *name);
-uint64_t nvlist_get_number(const nvlist_t *nvl, const char *name);
-const char *nvlist_get_string(const nvlist_t *nvl, const char *name);
-const nvlist_t *nvlist_get_nvlist(const nvlist_t *nvl, const char *name);
+bool nvlist_get_bool(const nvlist_t *nvl, const char *name);
+uint64_t nvlist_get_number(const nvlist_t *nvl, const char *name);
+const char *nvlist_get_string(const nvlist_t *nvl, const char *name);
+const nvlist_t *nvlist_get_nvlist(const nvlist_t *nvl, const char *name);
+const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep);
+const bool *nvlist_get_bool_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
+const uint64_t *nvlist_get_number_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
+const char * const *nvlist_get_string_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
+const nvlist_t * const *nvlist_get_nvlist_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
#ifndef _KERNEL
-int nvlist_get_descriptor(const nvlist_t *nvl, const char *name);
+int nvlist_get_descriptor(const nvlist_t *nvl, const char *name);
+const int *nvlist_get_descriptor_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
#endif
-const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep);
/*
* The nvlist_take functions returns value associated with the given name and
@@ -172,14 +203,19 @@ const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *siz
* The caller is responsible for freeing received data.
*/
-bool nvlist_take_bool(nvlist_t *nvl, const char *name);
-uint64_t nvlist_take_number(nvlist_t *nvl, const char *name);
-char *nvlist_take_string(nvlist_t *nvl, const char *name);
-nvlist_t *nvlist_take_nvlist(nvlist_t *nvl, const char *name);
+bool nvlist_take_bool(nvlist_t *nvl, const char *name);
+uint64_t nvlist_take_number(nvlist_t *nvl, const char *name);
+char *nvlist_take_string(nvlist_t *nvl, const char *name);
+nvlist_t *nvlist_take_nvlist(nvlist_t *nvl, const char *name);
+void *nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep);
+bool *nvlist_take_bool_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
+uint64_t *nvlist_take_number_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
+char **nvlist_take_string_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
+nvlist_t **nvlist_take_nvlist_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
#ifndef _KERNEL
int nvlist_take_descriptor(nvlist_t *nvl, const char *name);
+int *nvlist_take_descriptor_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
#endif
-void *nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep);
/*
* The nvlist_free functions removes the given name/value pair from the nvlist
@@ -194,10 +230,16 @@ void nvlist_free_bool(nvlist_t *nvl, const char *name);
void nvlist_free_number(nvlist_t *nvl, const char *name);
void nvlist_free_string(nvlist_t *nvl, const char *name);
void nvlist_free_nvlist(nvlist_t *nvl, const char *name);
+void nvlist_free_binary(nvlist_t *nvl, const char *name);
+void nvlist_free_bool_array(nvlist_t *nvl, const char *name);
+void nvlist_free_number_array(nvlist_t *nvl, const char *name);
+void nvlist_free_string_array(nvlist_t *nvl, const char *name);
+void nvlist_free_nvlist_array(nvlist_t *nvl, const char *name);
+void nvlist_free_binary_array(nvlist_t *nvl, const char *name);
#ifndef _KERNEL
void nvlist_free_descriptor(nvlist_t *nvl, const char *name);
+void nvlist_free_descriptor_array(nvlist_t *nvl, const char *name);
#endif
-void nvlist_free_binary(nvlist_t *nvl, const char *name);
__END_DECLS
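
A minimal usage sketch for the new typed-array variants (userland
libnv; error handling omitted):

	#include <sys/param.h>
	#include <sys/nv.h>

	nvlist_t *nvl = nvlist_create(0);
	uint64_t vals[] = { 1, 2, 3 };

	nvlist_add_number_array(nvl, "vals", vals, nitems(vals));

	size_t n;
	const uint64_t *p = nvlist_get_number_array(nvl, "vals", &n);
	/* p[0..n-1] stay owned by the nvlist; nvlist_take_number_array()
	 * detaches an allocated copy that the caller must free. */
	nvlist_destroy(nvl);
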
diff --git a/sys/sys/random.h b/sys/sys/random.h
index 78a9955..92eb80f 100644
--- a/sys/sys/random.h
+++ b/sys/sys/random.h
@@ -33,10 +33,29 @@
#include <sys/types.h>
+#include "opt_random.h"
+
+#if defined(RANDOM_LOADABLE) && defined(RANDOM_YARROW)
+#error "Cannot define both RANDOM_LOADABLE and RANDOM_YARROW"
+#endif
+
struct uio;
+#if defined(DEV_RANDOM)
u_int read_random(void *, u_int);
int read_random_uio(struct uio *, bool);
+#else
+static __inline int
+read_random_uio(void *a __unused, u_int b __unused)
+{
+ return (0);
+}
+static __inline u_int
+read_random(void *a __unused, u_int b __unused)
+{
+ return (0);
+}
+#endif
/*
* Note: if you add or remove members of random_entropy_source, remember to also update the
@@ -76,15 +95,15 @@ enum random_entropy_source {
#define RANDOM_HARVEST_EVERYTHING_MASK ((1 << (RANDOM_ENVIRONMENTAL_END + 1)) - 1)
-#if defined(RANDOM_DUMMY)
-#define random_harvest_queue(a, b, c, d) do {} while (0)
-#define random_harvest_fast(a, b, c, d) do {} while (0)
-#define random_harvest_direct(a, b, c, d) do {} while (0)
-#else /* !defined(RANDOM_DUMMY) */
+#if defined(DEV_RANDOM)
void random_harvest_queue(const void *, u_int, u_int, enum random_entropy_source);
void random_harvest_fast(const void *, u_int, u_int, enum random_entropy_source);
void random_harvest_direct(const void *, u_int, u_int, enum random_entropy_source);
-#endif /* defined(RANDOM_DUMMY) */
+#else
+#define random_harvest_queue(a, b, c, d) do {} while (0)
+#define random_harvest_fast(a, b, c, d) do {} while (0)
+#define random_harvest_direct(a, b, c, d) do {} while (0)
+#endif
#endif /* _KERNEL */
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index 112bb2c..26cf9a6 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -78,7 +78,7 @@ struct socket {
short so_state; /* (b) internal state flags SS_* */
int so_qstate; /* (e) internal state flags SQ_* */
void *so_pcb; /* protocol control block */
- struct vnet *so_vnet; /* network stack instance */
+ struct vnet *so_vnet; /* (a) network stack instance */
struct protosw *so_proto; /* (a) protocol handle */
/*
* Variables for connection queuing.
diff --git a/sys/sys/timeet.h b/sys/sys/timeet.h
index 728578b..3d50e51 100644
--- a/sys/sys/timeet.h
+++ b/sys/sys/timeet.h
@@ -53,7 +53,7 @@ typedef int et_deregister_cb_t(struct eventtimer *et, void *arg);
struct eventtimer {
SLIST_ENTRY(eventtimer) et_all;
/* Pointer to the next event timer. */
- char *et_name;
+ const char *et_name;
/* Name of the event timer. */
int et_flags;
/* Set of capabilities flags: */
diff --git a/sys/sys/timetc.h b/sys/sys/timetc.h
index e68e327..8f00e22 100644
--- a/sys/sys/timetc.h
+++ b/sys/sys/timetc.h
@@ -49,7 +49,7 @@ struct timecounter {
/* This mask should mask off any unimplemented bits. */
uint64_t tc_frequency;
/* Frequency of the counter in Hz. */
- char *tc_name;
+ const char *tc_name;
/* Name of the timecounter. */
int tc_quality;
/*
diff --git a/sys/teken/demo/teken_demo.c b/sys/teken/demo/teken_demo.c
index 08323dc..42747ce 100644
--- a/sys/teken/demo/teken_demo.c
+++ b/sys/teken/demo/teken_demo.c
@@ -72,7 +72,7 @@ struct pixel {
#define NCOLS 80
#define NROWS 24
-struct pixel buffer[NCOLS][NROWS];
+static struct pixel buffer[NCOLS][NROWS];
static int ptfd;
diff --git a/sys/teken/teken.c b/sys/teken/teken.c
index 3002a88..8834390 100644
--- a/sys/teken/teken.c
+++ b/sys/teken/teken.c
@@ -29,12 +29,14 @@
#include <sys/cdefs.h>
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/param.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/systm.h>
#define teken_assert(x) MPASS(x)
#else /* !(__FreeBSD__ && _KERNEL) */
#include <sys/types.h>
#include <assert.h>
+#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
@@ -405,18 +407,24 @@ teken_state_numbers(teken_t *t, teken_char_t c)
teken_assert(t->t_curnum < T_NUMSIZE);
if (c >= '0' && c <= '9') {
- /*
- * Don't do math with the default value of 1 when a
- * custom number is inserted.
- */
if (t->t_stateflags & TS_FIRSTDIGIT) {
+ /* First digit. */
t->t_stateflags &= ~TS_FIRSTDIGIT;
- t->t_nums[t->t_curnum] = 0;
- } else {
- t->t_nums[t->t_curnum] *= 10;
+ t->t_nums[t->t_curnum] = c - '0';
+ } else if (t->t_nums[t->t_curnum] < UINT_MAX / 100) {
+ /*
+ * There is no need to continue parsing input
+ * once the value exceeds the size of the
+ * terminal. It would only allow for integer
+ * overflows when performing arithmetic on the
+ * cursor position.
+ *
+ * Ignore any further digits if the value is
+ * already UINT_MAX / 100.
+ */
+ t->t_nums[t->t_curnum] =
+ t->t_nums[t->t_curnum] * 10 + c - '0';
}
-
- t->t_nums[t->t_curnum] += c - '0';
return (1);
} else if (c == ';') {
if (t->t_stateflags & TS_FIRSTDIGIT)
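
The clamp bounds every CSI parameter well below UINT_MAX: digits are
accepted only while the running value is below UINT_MAX / 100
(42949672), so the largest reachable value is 42949671 * 10 + 9 =
429496719, and the "value * 10 + digit" arithmetic can never overflow.
Trace for the input parameter "4294967296" (2^32):

	/* value: 4 -> 42 -> 429 -> ... -> 42949672; at that point
	 * value >= UINT_MAX / 100, so the remaining digits "96" are
	 * ignored and the parameter saturates at 42949672, far above
	 * any real terminal dimension but safely below UINT_MAX. */
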
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 91affa0..13916c0 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -566,11 +566,6 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_sunbusy(mt);
- if (vm_page_count_severe()) {
- vm_page_lock(mt);
- vm_page_try_to_cache(mt);
- vm_page_unlock(mt);
- }
}
}
if (prunlen != NULL)
diff --git a/sys/x86/iommu/intel_idpgtbl.c b/sys/x86/iommu/intel_idpgtbl.c
index 405976b..d52e8d4 100644
--- a/sys/x86/iommu/intel_idpgtbl.c
+++ b/sys/x86/iommu/intel_idpgtbl.c
@@ -374,8 +374,9 @@ retry:
KASSERT(lvl > 0,
("lost root page table page %p", domain));
/*
- * Page table page does not exists, allocate
- * it and create pte in the up level.
+ * Page table page does not exist, allocate
+ * it and create a pte in the preceding page level
+ * to reference the allocated page table page.
*/
m = dmar_pgalloc(domain->pgtbl_obj, idx, flags |
DMAR_PGF_ZERO);
diff --git a/sys/x86/x86/busdma_bounce.c b/sys/x86/x86/busdma_bounce.c
index dcdeafa..48c500f 100644
--- a/sys/x86/x86/busdma_bounce.c
+++ b/sys/x86/x86/busdma_bounce.c
@@ -79,7 +79,8 @@ struct bounce_page {
vm_offset_t vaddr; /* kva of bounce buffer */
bus_addr_t busaddr; /* Physical address */
vm_offset_t datavaddr; /* kva of client data */
- bus_addr_t dataaddr; /* client physical address */
+ vm_page_t datapage; /* physical page of client data */
+ vm_offset_t dataoffs; /* page offset of client data */
bus_size_t datacount; /* client data count */
STAILQ_ENTRY(bounce_page) links;
};
@@ -658,7 +659,7 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
{
bus_size_t sgsize, max_sgsize;
bus_addr_t curaddr;
- vm_offset_t vaddr;
+ vm_offset_t kvaddr, vaddr;
int error;
if (map == NULL)
@@ -681,10 +682,13 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
/*
* Get the physical address for this segment.
*/
- if (pmap == kernel_pmap)
+ if (pmap == kernel_pmap) {
curaddr = pmap_kextract(vaddr);
- else
+ kvaddr = vaddr;
+ } else {
curaddr = pmap_extract(pmap, vaddr);
+ kvaddr = 0;
+ }
/*
* Compute the segment size, and adjust counts.
@@ -696,7 +700,7 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
bus_dma_run_filter(&dmat->common, curaddr)) {
sgsize = roundup2(sgsize, dmat->common.alignment);
sgsize = MIN(sgsize, max_sgsize);
- curaddr = add_bounce_page(dmat, map, vaddr, curaddr,
+ curaddr = add_bounce_page(dmat, map, kvaddr, curaddr,
sgsize);
} else {
sgsize = MIN(sgsize, max_sgsize);
@@ -757,48 +761,56 @@ bounce_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map,
bus_dmasync_op_t op)
{
struct bounce_page *bpage;
+ vm_offset_t datavaddr, tempvaddr;
- if ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) {
- /*
- * Handle data bouncing. We might also
- * want to add support for invalidating
- * the caches on broken hardware
- */
- CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
- "performing bounce", __func__, dmat,
- dmat->common.flags, op);
-
- if ((op & BUS_DMASYNC_PREWRITE) != 0) {
- while (bpage != NULL) {
- if (bpage->datavaddr != 0) {
- bcopy((void *)bpage->datavaddr,
- (void *)bpage->vaddr,
- bpage->datacount);
- } else {
- physcopyout(bpage->dataaddr,
- (void *)bpage->vaddr,
- bpage->datacount);
- }
- bpage = STAILQ_NEXT(bpage, links);
+ if ((bpage = STAILQ_FIRST(&map->bpages)) == NULL)
+ return;
+
+ /*
+ * Handle data bouncing. We might also want to add support for
+ * invalidating the caches on broken hardware.
+ */
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
+ "performing bounce", __func__, dmat, dmat->common.flags, op);
+
+ if ((op & BUS_DMASYNC_PREWRITE) != 0) {
+ while (bpage != NULL) {
+ tempvaddr = 0;
+ datavaddr = bpage->datavaddr;
+ if (datavaddr == 0) {
+ tempvaddr =
+ pmap_quick_enter_page(bpage->datapage);
+ datavaddr = tempvaddr | bpage->dataoffs;
}
- dmat->bounce_zone->total_bounced++;
+
+ bcopy((void *)datavaddr,
+ (void *)bpage->vaddr, bpage->datacount);
+
+ if (tempvaddr != 0)
+ pmap_quick_remove_page(tempvaddr);
+ bpage = STAILQ_NEXT(bpage, links);
}
+ dmat->bounce_zone->total_bounced++;
+ }
- if ((op & BUS_DMASYNC_POSTREAD) != 0) {
- while (bpage != NULL) {
- if (bpage->datavaddr != 0) {
- bcopy((void *)bpage->vaddr,
- (void *)bpage->datavaddr,
- bpage->datacount);
- } else {
- physcopyin((void *)bpage->vaddr,
- bpage->dataaddr,
- bpage->datacount);
- }
- bpage = STAILQ_NEXT(bpage, links);
+ if ((op & BUS_DMASYNC_POSTREAD) != 0) {
+ while (bpage != NULL) {
+ tempvaddr = 0;
+ datavaddr = bpage->datavaddr;
+ if (datavaddr == 0) {
+ tempvaddr =
+ pmap_quick_enter_page(bpage->datapage);
+ datavaddr = tempvaddr | bpage->dataoffs;
}
- dmat->bounce_zone->total_bounced++;
+
+ bcopy((void *)bpage->vaddr,
+ (void *)datavaddr, bpage->datacount);
+
+ if (tempvaddr != 0)
+ pmap_quick_remove_page(tempvaddr);
+ bpage = STAILQ_NEXT(bpage, links);
}
+ dmat->bounce_zone->total_bounced++;
}
}
@@ -993,7 +1005,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
bpage->busaddr |= addr & PAGE_MASK;
}
bpage->datavaddr = vaddr;
- bpage->dataaddr = addr;
+ bpage->datapage = PHYS_TO_VM_PAGE(addr & ~PAGE_MASK);
+ bpage->dataoffs = addr & PAGE_MASK;
bpage->datacount = size;
STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
return (bpage->busaddr);
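
Recording the client page as a (vm_page_t, offset) pair instead of a
raw physical address lets the sync path above copy through a transient
per-CPU mapping rather than physcopyin()/physcopyout(), which works
even when the client buffer has no KVA (e.g. loaded by physical
address). Condensed from the PREWRITE leg:

	vm_offset_t kva = pmap_quick_enter_page(bpage->datapage);
	bcopy((void *)(kva | bpage->dataoffs), (void *)bpage->vaddr,
	    bpage->datacount);
	pmap_quick_remove_page(kva);
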
diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h
index d0a44ae..9e82124 100644
--- a/sys/xen/gnttab.h
+++ b/sys/xen/gnttab.h
@@ -126,10 +126,8 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, vm_paddr_t addr,
{
if (flags & GNTMAP_contains_pte)
map->host_addr = addr;
- else if (xen_feature(XENFEAT_auto_translated_physmap))
- map->host_addr = vtophys(addr);
else
- map->host_addr = addr;
+ map->host_addr = vtophys(addr);
map->flags = flags;
map->ref = ref;
@@ -142,10 +140,8 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, vm_paddr_t addr,
{
if (flags & GNTMAP_contains_pte)
unmap->host_addr = addr;
- else if (xen_feature(XENFEAT_auto_translated_physmap))
- unmap->host_addr = vtophys(addr);
else
- unmap->host_addr = addr;
+ unmap->host_addr = vtophys(addr);
unmap->handle = handle;
unmap->dev_bus_addr = 0;
@@ -155,13 +151,8 @@ static inline void
gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr,
vm_paddr_t new_addr, grant_handle_t handle)
{
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
- unmap->host_addr = vtophys(addr);
- unmap->new_addr = vtophys(new_addr);
- } else {
- unmap->host_addr = addr;
- unmap->new_addr = new_addr;
- }
+ unmap->host_addr = vtophys(addr);
+ unmap->new_addr = vtophys(new_addr);
unmap->handle = handle;
}