Diffstat (limited to 'sys')
199 files changed, 8063 insertions, 5176 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 1e64fc8..41dea8b 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -390,6 +390,8 @@ static struct md_page *pv_table; */ pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; +static vm_offset_t qframe = 0; +static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ @@ -1031,7 +1033,7 @@ pmap_init(void) struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; - int i, pv_npg; + int error, i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's @@ -1112,6 +1114,12 @@ pmap_init(void) printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } + + mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); + error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, + (vmem_addr_t *)&qframe); + if (error != 0) + panic("qframe allocation failed"); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, @@ -7019,13 +7027,27 @@ pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, vm_offset_t pmap_quick_enter_page(vm_page_t m) { + vm_paddr_t paddr; - return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); + paddr = VM_PAGE_TO_PHYS(m); + if (paddr < dmaplimit) + return (PHYS_TO_DMAP(paddr)); + mtx_lock_spin(&qframe_mtx); + KASSERT(*vtopte(qframe) == 0, ("qframe busy")); + pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | + X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); + return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { + + if (addr != qframe) + return; + pte_store(vtopte(qframe), 0); + invlpg(qframe); + mtx_unlock_spin(&qframe_mtx); } #include "opt_ddb.h" diff --git a/sys/arm/arm/cpufunc.c b/sys/arm/arm/cpufunc.c index 0b589ed..fea0581 100644 --- a/sys/arm/arm/cpufunc.c +++ b/sys/arm/arm/cpufunc.c @@ -904,6 +904,7 @@ set_cpufuncs() cputype == CPU_ID_CORTEXA9R1 || cputype == CPU_ID_CORTEXA9R2 || cputype == CPU_ID_CORTEXA9R3 || + cputype == CPU_ID_CORTEXA9R4 || cputype == CPU_ID_CORTEXA12R0 || cputype == CPU_ID_CORTEXA15R0 || cputype == CPU_ID_CORTEXA15R1 || diff --git a/sys/arm/arm/identcpu.c b/sys/arm/arm/identcpu.c index 75bf08c..be1393b1 100644 --- a/sys/arm/arm/identcpu.c +++ b/sys/arm/arm/identcpu.c @@ -185,6 +185,8 @@ const struct cpuidtab cpuids[] = { generic_steppings }, { CPU_ID_CORTEXA9R3, CPU_CLASS_CORTEXA, "Cortex A9-r3", generic_steppings }, + { CPU_ID_CORTEXA9R4, CPU_CLASS_CORTEXA, "Cortex A9-r4", + generic_steppings }, { CPU_ID_CORTEXA12R0, CPU_CLASS_CORTEXA, "Cortex A12-r0", generic_steppings }, { CPU_ID_CORTEXA15R0, CPU_CLASS_CORTEXA, "Cortex A15-r0", diff --git a/sys/arm/arm/pmap-v6-new.c b/sys/arm/arm/pmap-v6-new.c index b18648f..864e05c 100644 --- a/sys/arm/arm/pmap-v6-new.c +++ b/sys/arm/arm/pmap-v6-new.c @@ -1166,10 +1166,9 @@ pmap_init_qpages(void) pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) - panic("pmap_init_qpages: unable to allocate KVA"); + panic("%s: unable to allocate KVA", __func__); } } - SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL); /* @@ -5728,18 +5727,17 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t pmap_quick_enter_page(vm_page_t m) { - pt2_entry_t *pte; - vm_offset_t qmap_addr; + pt2_entry_t *pte2p; + vm_offset_t qmap_addr; critical_enter(); - qmap_addr = PCPU_GET(qmap_addr); - pte = pt2map_entry(qmap_addr); + pte2p = pt2map_entry(qmap_addr); - KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); + 
KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); - pte2_store(pte, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), - PTE2_AP_KRW, pmap_page_get_memattr(m))); + pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, + pmap_page_get_memattr(m))); tlb_flush_local(qmap_addr); return (qmap_addr); @@ -5748,16 +5746,16 @@ pmap_quick_enter_page(vm_page_t m) void pmap_quick_remove_page(vm_offset_t addr) { - pt2_entry_t *pte; + pt2_entry_t *pte2p; vm_offset_t qmap_addr; qmap_addr = PCPU_GET(qmap_addr); - pte = pt2map_entry(qmap_addr); + pte2p = pt2map_entry(qmap_addr); - KASSERT(addr == qmap_addr, ("pmap_quick_remove_page: invalid address")); - KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); + KASSERT(addr == qmap_addr, ("%s: invalid address", __func__)); + KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); - pte2_clear(pte); + pte2_clear(pte2p); critical_exit(); } diff --git a/sys/arm/arm/stdatomic.c b/sys/arm/arm/stdatomic.c index 211f26a..3c0b997 100644 --- a/sys/arm/arm/stdatomic.c +++ b/sys/arm/arm/stdatomic.c @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <machine/acle-compat.h> +#include <machine/atomic.h> #include <machine/cpufunc.h> #include <machine/sysarch.h> @@ -67,19 +68,12 @@ do_sync(void) __asm volatile ("" : : : "memory"); } -#elif __ARM_ARCH >= 7 -static inline void -do_sync(void) -{ - - __asm volatile ("dmb" : : : "memory"); -} #elif __ARM_ARCH >= 6 static inline void do_sync(void) { - __asm volatile ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory"); + dmb(); } #endif diff --git a/sys/arm/broadcom/bcm2835/bcm2835_systimer.c b/sys/arm/broadcom/bcm2835/bcm2835_systimer.c index 93bf676..731c7d0 100644 --- a/sys/arm/broadcom/bcm2835/bcm2835_systimer.c +++ b/sys/arm/broadcom/bcm2835/bcm2835_systimer.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #define BCM2835_NUM_TIMERS 4 #define DEFAULT_TIMER 3 +#define DEFAULT_TIMER_NAME "BCM2835-3" #define DEFAULT_FREQUENCY 1000000 #define MIN_PERIOD 5LLU @@ -101,7 +102,7 @@ static struct bcm_systimer_softc *bcm_systimer_sc = NULL; static unsigned bcm_systimer_tc_get_timecount(struct timecounter *); static struct timecounter bcm_systimer_tc = { - .tc_name = "BCM2835 Timecounter", + .tc_name = DEFAULT_TIMER_NAME, .tc_get_timecount = bcm_systimer_tc_get_timecount, .tc_poll_pps = NULL, .tc_counter_mask = ~0u, @@ -238,8 +239,7 @@ bcm_systimer_attach(device_t dev) sc->st[DEFAULT_TIMER].index = DEFAULT_TIMER; sc->st[DEFAULT_TIMER].enabled = 0; - sc->st[DEFAULT_TIMER].et.et_name = malloc(64, M_DEVBUF, M_NOWAIT | M_ZERO); - sprintf(sc->st[DEFAULT_TIMER].et.et_name, "BCM2835 Event Timer %d", DEFAULT_TIMER); + sc->st[DEFAULT_TIMER].et.et_name = DEFAULT_TIMER_NAME; sc->st[DEFAULT_TIMER].et.et_flags = ET_FLAGS_ONESHOT; sc->st[DEFAULT_TIMER].et.et_quality = 1000; sc->st[DEFAULT_TIMER].et.et_frequency = sc->sysclk_freq; diff --git a/sys/arm/conf/BEAGLEBONE b/sys/arm/conf/BEAGLEBONE index a0ca1b6..12b8290 100644 --- a/sys/arm/conf/BEAGLEBONE +++ b/sys/arm/conf/BEAGLEBONE @@ -26,7 +26,7 @@ ident BEAGLEBONE include "std.armv6" include "../ti/am335x/std.am335x" -makeoptions MODULES_EXTRA="dtb/am335x" +makeoptions MODULES_EXTRA="dtb/am335x am335x_dmtpps" # DTrace support options KDTRACE_HOOKS # Kernel DTrace hooks @@ -77,6 +77,7 @@ device ti_i2c device am335x_pmic # AM335x Power Management IC (TPC65217) device am335x_rtc # RTC support (power management only) +#device am335x_dmtpps # Pulse Per Second capture driver # Console and misc device uart diff --git a/sys/arm/include/armreg.h 
b/sys/arm/include/armreg.h index 9358703..a300ddf 100644 --- a/sys/arm/include/armreg.h +++ b/sys/arm/include/armreg.h @@ -133,6 +133,7 @@ #define CPU_ID_CORTEXA9R1 0x411fc090 #define CPU_ID_CORTEXA9R2 0x412fc090 #define CPU_ID_CORTEXA9R3 0x413fc090 +#define CPU_ID_CORTEXA9R4 0x414fc090 #define CPU_ID_CORTEXA12R0 0x410fc0d0 #define CPU_ID_CORTEXA15R0 0x410fc0f0 #define CPU_ID_CORTEXA15R1 0x411fc0f0 diff --git a/sys/arm/ti/am335x/am335x_dmtpps.c b/sys/arm/ti/am335x/am335x_dmtpps.c new file mode 100644 index 0000000..08b4104 --- /dev/null +++ b/sys/arm/ti/am335x/am335x_dmtpps.c @@ -0,0 +1,549 @@ +/*- + * Copyright (c) 2015 Ian Lepore <ian@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * AM335x PPS driver using DMTimer capture. + * + * Note that this PPS driver does not use an interrupt. Instead it uses the + * hardware's ability to latch the timer's count register in response to a + * signal on an IO pin. Each of timers 4-7 has an associated pin, and this + * code allows any one of those to be used. + * + * The timecounter routines in kern_tc.c call the pps poll routine periodically + * to see if a new counter value has been latched. When a new value has been + * latched, the only processing done in the poll routine is to capture the + * current set of timecounter timehands (done with pps_capture()) and the + * latched value from the timer. The remaining work (done by pps_event() while + * holding a mutex) is scheduled to be done later in a non-interrupt context. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/timepps.h> +#include <sys/timetc.h> +#include <machine/bus.h> + +#include <dev/ofw/openfirm.h> +#include <dev/ofw/ofw_bus.h> +#include <dev/ofw/ofw_bus_subr.h> + +#include <arm/ti/ti_prcm.h> +#include <arm/ti/ti_hwmods.h> +#include <arm/ti/ti_pinmux.h> +#include <arm/ti/am335x/am335x_scm_padconf.h> + +#include "am335x_dmtreg.h" + +#define PPS_CDEV_NAME "dmtpps" + +struct dmtpps_softc { + device_t dev; + int mem_rid; + struct resource * mem_res; + int tmr_num; /* N from hwmod str "timerN" */ + char tmr_name[12]; /* "DMTimerN" */ + uint32_t tclr; /* Cached TCLR register. */ + struct timecounter tc; + int pps_curmode; /* Edge mode now set in hw. */ + struct task pps_task; /* For pps_event handling. */ + struct cdev * pps_cdev; + struct pps_state pps_state; + struct mtx pps_mtx; +}; + +static int dmtpps_tmr_num; /* Set by probe() */ + +/* List of compatible strings for FDT tree */ +static struct ofw_compat_data compat_data[] = { + {"ti,am335x-timer", 1}, + {"ti,am335x-timer-1ms", 1}, + {NULL, 0}, +}; + +/* + * A table relating pad names to the hardware timer number they can be mux'd to. + */ +struct padinfo { + char * ballname; + int tmr_num; +}; +static struct padinfo dmtpps_padinfo[] = { + {"GPMC_ADVn_ALE", 4}, + {"I2C0_SDA", 4}, + {"MII1_TX_EN", 4}, + {"XDMA_EVENT_INTR0", 4}, + {"GPMC_BEn0_CLE", 5}, + {"MDC", 5}, + {"MMC0_DAT3", 5}, + {"UART1_RTSn", 5}, + {"GPMC_WEn", 6}, + {"MDIO", 6}, + {"MMC0_DAT2", 6}, + {"UART1_CTSn", 6}, + {"GPMC_OEn_REn", 7}, + {"I2C0_SCL", 7}, + {"UART0_CTSn", 7}, + {"XDMA_EVENT_INTR1", 7}, + {NULL, 0} +}; + +/* + * This is either brilliantly user-friendly, or utterly lame... + * + * The am335x chip is used on the popular Beaglebone boards. Those boards have + * pins for all four capture-capable timers available on the P8 header. Allow + * users to configure the input pin by giving the name of the header pin. + */ +struct nicknames { + const char * nick; + const char * name; +}; +static struct nicknames dmtpps_pin_nicks[] = { + {"P8-7", "GPMC_ADVn_ALE"}, + {"P8-9", "GPMC_BEn0_CLE"}, + {"P8-10", "GPMC_WEn"}, + {"P8-8", "GPMC_OEn_REn",}, + {NULL, NULL} +}; + +#define DMTIMER_READ4(sc, reg) bus_read_4((sc)->mem_res, (reg)) +#define DMTIMER_WRITE4(sc, reg, val) bus_write_4((sc)->mem_res, (reg), (val)) + +/* + * Translate a short friendly case-insensitive name to its canonical name. + */ +static const char * +dmtpps_translate_nickname(const char *nick) +{ + struct nicknames *nn; + + for (nn = dmtpps_pin_nicks; nn->nick != NULL; nn++) + if (strcasecmp(nick, nn->nick) == 0) + return nn->name; + return (nick); +} + +/* + * See if our tunable is set to the name of the input pin. If not, that's NOT + * an error, return 0. If so, try to configure that pin as a timer capture + * input pin, and if that works, then we have our timer unit number and if it + * fails that IS an error, return -1. 
+ */ +static int +dmtpps_find_tmr_num_by_tunable() +{ + struct padinfo *pi; + char iname[20]; + char muxmode[12]; + const char * ballname; + int err; + + if (!TUNABLE_STR_FETCH("hw.am335x_dmtpps.input", iname, sizeof(iname))) + return (0); + ballname = dmtpps_translate_nickname(iname); + for (pi = dmtpps_padinfo; pi->ballname != NULL; pi++) { + if (strcmp(ballname, pi->ballname) != 0) + continue; + snprintf(muxmode, sizeof(muxmode), "timer%d", pi->tmr_num); + err = ti_pinmux_padconf_set(pi->ballname, muxmode, + PADCONF_INPUT); + if (err != 0) { + printf("am335x_dmtpps: unable to configure capture pin " + "for %s to input mode\n", muxmode); + return (-1); + } else if (bootverbose) { + printf("am335x_dmtpps: configured pin %s as input " + "for %s\n", iname, muxmode); + } + return (pi->tmr_num); + } + + /* Invalid name in the tunable, that's an error. */ + printf("am335x_dmtpps: unknown pin name '%s'\n", iname); + return (-1); +} + +/* + * Ask the pinmux driver whether any pin has been configured as a TIMER4..TIMER7 + * input pin. If so, return the timer number, if not return 0. + */ +static int +dmtpps_find_tmr_num_by_padconf() +{ + int err; + unsigned int padstate; + const char * padmux; + struct padinfo *pi; + char muxmode[12]; + + for (pi = dmtpps_padinfo; pi->ballname != NULL; pi++) { + err = ti_pinmux_padconf_get(pi->ballname, &padmux, &padstate); + snprintf(muxmode, sizeof(muxmode), "timer%d", pi->tmr_num); + if (err == 0 && (padstate & RXACTIVE) != 0 && + strcmp(muxmode, padmux) == 0) + return (pi->tmr_num); + } + /* Nothing found, not an error. */ + return (0); +} + +/* + * Figure out which hardware timer number to use based on input pin + * configuration. This is done just once, the first time probe() runs. + */ +static int +dmtpps_find_tmr_num() +{ + int tmr_num; + + if ((tmr_num = dmtpps_find_tmr_num_by_tunable()) == 0) + tmr_num = dmtpps_find_tmr_num_by_padconf(); + + if (tmr_num <= 0) { + printf("am335x_dmtpps: PPS driver not enabled: unable to find " + "or configure a capture input pin\n"); + tmr_num = -1; /* Must return non-zero to prevent re-probing. */ + } + return (tmr_num); +} + +static void +dmtpps_set_hw_capture(struct dmtpps_softc *sc, bool force_off) +{ + int newmode; + + if (force_off) + newmode = 0; + else + newmode = sc->pps_state.ppsparam.mode & PPS_CAPTUREASSERT; + + if (newmode == sc->pps_curmode) + return; + sc->pps_curmode = newmode; + + if (newmode == PPS_CAPTUREASSERT) + sc->tclr |= DMT_TCLR_CAPTRAN_LOHI; + else + sc->tclr &= ~DMT_TCLR_CAPTRAN_MASK; + DMTIMER_WRITE4(sc, DMT_TCLR, sc->tclr); +} + +static unsigned +dmtpps_get_timecount(struct timecounter *tc) +{ + struct dmtpps_softc *sc; + + sc = tc->tc_priv; + + return (DMTIMER_READ4(sc, DMT_TCRR)); +} + +static void +dmtpps_poll(struct timecounter *tc) +{ + struct dmtpps_softc *sc; + + sc = tc->tc_priv; + + /* + * If a new value has been latched we've got a PPS event. Capture the + * timecounter data, then override the capcount field (pps_capture() + * populates it from the current DMT_TCRR register) with the latched + * value from the TCAR1 register. + * + * There is no locking here, by design. pps_capture() writes into an + * area of struct pps_state which is read only by pps_event(). The + * synchronization of access to that area is temporal rather than + * interlock based... we write in this routine and trigger the task that + * will read the data, so no simultaneous access can occur. 
+ * + * Note that we don't have the TCAR interrupt enabled, but the hardware + * still provides the status bits in the "RAW" status register even when + * they're masked from generating an irq. However, when clearing the + * TCAR status to re-arm the capture for the next second, we have to + * write to the IRQ status register, not the RAW register. Quirky. + */ + if (DMTIMER_READ4(sc, DMT_IRQSTATUS_RAW) & DMT_IRQ_TCAR) { + pps_capture(&sc->pps_state); + sc->pps_state.capcount = DMTIMER_READ4(sc, DMT_TCAR1); + DMTIMER_WRITE4(sc, DMT_IRQSTATUS, DMT_IRQ_TCAR); + taskqueue_enqueue_fast(taskqueue_fast, &sc->pps_task); + } +} + +static void +dmtpps_event(void *arg, int pending) +{ + struct dmtpps_softc *sc; + + sc = arg; + + /* This is the task function that gets enqueued by poll_pps. Once the + * time has been captured by the timecounter polling code which runs in + * primary interrupt context, the remaining (more expensive) work to + * process the event is done later in a threaded context. + * + * Here there is an interlock that protects the event data in struct + * pps_state. That data can be accessed at any time from userland via + * ioctl() calls so we must ensure that there is no read access to + * partially updated data while pps_event() does its work. + */ + mtx_lock(&sc->pps_mtx); + pps_event(&sc->pps_state, PPS_CAPTUREASSERT); + mtx_unlock(&sc->pps_mtx); +} + +static int +dmtpps_open(struct cdev *dev, int flags, int fmt, + struct thread *td) +{ + struct dmtpps_softc *sc; + + sc = dev->si_drv1; + + /* + * Begin polling for pps and enable capture in the hardware whenever the + * device is open. Doing this stuff again is harmless if this isn't the + * first open. + */ + sc->tc.tc_poll_pps = dmtpps_poll; + dmtpps_set_hw_capture(sc, false); + + return 0; +} + +static int +dmtpps_close(struct cdev *dev, int flags, int fmt, + struct thread *td) +{ + struct dmtpps_softc *sc; + + sc = dev->si_drv1; + + /* + * Stop polling and disable capture on last close. Use the force-off + * flag to override the configured mode and turn off the hardware. + */ + sc->tc.tc_poll_pps = NULL; + dmtpps_set_hw_capture(sc, true); + + return 0; +} + +static int +dmtpps_ioctl(struct cdev *dev, u_long cmd, caddr_t data, + int flags, struct thread *td) +{ + struct dmtpps_softc *sc; + int err; + + sc = dev->si_drv1; + + /* Let the kernel do the heavy lifting for ioctl. */ + mtx_lock(&sc->pps_mtx); + err = pps_ioctl(cmd, data, &sc->pps_state); + mtx_unlock(&sc->pps_mtx); + if (err != 0) + return (err); + + /* + * The capture mode could have changed, set the hardware to whatever + * mode is now current. Effectively a no-op if nothing changed. + */ + dmtpps_set_hw_capture(sc, false); + + return (err); +} + +static struct cdevsw dmtpps_cdevsw = { + .d_version = D_VERSION, + .d_open = dmtpps_open, + .d_close = dmtpps_close, + .d_ioctl = dmtpps_ioctl, + .d_name = PPS_CDEV_NAME, +}; + +static int +dmtpps_probe(device_t dev) +{ + char strbuf[64]; + int tmr_num; + + if (!ofw_bus_status_okay(dev)) + return (ENXIO); + + if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) + return (ENXIO); + + /* + * If we haven't chosen which hardware timer to use yet, go do that now. + * We need to know that to decide whether to return success for this + * hardware timer instance or not. + */ + if (dmtpps_tmr_num == 0) + dmtpps_tmr_num = dmtpps_find_tmr_num(); + + /* + * Figure out which hardware timer is being probed and see if it matches + * the configured timer number determined earlier. 
+ */ + tmr_num = ti_hwmods_get_unit(dev, "timer"); + if (dmtpps_tmr_num != tmr_num) + return (ENXIO); + + snprintf(strbuf, sizeof(strbuf), "AM335x PPS-Capture DMTimer%d", + tmr_num); + device_set_desc_copy(dev, strbuf); + + return(BUS_PROBE_DEFAULT); +} + +static int +dmtpps_attach(device_t dev) +{ + struct dmtpps_softc *sc; + clk_ident_t timer_id; + int err, sysclk_freq; + + sc = device_get_softc(dev); + sc->dev = dev; + + /* Get the base clock frequency. */ + err = ti_prcm_clk_get_source_freq(SYS_CLK, &sysclk_freq); + + /* Enable clocks and power on the device. */ + if ((timer_id = ti_hwmods_get_clock(dev)) == INVALID_CLK_IDENT) + return (ENXIO); + if ((err = ti_prcm_clk_set_source(timer_id, SYSCLK_CLK)) != 0) + return (err); + if ((err = ti_prcm_clk_enable(timer_id)) != 0) + return (err); + + /* Request the memory resources. */ + sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &sc->mem_rid, RF_ACTIVE); + if (sc->mem_res == NULL) { + return (ENXIO); + } + + /* Figure out which hardware timer this is and set the name string. */ + sc->tmr_num = ti_hwmods_get_unit(dev, "timer"); + snprintf(sc->tmr_name, sizeof(sc->tmr_name), "DMTimer%d", sc->tmr_num); + + /* Set up timecounter hardware, start it. */ + DMTIMER_WRITE4(sc, DMT_TSICR, DMT_TSICR_RESET); + while (DMTIMER_READ4(sc, DMT_TIOCP_CFG) & DMT_TIOCP_RESET) + continue; + + sc->tclr |= DMT_TCLR_START | DMT_TCLR_AUTOLOAD; + DMTIMER_WRITE4(sc, DMT_TLDR, 0); + DMTIMER_WRITE4(sc, DMT_TCRR, 0); + DMTIMER_WRITE4(sc, DMT_TCLR, sc->tclr); + + /* Register the timecounter. */ + sc->tc.tc_name = sc->tmr_name; + sc->tc.tc_get_timecount = dmtpps_get_timecount; + sc->tc.tc_counter_mask = ~0u; + sc->tc.tc_frequency = sysclk_freq; + sc->tc.tc_quality = 1000; + sc->tc.tc_priv = sc; + + tc_init(&sc->tc); + + /* + * Indicate our PPS capabilities. Have the kernel init its part of the + * pps_state struct and add its capabilities. + * + * While the hardware has a mode to capture each edge, it's not clear we + * can use it that way, because there's only a single interrupt/status + * bit to say something was captured, but not which edge it was. For + * now, just say we can only capture assert events (the positive-going + * edge of the pulse). + */ + mtx_init(&sc->pps_mtx, "dmtpps", NULL, MTX_DEF); + sc->pps_state.ppscap = PPS_CAPTUREASSERT; + sc->pps_state.driver_abi = PPS_ABI_VERSION; + sc->pps_state.driver_mtx = &sc->pps_mtx; + pps_init_abi(&sc->pps_state); + + /* + * Init the task that does deferred pps_event() processing after + * the polling routine has captured a pps pulse time. + */ + TASK_INIT(&sc->pps_task, 0, dmtpps_event, sc); + + /* Create the PPS cdev. */ + sc->pps_cdev = make_dev(&dmtpps_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + PPS_CDEV_NAME); + sc->pps_cdev->si_drv1 = sc; + + if (bootverbose) + device_printf(sc->dev, "Using %s for PPS device /dev/%s\n", + sc->tmr_name, PPS_CDEV_NAME); + + return (0); +} + +static int +dmtpps_detach(device_t dev) +{ + + /* + * There is no way to remove a timecounter once it has been registered, + * even if it's not in use, so we can never detach. If we were + * dynamically loaded as a module this will prevent unloading. 
+ */ + return (EBUSY); +} + +static device_method_t dmtpps_methods[] = { + DEVMETHOD(device_probe, dmtpps_probe), + DEVMETHOD(device_attach, dmtpps_attach), + DEVMETHOD(device_detach, dmtpps_detach), + { 0, 0 } +}; + +static driver_t dmtpps_driver = { + "am335x_dmtpps", + dmtpps_methods, + sizeof(struct dmtpps_softc), +}; + +static devclass_t dmtpps_devclass; + +DRIVER_MODULE(am335x_dmtpps, simplebus, dmtpps_driver, dmtpps_devclass, 0, 0); +MODULE_DEPEND(am335x_dmtpps, am335x_prcm, 1, 1, 1); + diff --git a/sys/arm/ti/am335x/files.am335x b/sys/arm/ti/am335x/files.am335x index 7293fd0..d0193e8 100644 --- a/sys/arm/ti/am335x/files.am335x +++ b/sys/arm/ti/am335x/files.am335x @@ -3,6 +3,7 @@ arm/ti/aintc.c standard arm/ti/am335x/am335x_dmtimer.c standard +arm/ti/am335x/am335x_dmtpps.c optional am335x_dmtpps arm/ti/am335x/am335x_gpio.c optional gpio arm/ti/am335x/am335x_lcd.c optional sc | vt arm/ti/am335x/am335x_lcd_syscons.c optional sc diff --git a/sys/arm/versatile/sp804.c b/sys/arm/versatile/sp804.c index a69c018..de05700 100644 --- a/sys/arm/versatile/sp804.c +++ b/sys/arm/versatile/sp804.c @@ -244,7 +244,7 @@ sp804_timer_attach(device_t dev) * Timer 1, timecounter */ sc->tc.tc_frequency = sc->sysclk_freq; - sc->tc.tc_name = "SP804 Time Counter"; + sc->tc.tc_name = "SP804-1"; sc->tc.tc_get_timecount = sp804_timer_tc_get_timecount; sc->tc.tc_poll_pps = NULL; sc->tc.tc_counter_mask = ~0u; @@ -263,9 +263,7 @@ sp804_timer_attach(device_t dev) * Timer 2, event timer */ sc->et_enabled = 0; - sc->et.et_name = malloc(64, M_DEVBUF, M_NOWAIT | M_ZERO); - sprintf(sc->et.et_name, "SP804 Event Timer %d", - device_get_unit(dev)); + sc->et.et_name = "SP804-2"; sc->et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT; sc->et.et_quality = 1000; sc->et.et_frequency = sc->sysclk_freq / DEFAULT_DIVISOR; diff --git a/sys/arm64/arm64/bus_machdep.c b/sys/arm64/arm64/bus_machdep.c index 25a675e..f6df4a1 100644 --- a/sys/arm64/arm64/bus_machdep.c +++ b/sys/arm64/arm64/bus_machdep.c @@ -49,6 +49,15 @@ void generic_bs_rm_4(void *, bus_space_handle_t, bus_size_t, uint32_t *, void generic_bs_rm_8(void *, bus_space_handle_t, bus_size_t, uint64_t *, bus_size_t); +void generic_bs_rr_1(void *, bus_space_handle_t, bus_size_t, uint8_t *, + bus_size_t); +void generic_bs_rr_2(void *, bus_space_handle_t, bus_size_t, uint16_t *, + bus_size_t); +void generic_bs_rr_4(void *, bus_space_handle_t, bus_size_t, uint32_t *, + bus_size_t); +void generic_bs_rr_8(void *, bus_space_handle_t, bus_size_t, uint64_t *, + bus_size_t); + void generic_bs_w_1(void *, bus_space_handle_t, bus_size_t, uint8_t); void generic_bs_w_2(void *, bus_space_handle_t, bus_size_t, uint16_t); void generic_bs_w_4(void *, bus_space_handle_t, bus_size_t, uint32_t); @@ -63,6 +72,15 @@ void generic_bs_wm_4(void *, bus_space_handle_t, bus_size_t, const uint32_t *, void generic_bs_wm_8(void *, bus_space_handle_t, bus_size_t, const uint64_t *, bus_size_t); +void generic_bs_wr_1(void *, bus_space_handle_t, bus_size_t, const uint8_t *, + bus_size_t); +void generic_bs_wr_2(void *, bus_space_handle_t, bus_size_t, const uint16_t *, + bus_size_t); +void generic_bs_wr_4(void *, bus_space_handle_t, bus_size_t, const uint32_t *, + bus_size_t); +void generic_bs_wr_8(void *, bus_space_handle_t, bus_size_t, const uint64_t *, + bus_size_t); + static int generic_bs_map(void *t, bus_addr_t bpa, bus_size_t size, int flags, bus_space_handle_t *bshp) @@ -126,6 +144,12 @@ struct bus_space memmap_bus = { .bs_rm_4 = generic_bs_rm_4, .bs_rm_8 = generic_bs_rm_8, + /* read region */ + .bs_rr_1 = 
generic_bs_rr_1, + .bs_rr_2 = generic_bs_rr_2, + .bs_rr_4 = generic_bs_rr_4, + .bs_rr_8 = generic_bs_rr_8, + /* write single */ .bs_w_1 = generic_bs_w_1, .bs_w_2 = generic_bs_w_2, @@ -139,10 +163,10 @@ struct bus_space memmap_bus = { .bs_wm_8 = generic_bs_wm_8, /* write region */ - .bs_wr_1 = NULL, - .bs_wr_2 = NULL, - .bs_wr_4 = NULL, - .bs_wr_8 = NULL, + .bs_wr_1 = generic_bs_wr_1, + .bs_wr_2 = generic_bs_wr_2, + .bs_wr_4 = generic_bs_wr_4, + .bs_wr_8 = generic_bs_wr_8, /* set multiple */ .bs_sm_1 = NULL, diff --git a/sys/arm64/arm64/bus_space_asm.S b/sys/arm64/arm64/bus_space_asm.S index 20d4128..d919bd5 100644 --- a/sys/arm64/arm64/bus_space_asm.S +++ b/sys/arm64/arm64/bus_space_asm.S @@ -133,6 +133,90 @@ ENTRY(generic_bs_rm_8) 2: ret END(generic_bs_rm_8) +ENTRY(generic_bs_rr_1) + /* Is there is anything to read. */ + cbz x4, 2f + + /* Calculate the device address. */ + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Read the data. */ +1: ldrb w1, [x0], #1 + strb w1, [x3], #1 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_rr_1) + +ENTRY(generic_bs_rr_2) + /* Is there is anything to read. */ + cbz x4, 2f + + /* Calculate the device address. */ + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Read the data. */ +1: ldrh w1, [x0], #2 + strh w1, [x3], #2 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_rr_2) + +ENTRY(generic_bs_rr_4) + /* Is there is anything to read. */ + cbz x4, 2f + + /* Calculate the device address. */ + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Read the data. */ +1: ldr w1, [x0], #4 + str w1, [x3], #4 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_rr_4) + +ENTRY(generic_bs_rr_8) + /* Is there is anything to read. */ + cbz x4, 2f + + /* Calculate the device address. */ + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Read the data. */ +1: ldr x1, [x0], #8 + str x1, [x3], #8 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_rr_8) + ENTRY(generic_bs_w_1) strb w3, [x1, x2] @@ -233,3 +317,83 @@ ENTRY(generic_bs_wm_8) 2: ret END(generic_bs_wm_8) + +ENTRY(generic_bs_wr_1) + /* Is there is anything to write. */ + cbz x4, 2f + + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Write the data */ +1: ldrb w1, [x3], #1 + strb w1, [x0], #1 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_wr_1) + +ENTRY(generic_bs_wr_2) + /* Is there is anything to write. */ + cbz x4, 2f + + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Write the data */ +1: ldrh w1, [x3], #2 + strh w1, [x0], #2 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_wr_2) + +ENTRY(generic_bs_wr_4) + /* Is there is anything to write. */ + cbz x4, 2f + + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. + * x4 = Count + */ + + /* Write the data */ +1: ldr w1, [x3], #4 + str w1, [x0], #4 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_wr_4) + +ENTRY(generic_bs_wr_8) + /* Is there is anything to write. */ + cbz x4, 2f + + add x0, x1, x2 + /* + * x0 = The device address. + * x3 = The kernel address. 
+ * x4 = Count + */ + + /* Write the data */ +1: ldr x1, [x3], #8 + str x1, [x0], #8 + subs x4, x4, #1 + b.ne 1b + +2: ret +END(generic_bs_wr_8) diff --git a/sys/arm64/arm64/exception.S b/sys/arm64/arm64/exception.S index 4f457da..b05941f 100644 --- a/sys/arm64/arm64/exception.S +++ b/sys/arm64/arm64/exception.S @@ -104,7 +104,7 @@ __FBSDID("$FreeBSD$"); /* Read the current thread flags */ 1: ldr x1, [x18, #PC_CURTHREAD] /* Load curthread */ - ldr x2, [x1, #TD_FLAGS]! /* TODO: No need for the ! but clang fails without it */ + ldr x2, [x1, #TD_FLAGS] /* Check if we have either bits set */ mov x3, #((TDF_ASTPENDING|TDF_NEEDRESCHED) >> 8) diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c index 41e92a6..fa9aaa8 100644 --- a/sys/arm64/arm64/trap.c +++ b/sys/arm64/arm64/trap.c @@ -229,6 +229,21 @@ data_abort(struct trapframe *frame, uint64_t esr, int lower) userret(td, frame); } +static void +print_registers(struct trapframe *frame) +{ + u_int reg; + + for (reg = 0; reg < 31; reg++) { + printf(" %sx%d: %16lx\n", (reg < 10) ? " " : "", reg, + frame->tf_x[reg]); + } + printf(" sp: %16lx\n", frame->tf_sp); + printf(" lr: %16lx\n", frame->tf_lr); + printf(" elr: %16lx\n", frame->tf_elr); + printf("spsr: %16lx\n", frame->tf_spsr); +} + void do_el1h_sync(struct trapframe *frame) { @@ -265,6 +280,7 @@ do_el1h_sync(struct trapframe *frame) switch(exception) { case EXCP_FP_SIMD: case EXCP_TRAP_FP: + print_registers(frame); panic("VFP exception in the kernel"); case EXCP_DATA_ABORT: data_abort(frame, esr, 0); @@ -286,11 +302,30 @@ do_el1h_sync(struct trapframe *frame) #endif break; default: + print_registers(frame); panic("Unknown kernel exception %x esr_el1 %lx\n", exception, esr); } } +/* + * We get EXCP_UNKNOWN from QEMU when executing zeroed memory. For now turn + * this into a SIGILL. + */ +static void +el0_excp_unknown(struct trapframe *frame) +{ + struct thread *td; + uint64_t far; + + td = curthread; + far = READ_SPECIALREG(far_el1); + printf("el0 EXCP_UNKNOWN exception\n"); + print_registers(frame); + call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)far); + userret(td, frame); +} + void do_el0_sync(struct trapframe *frame) { @@ -332,7 +367,11 @@ do_el0_sync(struct trapframe *frame) case EXCP_DATA_ABORT: data_abort(frame, esr, 1); break; + case EXCP_UNKNOWN: + el0_excp_unknown(frame); + break; default: + print_registers(frame); panic("Unknown userland exception %x esr_el1 %lx\n", exception, esr); } diff --git a/sys/boot/kshim/bsd_kernel.h b/sys/boot/kshim/bsd_kernel.h index 0e40fb0..aba8131 100644 --- a/sys/boot/kshim/bsd_kernel.h +++ b/sys/boot/kshim/bsd_kernel.h @@ -43,7 +43,8 @@ #define M_USBDEV 0 #define USB_PROC_MAX 3 #define USB_BUS_GIANT_PROC(bus) (usb_process + 2) -#define USB_BUS_NON_GIANT_PROC(bus) (usb_process + 2) +#define USB_BUS_NON_GIANT_BULK_PROC(bus) (usb_process + 2) +#define USB_BUS_NON_GIANT_ISOC_PROC(bus) (usb_process + 2) #define USB_BUS_EXPLORE_PROC(bus) (usb_process + 0) #define USB_BUS_CONTROL_XFER_PROC(bus) (usb_process + 1) #define SYSCTL_DECL(...) diff --git a/sys/boot/uboot/fdt/uboot_fdt.c b/sys/boot/uboot/fdt/uboot_fdt.c index 86f46e9..6b646f6 100644 --- a/sys/boot/uboot/fdt/uboot_fdt.c +++ b/sys/boot/uboot/fdt/uboot_fdt.c @@ -69,10 +69,11 @@ fdt_platform_load_dtb(void) } /* - * If the U-boot environment contains a variable giving the name of a - * file, use it if we can load and validate it. 
+ * Try to get FDT filename first from loader env and then from u-boot env */ - s = ub_env_get("fdtfile"); + s = getenv("fdt_file"); + if (s == NULL) + s = ub_env_get("fdtfile"); if (s == NULL) s = ub_env_get("fdt_file"); if (s != NULL && *s != '\0') { diff --git a/sys/cam/ctl/README.ctl.txt b/sys/cam/ctl/README.ctl.txt index a6de201..d4dc938 100644 --- a/sys/cam/ctl/README.ctl.txt +++ b/sys/cam/ctl/README.ctl.txt @@ -366,16 +366,6 @@ This is a CTL frontend port that is also a CAM SIM. The idea is that this frontend allows for using CTL without any target-capable hardware. So any LUNs you create in CTL are visible via this port. - -ctl_frontend_internal.c -ctl_frontend_internal.h: ------------------------ - -This is a frontend port written for Copan to do some system-specific tasks -that required sending commands into CTL from inside the kernel. This isn't -entirely relevant to FreeBSD in general, but can perhaps be repurposed or -removed later. - ctl_ha.h: -------- diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c index bdf5e6a..9141fc8 100644 --- a/sys/cam/ctl/ctl.c +++ b/sys/cam/ctl/ctl.c @@ -72,7 +72,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_util.h> #include <cam/ctl/ctl_backend.h> #include <cam/ctl/ctl_ioctl.h> @@ -383,18 +382,7 @@ static int ctl_init(void); void ctl_shutdown(void); static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td); static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td); -static void ctl_ioctl_online(void *arg); -static void ctl_ioctl_offline(void *arg); -static int ctl_ioctl_lun_enable(void *arg, int lun_id); -static int ctl_ioctl_lun_disable(void *arg, int lun_id); -static int ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio); static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio); -static int ctl_ioctl_submit_wait(union ctl_io *io); -static void ctl_ioctl_datamove(union ctl_io *io); -static void ctl_ioctl_done(union ctl_io *io); -static void ctl_ioctl_hard_startstop_callback(void *arg, - struct cfi_metatask *metatask); -static void ctl_ioctl_bbrread_callback(void *arg,struct cfi_metatask *metatask); static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries); @@ -529,11 +517,6 @@ static moduledata_t ctl_moduledata = { DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD); MODULE_VERSION(ctl, 1); -static struct ctl_frontend ioctl_frontend = -{ - .name = "ioctl", -}; - #ifdef notyet static void ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc, @@ -1064,7 +1047,6 @@ ctl_init(void) { struct ctl_softc *softc; void *other_pool; - struct ctl_port *port; int i, error, retval; //int isc_retval; @@ -1189,32 +1171,6 @@ ctl_init(void) return (error); } - /* - * Initialize the ioctl front end. 
- */ - ctl_frontend_register(&ioctl_frontend); - port = &softc->ioctl_info.port; - port->frontend = &ioctl_frontend; - sprintf(softc->ioctl_info.port_name, "ioctl"); - port->port_type = CTL_PORT_IOCTL; - port->num_requested_ctl_io = 100; - port->port_name = softc->ioctl_info.port_name; - port->port_online = ctl_ioctl_online; - port->port_offline = ctl_ioctl_offline; - port->onoff_arg = &softc->ioctl_info; - port->lun_enable = ctl_ioctl_lun_enable; - port->lun_disable = ctl_ioctl_lun_disable; - port->targ_lun_arg = &softc->ioctl_info; - port->fe_datamove = ctl_ioctl_datamove; - port->fe_done = ctl_ioctl_done; - port->max_targets = 15; - port->max_target_id = 15; - - if (ctl_port_register(&softc->ioctl_info.port) != 0) { - printf("ctl: ioctl front end registration failed, will " - "continue anyway\n"); - } - SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_state", CTLTYPE_INT | CTLFLAG_RWTUN, softc, 0, ctl_ha_state_sysctl, "I", "HA state for this head"); @@ -1238,9 +1194,6 @@ ctl_shutdown(void) softc = (struct ctl_softc *)control_softc; - if (ctl_port_deregister(&softc->ioctl_info.port) != 0) - printf("ctl: ioctl front end deregistration failed\n"); - mtx_lock(&softc->ctl_lock); /* @@ -1253,8 +1206,6 @@ ctl_shutdown(void) mtx_unlock(&softc->ctl_lock); - ctl_frontend_deregister(&ioctl_frontend); - #if 0 ctl_shutdown_thread(softc->work_thread); mtx_destroy(&softc->queue_lock); @@ -1426,26 +1377,6 @@ ctl_port_list(struct ctl_port_entry *entries, int num_entries_alloced, return (retval); } -static void -ctl_ioctl_online(void *arg) -{ - struct ctl_ioctl_info *ioctl_info; - - ioctl_info = (struct ctl_ioctl_info *)arg; - - ioctl_info->flags |= CTL_IOCTL_FLAG_ENABLED; -} - -static void -ctl_ioctl_offline(void *arg) -{ - struct ctl_ioctl_info *ioctl_info; - - ioctl_info = (struct ctl_ioctl_info *)arg; - - ioctl_info->flags &= ~CTL_IOCTL_FLAG_ENABLED; -} - /* * Remove an initiator by port number and initiator ID. * Returns 0 for success, -1 for failure. @@ -1641,181 +1572,6 @@ ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf) } } -static int -ctl_ioctl_lun_enable(void *arg, int lun_id) -{ - return (0); -} - -static int -ctl_ioctl_lun_disable(void *arg, int lun_id) -{ - return (0); -} - -/* - * Data movement routine for the CTL ioctl frontend port. - */ -static int -ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio) -{ - struct ctl_sg_entry *ext_sglist, *kern_sglist; - struct ctl_sg_entry ext_entry, kern_entry; - int ext_sglen, ext_sg_entries, kern_sg_entries; - int ext_sg_start, ext_offset; - int len_to_copy, len_copied; - int kern_watermark, ext_watermark; - int ext_sglist_malloced; - int i, j; - - ext_sglist_malloced = 0; - ext_sg_start = 0; - ext_offset = 0; - - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n")); - - /* - * If this flag is set, fake the data transfer. - */ - if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) { - ctsio->ext_data_filled = ctsio->ext_data_len; - goto bailout; - } - - /* - * To simplify things here, if we have a single buffer, stick it in - * a S/G entry and just make it a single entry S/G list. 
- */ - if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) { - int len_seen; - - ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist); - - ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL, - M_WAITOK); - ext_sglist_malloced = 1; - if (copyin(ctsio->ext_data_ptr, ext_sglist, - ext_sglen) != 0) { - ctl_set_internal_failure(ctsio, - /*sks_valid*/ 0, - /*retry_count*/ 0); - goto bailout; - } - ext_sg_entries = ctsio->ext_sg_entries; - len_seen = 0; - for (i = 0; i < ext_sg_entries; i++) { - if ((len_seen + ext_sglist[i].len) >= - ctsio->ext_data_filled) { - ext_sg_start = i; - ext_offset = ctsio->ext_data_filled - len_seen; - break; - } - len_seen += ext_sglist[i].len; - } - } else { - ext_sglist = &ext_entry; - ext_sglist->addr = ctsio->ext_data_ptr; - ext_sglist->len = ctsio->ext_data_len; - ext_sg_entries = 1; - ext_sg_start = 0; - ext_offset = ctsio->ext_data_filled; - } - - if (ctsio->kern_sg_entries > 0) { - kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr; - kern_sg_entries = ctsio->kern_sg_entries; - } else { - kern_sglist = &kern_entry; - kern_sglist->addr = ctsio->kern_data_ptr; - kern_sglist->len = ctsio->kern_data_len; - kern_sg_entries = 1; - } - - - kern_watermark = 0; - ext_watermark = ext_offset; - len_copied = 0; - for (i = ext_sg_start, j = 0; - i < ext_sg_entries && j < kern_sg_entries;) { - uint8_t *ext_ptr, *kern_ptr; - - len_to_copy = MIN(ext_sglist[i].len - ext_watermark, - kern_sglist[j].len - kern_watermark); - - ext_ptr = (uint8_t *)ext_sglist[i].addr; - ext_ptr = ext_ptr + ext_watermark; - if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) { - /* - * XXX KDM fix this! - */ - panic("need to implement bus address support"); -#if 0 - kern_ptr = bus_to_virt(kern_sglist[j].addr); -#endif - } else - kern_ptr = (uint8_t *)kern_sglist[j].addr; - kern_ptr = kern_ptr + kern_watermark; - - kern_watermark += len_to_copy; - ext_watermark += len_to_copy; - - if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) == - CTL_FLAG_DATA_IN) { - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d " - "bytes to user\n", len_to_copy)); - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p " - "to %p\n", kern_ptr, ext_ptr)); - if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) { - ctl_set_internal_failure(ctsio, - /*sks_valid*/ 0, - /*retry_count*/ 0); - goto bailout; - } - } else { - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d " - "bytes from user\n", len_to_copy)); - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p " - "to %p\n", ext_ptr, kern_ptr)); - if (copyin(ext_ptr, kern_ptr, len_to_copy)!= 0){ - ctl_set_internal_failure(ctsio, - /*sks_valid*/ 0, - /*retry_count*/0); - goto bailout; - } - } - - len_copied += len_to_copy; - - if (ext_sglist[i].len == ext_watermark) { - i++; - ext_watermark = 0; - } - - if (kern_sglist[j].len == kern_watermark) { - j++; - kern_watermark = 0; - } - } - - ctsio->ext_data_filled += len_copied; - - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, " - "kern_sg_entries: %d\n", ext_sg_entries, - kern_sg_entries)); - CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, " - "kern_data_len = %d\n", ctsio->ext_data_len, - ctsio->kern_data_len)); - - - /* XXX KDM set residual?? */ -bailout: - - if (ext_sglist_malloced != 0) - free(ext_sglist, M_CTL); - - return (CTL_RETVAL_COMPLETE); -} - /* * Serialize a command that went down the "wrong" side, and so was sent to * this controller for execution. 
The logic is a little different than the @@ -1982,149 +1738,6 @@ ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio) return (retval); } -static int -ctl_ioctl_submit_wait(union ctl_io *io) -{ - struct ctl_fe_ioctl_params params; - ctl_fe_ioctl_state last_state; - int done, retval; - - retval = 0; - - bzero(¶ms, sizeof(params)); - - mtx_init(¶ms.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF); - cv_init(¶ms.sem, "ctlioccv"); - params.state = CTL_IOCTL_INPROG; - last_state = params.state; - - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = ¶ms; - - CTL_DEBUG_PRINT(("ctl_ioctl_submit_wait\n")); - - /* This shouldn't happen */ - if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE) - return (retval); - - done = 0; - - do { - mtx_lock(¶ms.ioctl_mtx); - /* - * Check the state here, and don't sleep if the state has - * already changed (i.e. wakeup has already occured, but we - * weren't waiting yet). - */ - if (params.state == last_state) { - /* XXX KDM cv_wait_sig instead? */ - cv_wait(¶ms.sem, ¶ms.ioctl_mtx); - } - last_state = params.state; - - switch (params.state) { - case CTL_IOCTL_INPROG: - /* Why did we wake up? */ - /* XXX KDM error here? */ - mtx_unlock(¶ms.ioctl_mtx); - break; - case CTL_IOCTL_DATAMOVE: - CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n")); - - /* - * change last_state back to INPROG to avoid - * deadlock on subsequent data moves. - */ - params.state = last_state = CTL_IOCTL_INPROG; - - mtx_unlock(¶ms.ioctl_mtx); - ctl_ioctl_do_datamove(&io->scsiio); - /* - * Note that in some cases, most notably writes, - * this will queue the I/O and call us back later. - * In other cases, generally reads, this routine - * will immediately call back and wake us up, - * probably using our own context. - */ - io->scsiio.be_move_done(io); - break; - case CTL_IOCTL_DONE: - mtx_unlock(¶ms.ioctl_mtx); - CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n")); - done = 1; - break; - default: - mtx_unlock(¶ms.ioctl_mtx); - /* XXX KDM error here? 
*/ - break; - } - } while (done == 0); - - mtx_destroy(¶ms.ioctl_mtx); - cv_destroy(¶ms.sem); - - return (CTL_RETVAL_COMPLETE); -} - -static void -ctl_ioctl_datamove(union ctl_io *io) -{ - struct ctl_fe_ioctl_params *params; - - params = (struct ctl_fe_ioctl_params *) - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; - - mtx_lock(¶ms->ioctl_mtx); - params->state = CTL_IOCTL_DATAMOVE; - cv_broadcast(¶ms->sem); - mtx_unlock(¶ms->ioctl_mtx); -} - -static void -ctl_ioctl_done(union ctl_io *io) -{ - struct ctl_fe_ioctl_params *params; - - params = (struct ctl_fe_ioctl_params *) - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; - - mtx_lock(¶ms->ioctl_mtx); - params->state = CTL_IOCTL_DONE; - cv_broadcast(¶ms->sem); - mtx_unlock(¶ms->ioctl_mtx); -} - -static void -ctl_ioctl_hard_startstop_callback(void *arg, struct cfi_metatask *metatask) -{ - struct ctl_fe_ioctl_startstop_info *sd_info; - - sd_info = (struct ctl_fe_ioctl_startstop_info *)arg; - - sd_info->hs_info.status = metatask->status; - sd_info->hs_info.total_luns = metatask->taskinfo.startstop.total_luns; - sd_info->hs_info.luns_complete = - metatask->taskinfo.startstop.luns_complete; - sd_info->hs_info.luns_failed = metatask->taskinfo.startstop.luns_failed; - - cv_broadcast(&sd_info->sem); -} - -static void -ctl_ioctl_bbrread_callback(void *arg, struct cfi_metatask *metatask) -{ - struct ctl_fe_ioctl_bbrread_info *fe_bbr_info; - - fe_bbr_info = (struct ctl_fe_ioctl_bbrread_info *)arg; - - mtx_lock(fe_bbr_info->lock); - fe_bbr_info->bbr_info->status = metatask->status; - fe_bbr_info->bbr_info->bbr_status = metatask->taskinfo.bbrread.status; - fe_bbr_info->wakeup_done = 1; - mtx_unlock(fe_bbr_info->lock); - - cv_broadcast(&fe_bbr_info->sem); -} - /* * Returns 0 for success, errno for failure. */ @@ -2367,57 +1980,9 @@ ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, retval = 0; switch (cmd) { - case CTL_IO: { - union ctl_io *io; - void *pool_tmp; - - /* - * If we haven't been "enabled", don't allow any SCSI I/O - * to this FETD. - */ - if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0) { - retval = EPERM; - break; - } - - io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref); - - /* - * Need to save the pool reference so it doesn't get - * spammed by the user's ctl_io. - */ - pool_tmp = io->io_hdr.pool; - memcpy(io, (void *)addr, sizeof(*io)); - io->io_hdr.pool = pool_tmp; - - /* - * No status yet, so make sure the status is set properly. - */ - io->io_hdr.status = CTL_STATUS_NONE; - - /* - * The user sets the initiator ID, target and LUN IDs. 
- */ - io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port; - io->io_hdr.flags |= CTL_FLAG_USER_REQ; - if ((io->io_hdr.io_type == CTL_IO_SCSI) - && (io->scsiio.tag_type != CTL_TAG_UNTAGGED)) - io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++; - - retval = ctl_ioctl_submit_wait(io); - - if (retval != 0) { - ctl_free_io(io); - break; - } - - memcpy((void *)addr, io, sizeof(*io)); - - /* return this to our pool */ - ctl_free_io(io); - + case CTL_IO: + retval = ctl_ioctl_io(dev, cmd, addr, flag, td); break; - } case CTL_ENABLE_PORT: case CTL_DISABLE_PORT: case CTL_SET_PORT_WWNS: { @@ -2724,103 +2289,6 @@ ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, break; } - case CTL_HARD_START: - case CTL_HARD_STOP: { - struct ctl_fe_ioctl_startstop_info ss_info; - struct cfi_metatask *metatask; - struct mtx hs_mtx; - - mtx_init(&hs_mtx, "HS Mutex", NULL, MTX_DEF); - - cv_init(&ss_info.sem, "hard start/stop cv" ); - - metatask = cfi_alloc_metatask(/*can_wait*/ 1); - if (metatask == NULL) { - retval = ENOMEM; - mtx_destroy(&hs_mtx); - break; - } - - if (cmd == CTL_HARD_START) - metatask->tasktype = CFI_TASK_STARTUP; - else - metatask->tasktype = CFI_TASK_SHUTDOWN; - - metatask->callback = ctl_ioctl_hard_startstop_callback; - metatask->callback_arg = &ss_info; - - cfi_action(metatask); - - /* Wait for the callback */ - mtx_lock(&hs_mtx); - cv_wait_sig(&ss_info.sem, &hs_mtx); - mtx_unlock(&hs_mtx); - - /* - * All information has been copied from the metatask by the - * time cv_broadcast() is called, so we free the metatask here. - */ - cfi_free_metatask(metatask); - - memcpy((void *)addr, &ss_info.hs_info, sizeof(ss_info.hs_info)); - - mtx_destroy(&hs_mtx); - break; - } - case CTL_BBRREAD: { - struct ctl_bbrread_info *bbr_info; - struct ctl_fe_ioctl_bbrread_info fe_bbr_info; - struct mtx bbr_mtx; - struct cfi_metatask *metatask; - - bbr_info = (struct ctl_bbrread_info *)addr; - - bzero(&fe_bbr_info, sizeof(fe_bbr_info)); - - bzero(&bbr_mtx, sizeof(bbr_mtx)); - mtx_init(&bbr_mtx, "BBR Mutex", NULL, MTX_DEF); - - fe_bbr_info.bbr_info = bbr_info; - fe_bbr_info.lock = &bbr_mtx; - - cv_init(&fe_bbr_info.sem, "BBR read cv"); - metatask = cfi_alloc_metatask(/*can_wait*/ 1); - - if (metatask == NULL) { - mtx_destroy(&bbr_mtx); - cv_destroy(&fe_bbr_info.sem); - retval = ENOMEM; - break; - } - metatask->tasktype = CFI_TASK_BBRREAD; - metatask->callback = ctl_ioctl_bbrread_callback; - metatask->callback_arg = &fe_bbr_info; - metatask->taskinfo.bbrread.lun_num = bbr_info->lun_num; - metatask->taskinfo.bbrread.lba = bbr_info->lba; - metatask->taskinfo.bbrread.len = bbr_info->len; - - cfi_action(metatask); - - mtx_lock(&bbr_mtx); - while (fe_bbr_info.wakeup_done == 0) - cv_wait_sig(&fe_bbr_info.sem, &bbr_mtx); - mtx_unlock(&bbr_mtx); - - bbr_info->status = metatask->status; - bbr_info->bbr_status = metatask->taskinfo.bbrread.status; - bbr_info->scsi_status = metatask->taskinfo.bbrread.scsi_status; - memcpy(&bbr_info->sense_data, - &metatask->taskinfo.bbrread.sense_data, - MIN(sizeof(bbr_info->sense_data), - sizeof(metatask->taskinfo.bbrread.sense_data))); - - cfi_free_metatask(metatask); - - mtx_destroy(&bbr_mtx); - cv_destroy(&fe_bbr_info.sem); - - break; - } case CTL_DELAY_IO: { struct ctl_io_delay_info *delay_info; #ifdef CTL_IO_DELAY diff --git a/sys/cam/ctl/ctl.h b/sys/cam/ctl/ctl.h index b1d9118..2826742 100644 --- a/sys/cam/ctl/ctl.h +++ b/sys/cam/ctl/ctl.h @@ -194,6 +194,8 @@ void ctl_portDB_changed(int portnum); #ifdef notyet void ctl_init_isc_msg(void); #endif +int 
ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag, + struct thread *td); /* * KPI to manipulate LUN/port options diff --git a/sys/cam/ctl/ctl_backend.c b/sys/cam/ctl/ctl_backend.c index cabecb7..ae5034b 100644 --- a/sys/cam/ctl/ctl_backend.c +++ b/sys/cam/ctl/ctl_backend.c @@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> #include <cam/ctl/ctl_backend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_ioctl.h> #include <cam/ctl/ctl_ha.h> #include <cam/ctl/ctl_private.h> diff --git a/sys/cam/ctl/ctl_backend_block.c b/sys/cam/ctl/ctl_backend_block.c index 5bb3121..65d0491 100644 --- a/sys/cam/ctl/ctl_backend_block.c +++ b/sys/cam/ctl/ctl_backend_block.c @@ -84,7 +84,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_backend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_ioctl.h> #include <cam/ctl/ctl_scsi_all.h> #include <cam/ctl/ctl_error.h> @@ -170,7 +169,6 @@ struct ctl_be_block_lun { uint64_t size_blocks; uint64_t size_bytes; uint32_t blocksize; - int blocksize_shift; uint16_t pblockexp; uint16_t pblockoff; uint16_t ublockexp; @@ -773,7 +771,7 @@ ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, DPRINTF("entered\n"); - off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift; + off = roff = ((off_t)lbalen->lba) * be_lun->blocksize; vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off, 0, curthread->td_ucred, curthread); @@ -791,10 +789,9 @@ ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, } VOP_UNLOCK(be_lun->vn, 0); - off >>= be_lun->blocksize_shift; data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); - scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba), + scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; @@ -816,14 +813,14 @@ ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname) if (strcmp(attrname, "blocksused") == 0) { error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred); if (error == 0) - val = vattr.va_bytes >> be_lun->blocksize_shift; + val = vattr.va_bytes / be_lun->blocksize; } if (strcmp(attrname, "blocksavail") == 0 && (be_lun->vn->v_iflag & VI_DOOMED) == 0) { error = VFS_STATFS(be_lun->vn->v_mount, &statfs); if (error == 0) - val = (statfs.f_bavail * statfs.f_bsize) >> - be_lun->blocksize_shift; + val = statfs.f_bavail * statfs.f_bsize / + be_lun->blocksize; } VOP_UNLOCK(be_lun->vn, 0); return (val); @@ -934,7 +931,7 @@ ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun, DPRINTF("entered\n"); - off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift; + off = roff = ((off_t)lbalen->lba) * be_lun->blocksize; error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKHOLE, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) @@ -950,10 +947,9 @@ ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun, } } - off >>= be_lun->blocksize_shift; data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); - scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba), + scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; @@ -1866,7 +1862,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) struct cdevsw *devsw; char *value; int error, atomic, maxio, 
unmap; - off_t ps, pss, po, pos, us, uss, uo, uos; + off_t ps, pss, po, pos, us, uss, uo, uos, tmp; params = &be_lun->params; @@ -1909,8 +1905,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) return (ENODEV); } - error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, - (caddr_t)&be_lun->blocksize, FREAD, + error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD, curthread); if (error) { snprintf(req->error_str, sizeof(req->error_str), @@ -1925,15 +1920,9 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) * the user is asking for is an even multiple of the underlying * device's blocksize. */ - if ((params->blocksize_bytes != 0) - && (params->blocksize_bytes > be_lun->blocksize)) { - uint32_t bs_multiple, tmp_blocksize; - - bs_multiple = params->blocksize_bytes / be_lun->blocksize; - - tmp_blocksize = bs_multiple * be_lun->blocksize; - - if (tmp_blocksize == params->blocksize_bytes) { + if ((params->blocksize_bytes != 0) && + (params->blocksize_bytes >= tmp)) { + if (params->blocksize_bytes % tmp == 0) { be_lun->blocksize = params->blocksize_bytes; } else { snprintf(req->error_str, sizeof(req->error_str), @@ -1944,17 +1933,16 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) return (EINVAL); } - } else if ((params->blocksize_bytes != 0) - && (params->blocksize_bytes != be_lun->blocksize)) { + } else if (params->blocksize_bytes != 0) { snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u < backing device " "blocksize %u", params->blocksize_bytes, be_lun->blocksize); return (EINVAL); - } + } else + be_lun->blocksize = tmp; - error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, - (caddr_t)&be_lun->size_bytes, FREAD, + error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&tmp, FREAD, curthread); if (error) { snprintf(req->error_str, sizeof(req->error_str), @@ -1965,7 +1953,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) } if (params->lun_size_bytes != 0) { - if (params->lun_size_bytes > be_lun->size_bytes) { + if (params->lun_size_bytes > tmp) { snprintf(req->error_str, sizeof(req->error_str), "requested LUN size %ju > backing device " "size %ju", @@ -1975,7 +1963,8 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) } be_lun->size_bytes = params->lun_size_bytes; - } + } else + be_lun->size_bytes = tmp; error = devsw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD, curthread); @@ -2160,14 +2149,8 @@ ctl_be_block_open(struct ctl_be_block_softc *softc, } VOP_UNLOCK(be_lun->vn, 0); - if (error != 0) { + if (error != 0) ctl_be_block_close(be_lun); - return (error); - } - - be_lun->blocksize_shift = fls(be_lun->blocksize) - 1; - be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift; - return (0); } @@ -2224,10 +2207,14 @@ ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) goto bailout_error; } be_lun->dev_path = strdup(value, M_CTLBLK); - be_lun->blocksize = 512; - be_lun->blocksize_shift = fls(be_lun->blocksize) - 1; + be_lun->size_bytes = params->lun_size_bytes; + if (params->blocksize_bytes != 0) + be_lun->blocksize = params->blocksize_bytes; + else + be_lun->blocksize = 512; retval = ctl_be_block_open(softc, be_lun, req); + be_lun->size_blocks = be_lun->size_bytes / be_lun->blocksize; if (retval != 0) { retval = 0; req->status = CTL_LUN_WARNING; @@ -2652,10 +2639,9 @@ ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) error = 
ctl_be_block_modify_file(be_lun, req); else error = EINVAL; + be_lun->size_blocks = be_lun->size_bytes / be_lun->blocksize; if (error == 0 && be_lun->size_bytes != oldsize) { - be_lun->size_blocks = be_lun->size_bytes >> - be_lun->blocksize_shift; /* * The maximum LBA is the size - 1. diff --git a/sys/cam/ctl/ctl_backend_ramdisk.c b/sys/cam/ctl/ctl_backend_ramdisk.c index ad90241..211738b 100644 --- a/sys/cam/ctl/ctl_backend_ramdisk.c +++ b/sys/cam/ctl/ctl_backend_ramdisk.c @@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_util.h> #include <cam/ctl/ctl_backend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_debug.h> #include <cam/ctl/ctl_ioctl.h> #include <cam/ctl/ctl_error.h> diff --git a/sys/cam/ctl/ctl_cmd_table.c b/sys/cam/ctl/ctl_cmd_table.c index 08ff88a..9a7d70e 100644 --- a/sys/cam/ctl/ctl_cmd_table.c +++ b/sys/cam/ctl/ctl_cmd_table.c @@ -52,7 +52,6 @@ #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> #include <cam/ctl/ctl_backend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_ioctl.h> #include <cam/ctl/ctl_ha.h> #include <cam/ctl/ctl_private.h> diff --git a/sys/cam/ctl/ctl_error.c b/sys/cam/ctl/ctl_error.c index d4d7f79..4b41331 100644 --- a/sys/cam/ctl/ctl_error.c +++ b/sys/cam/ctl/ctl_error.c @@ -57,7 +57,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_backend.h> #include <cam/ctl/ctl_ioctl.h> #include <cam/ctl/ctl_error.h> diff --git a/sys/cam/ctl/ctl_frontend.c b/sys/cam/ctl/ctl_frontend.c index e22b9d4..34baf44 100644 --- a/sys/cam/ctl/ctl_frontend.c +++ b/sys/cam/ctl/ctl_frontend.c @@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_backend.h> /* XXX KDM move defines from ctl_ioctl.h to somewhere else */ #include <cam/ctl/ctl_ioctl.h> diff --git a/sys/cam/ctl/ctl_frontend_cam_sim.c b/sys/cam/ctl/ctl_frontend_cam_sim.c index 3abc572..97b361a 100644 --- a/sys/cam/ctl/ctl_frontend_cam_sim.c +++ b/sys/cam/ctl/ctl_frontend_cam_sim.c @@ -64,7 +64,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_debug.h> #define io_ptr spriv_ptr1 diff --git a/sys/cam/ctl/ctl_frontend_internal.c b/sys/cam/ctl/ctl_frontend_internal.c deleted file mode 100644 index 4768292..0000000 --- a/sys/cam/ctl/ctl_frontend_internal.c +++ /dev/null @@ -1,1612 +0,0 @@ -/*- - * Copyright (c) 2004, 2005 Silicon Graphics International Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * substantially similar to the "NO WARRANTY" disclaimer below - * ("Disclaimer") and any redistribution must be conditioned upon - * including a substantially similar Disclaimer requirement for further - * binary redistribution. 
- * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGES. - * - * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_frontend_internal.c#5 $ - */ -/* - * CTL kernel internal frontend target driver. This allows kernel-level - * clients to send commands into CTL. - * - * This has elements of a FETD (e.g. it has to set tag numbers, initiator, - * port, target, and LUN) and elements of an initiator (LUN discovery and - * probing, error recovery, command initiation). Even though this has some - * initiator type elements, this is not intended to be a full fledged - * initiator layer. It is only intended to send a limited number of - * commands to a well known target layer. - * - * To be able to fulfill the role of a full initiator layer, it would need - * a whole lot more functionality. - * - * Author: Ken Merry <ken@FreeBSD.org> - * - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/types.h> -#include <sys/malloc.h> -#include <sys/module.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/condvar.h> -#include <sys/queue.h> -#include <sys/sbuf.h> -#include <sys/sysctl.h> -#include <vm/uma.h> -#include <cam/scsi/scsi_all.h> -#include <cam/scsi/scsi_da.h> -#include <cam/ctl/ctl_io.h> -#include <cam/ctl/ctl.h> -#include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> -#include <cam/ctl/ctl_backend.h> -#include <cam/ctl/ctl_ioctl.h> -#include <cam/ctl/ctl_util.h> -#include <cam/ctl/ctl_ha.h> -#include <cam/ctl/ctl_private.h> -#include <cam/ctl/ctl_debug.h> -#include <cam/ctl/ctl_scsi_all.h> -#include <cam/ctl/ctl_error.h> - -/* - * Task structure: - * - overall metatask, different potential metatask types (e.g. forced - * shutdown, gentle shutdown) - * - forced shutdown metatask: - * - states: report luns, pending, done? - * - list of luns pending, with the relevant I/O for that lun attached. - * This would allow moving ahead on LUNs with no errors, and going - * into error recovery on LUNs with problems. Per-LUN states might - * include inquiry, stop/offline, done. - * - * Use LUN enable for LUN list instead of getting it manually? We'd still - * need inquiry data for each LUN. - * - * How to handle processor LUN w.r.t. found/stopped counts? 
- */ -#ifdef oldapi -typedef enum { - CFI_TASK_NONE, - CFI_TASK_SHUTDOWN, - CFI_TASK_STARTUP -} cfi_tasktype; - -struct cfi_task_startstop { - int total_luns; - int luns_complete; - int luns_failed; - cfi_cb_t callback; - void *callback_arg; - /* XXX KDM add more fields here */ -}; - -union cfi_taskinfo { - struct cfi_task_startstop startstop; -}; - -struct cfi_metatask { - cfi_tasktype tasktype; - cfi_mt_status status; - union cfi_taskinfo taskinfo; - void *cfi_context; - STAILQ_ENTRY(cfi_metatask) links; -}; -#endif - -typedef enum { - CFI_ERR_RETRY = 0x000, - CFI_ERR_FAIL = 0x001, - CFI_ERR_LUN_RESET = 0x002, - CFI_ERR_MASK = 0x0ff, - CFI_ERR_NO_DECREMENT = 0x100 -} cfi_error_action; - -typedef enum { - CFI_ERR_SOFT, - CFI_ERR_HARD -} cfi_error_policy; - -typedef enum { - CFI_LUN_INQUIRY, - CFI_LUN_READCAPACITY, - CFI_LUN_READCAPACITY_16, - CFI_LUN_READY -} cfi_lun_state; - -struct cfi_lun { - int lun_id; - struct scsi_inquiry_data inq_data; - uint64_t num_blocks; - uint32_t blocksize; - int blocksize_powerof2; - uint32_t cur_tag_num; - cfi_lun_state state; - struct cfi_softc *softc; - STAILQ_HEAD(, cfi_lun_io) io_list; - STAILQ_ENTRY(cfi_lun) links; -}; - -struct cfi_lun_io { - struct cfi_lun *lun; - struct cfi_metatask *metatask; - cfi_error_policy policy; - void (*done_function)(union ctl_io *io); - union ctl_io *ctl_io; - struct cfi_lun_io *orig_lun_io; - STAILQ_ENTRY(cfi_lun_io) links; -}; - -typedef enum { - CFI_NONE = 0x00, - CFI_ONLINE = 0x01, -} cfi_flags; - -struct cfi_softc { - struct ctl_port port; - char fe_name[40]; - struct mtx lock; - cfi_flags flags; - STAILQ_HEAD(, cfi_lun) lun_list; - STAILQ_HEAD(, cfi_metatask) metatask_list; -}; - -MALLOC_DEFINE(M_CTL_CFI, "ctlcfi", "CTL CFI"); - -static uma_zone_t cfi_lun_zone; -static uma_zone_t cfi_metatask_zone; - -static struct cfi_softc fetd_internal_softc; - -int cfi_init(void); -void cfi_shutdown(void) __unused; -static void cfi_online(void *arg); -static void cfi_offline(void *arg); -static int cfi_lun_enable(void *arg, int lun_id); -static int cfi_lun_disable(void *arg, int lun_id); -static void cfi_datamove(union ctl_io *io); -static cfi_error_action cfi_checkcond_parse(union ctl_io *io, - struct cfi_lun_io *lun_io); -static cfi_error_action cfi_error_parse(union ctl_io *io, - struct cfi_lun_io *lun_io); -static void cfi_init_io(union ctl_io *io, struct cfi_lun *lun, - struct cfi_metatask *metatask, cfi_error_policy policy, - int retries, struct cfi_lun_io *orig_lun_io, - void (*done_function)(union ctl_io *io)); -static void cfi_done(union ctl_io *io); -static void cfi_lun_probe_done(union ctl_io *io); -static void cfi_lun_probe(struct cfi_lun *lun, int have_lock); -static void cfi_metatask_done(struct cfi_softc *softc, - struct cfi_metatask *metatask); -static void cfi_metatask_bbr_errorparse(struct cfi_metatask *metatask, - union ctl_io *io); -static void cfi_metatask_io_done(union ctl_io *io); -static void cfi_err_recovery_done(union ctl_io *io); -static void cfi_lun_io_done(union ctl_io *io); - -static struct ctl_frontend cfi_frontend = -{ - .name = "kernel", - .init = cfi_init, - .shutdown = cfi_shutdown, -}; -CTL_FRONTEND_DECLARE(ctlcfi, cfi_frontend); - -int -cfi_init(void) -{ - struct cfi_softc *softc; - struct ctl_port *port; - int retval; - - softc = &fetd_internal_softc; - - port = &softc->port; - - retval = 0; - - if (sizeof(struct cfi_lun_io) > CTL_PORT_PRIV_SIZE) { - printf("%s: size of struct cfi_lun_io %zd > " - "CTL_PORT_PRIV_SIZE %d\n", __func__, - sizeof(struct cfi_lun_io), - CTL_PORT_PRIV_SIZE); - } - 
memset(softc, 0, sizeof(*softc)); - - mtx_init(&softc->lock, "CTL frontend mutex", NULL, MTX_DEF); - STAILQ_INIT(&softc->lun_list); - STAILQ_INIT(&softc->metatask_list); - sprintf(softc->fe_name, "kernel"); - port->frontend = &cfi_frontend; - port->port_type = CTL_PORT_INTERNAL; - port->num_requested_ctl_io = 100; - port->port_name = softc->fe_name; - port->port_online = cfi_online; - port->port_offline = cfi_offline; - port->onoff_arg = softc; - port->lun_enable = cfi_lun_enable; - port->lun_disable = cfi_lun_disable; - port->targ_lun_arg = softc; - port->fe_datamove = cfi_datamove; - port->fe_done = cfi_done; - port->max_targets = 15; - port->max_target_id = 15; - - if (ctl_port_register(port) != 0) - { - printf("%s: internal frontend registration failed\n", __func__); - return (0); - } - - cfi_lun_zone = uma_zcreate("cfi_lun", sizeof(struct cfi_lun), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - cfi_metatask_zone = uma_zcreate("cfi_metatask", sizeof(struct cfi_metatask), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - - return (0); -} - -void -cfi_shutdown(void) -{ - struct cfi_softc *softc; - - softc = &fetd_internal_softc; - - /* - * XXX KDM need to clear out any I/O pending on each LUN. - */ - if (ctl_port_deregister(&softc->port) != 0) - printf("%s: ctl_frontend_deregister() failed\n", __func__); - - uma_zdestroy(cfi_lun_zone); - uma_zdestroy(cfi_metatask_zone); -} - -static void -cfi_online(void *arg) -{ - struct cfi_softc *softc; - struct cfi_lun *lun; - - softc = (struct cfi_softc *)arg; - - softc->flags |= CFI_ONLINE; - - /* - * Go through and kick off the probe for each lun. Should we check - * the LUN flags here to determine whether or not to probe it? - */ - mtx_lock(&softc->lock); - STAILQ_FOREACH(lun, &softc->lun_list, links) - cfi_lun_probe(lun, /*have_lock*/ 1); - mtx_unlock(&softc->lock); -} - -static void -cfi_offline(void *arg) -{ - struct cfi_softc *softc; - - softc = (struct cfi_softc *)arg; - - softc->flags &= ~CFI_ONLINE; -} - -static int -cfi_lun_enable(void *arg, int lun_id) -{ - struct cfi_softc *softc; - struct cfi_lun *lun; - int found; - - softc = (struct cfi_softc *)arg; - - found = 0; - mtx_lock(&softc->lock); - STAILQ_FOREACH(lun, &softc->lun_list, links) { - if (lun->lun_id == lun_id) { - found = 1; - break; - } - } - mtx_unlock(&softc->lock); - - /* - * If we already have this target/LUN, there is no reason to add - * it to our lists again. - */ - if (found != 0) - return (0); - - lun = uma_zalloc(cfi_lun_zone, M_NOWAIT | M_ZERO); - if (lun == NULL) { - printf("%s: unable to allocate LUN structure\n", __func__); - return (1); - } - - lun->lun_id = lun_id; - lun->cur_tag_num = 0; - lun->state = CFI_LUN_INQUIRY; - lun->softc = softc; - STAILQ_INIT(&lun->io_list); - - mtx_lock(&softc->lock); - STAILQ_INSERT_TAIL(&softc->lun_list, lun, links); - mtx_unlock(&softc->lock); - - cfi_lun_probe(lun, /*have_lock*/ 0); - - return (0); -} - -static int -cfi_lun_disable(void *arg, int lun_id) -{ - struct cfi_softc *softc; - struct cfi_lun *lun; - int found; - - softc = (struct cfi_softc *)arg; - - found = 0; - - /* - * XXX KDM need to do an invalidate and then a free when any - * pending I/O has completed. Or do we? CTL won't free a LUN - * while any I/O is pending. So we won't get this notification - * unless any I/O we have pending on a LUN has completed. 
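
The lookup and removal that follow show the discipline this driver uses for taking a LUN off a softc list: find the entry while holding the softc mutex, unlink it under that same lock, and only free it after the lock is dropped. A minimal userland sketch of the same pattern, with pthreads standing in for mtx(9) and a bare-bones record standing in for struct cfi_lun (both stand-ins are illustrative, not the driver's types):

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct lun {
	int			lun_id;
	STAILQ_ENTRY(lun)	links;
};

static STAILQ_HEAD(, lun) lun_list = STAILQ_HEAD_INITIALIZER(lun_list);
static pthread_mutex_t lun_lock = PTHREAD_MUTEX_INITIALIZER;

static int
lun_disable(int lun_id)
{
	struct lun *lun;

	pthread_mutex_lock(&lun_lock);
	STAILQ_FOREACH(lun, &lun_list, links) {
		if (lun->lun_id == lun_id)
			break;
	}
	if (lun != NULL)
		STAILQ_REMOVE(&lun_list, lun, lun, links);
	pthread_mutex_unlock(&lun_lock);

	if (lun == NULL) {
		fprintf(stderr, "can't find lun %d\n", lun_id);
		return (1);
	}
	free(lun);	/* uma_zfree() in the driver proper */
	return (0);
}
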
- */ - mtx_lock(&softc->lock); - STAILQ_FOREACH(lun, &softc->lun_list, links) { - if (lun->lun_id == lun_id) { - found = 1; - break; - } - } - if (found != 0) - STAILQ_REMOVE(&softc->lun_list, lun, cfi_lun, links); - - mtx_unlock(&softc->lock); - - if (found == 0) { - printf("%s: can't find lun %d\n", __func__, lun_id); - return (1); - } - - uma_zfree(cfi_lun_zone, lun); - - return (0); -} - -static void -cfi_datamove(union ctl_io *io) -{ - struct ctl_sg_entry *ext_sglist, *kern_sglist; - struct ctl_sg_entry ext_entry, kern_entry; - int ext_sglen, ext_sg_entries, kern_sg_entries; - int ext_sg_start, ext_offset; - int len_to_copy, len_copied; - int kern_watermark, ext_watermark; - int ext_sglist_malloced; - struct ctl_scsiio *ctsio; - int i, j; - - ext_sglist_malloced = 0; - ext_sg_start = 0; - ext_offset = 0; - ext_sglist = NULL; - - CTL_DEBUG_PRINT(("%s\n", __func__)); - - ctsio = &io->scsiio; - - /* - * If this is the case, we're probably doing a BBR read and don't - * actually need to transfer the data. This will effectively - * bit-bucket the data. - */ - if (ctsio->ext_data_ptr == NULL) - goto bailout; - - /* - * To simplify things here, if we have a single buffer, stick it in - * a S/G entry and just make it a single entry S/G list. - */ - if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) { - int len_seen; - - ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist); - - ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL_CFI, - M_WAITOK); - ext_sglist_malloced = 1; - if (memcpy(ext_sglist, ctsio->ext_data_ptr, ext_sglen) != 0) { - ctl_set_internal_failure(ctsio, - /*sks_valid*/ 0, - /*retry_count*/ 0); - goto bailout; - } - ext_sg_entries = ctsio->ext_sg_entries; - len_seen = 0; - for (i = 0; i < ext_sg_entries; i++) { - if ((len_seen + ext_sglist[i].len) >= - ctsio->ext_data_filled) { - ext_sg_start = i; - ext_offset = ctsio->ext_data_filled - len_seen; - break; - } - len_seen += ext_sglist[i].len; - } - } else { - ext_sglist = &ext_entry; - ext_sglist->addr = ctsio->ext_data_ptr; - ext_sglist->len = ctsio->ext_data_len; - ext_sg_entries = 1; - ext_sg_start = 0; - ext_offset = ctsio->ext_data_filled; - } - - if (ctsio->kern_sg_entries > 0) { - kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr; - kern_sg_entries = ctsio->kern_sg_entries; - } else { - kern_sglist = &kern_entry; - kern_sglist->addr = ctsio->kern_data_ptr; - kern_sglist->len = ctsio->kern_data_len; - kern_sg_entries = 1; - } - - - kern_watermark = 0; - ext_watermark = ext_offset; - len_copied = 0; - for (i = ext_sg_start, j = 0; - i < ext_sg_entries && j < kern_sg_entries;) { - uint8_t *ext_ptr, *kern_ptr; - - len_to_copy = MIN(ext_sglist[i].len - ext_watermark, - kern_sglist[j].len - kern_watermark); - - ext_ptr = (uint8_t *)ext_sglist[i].addr; - ext_ptr = ext_ptr + ext_watermark; - if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) { - /* - * XXX KDM fix this! 
- */ - panic("need to implement bus address support"); -#if 0 - kern_ptr = bus_to_virt(kern_sglist[j].addr); -#endif - } else - kern_ptr = (uint8_t *)kern_sglist[j].addr; - kern_ptr = kern_ptr + kern_watermark; - - kern_watermark += len_to_copy; - ext_watermark += len_to_copy; - - if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) == - CTL_FLAG_DATA_IN) { - CTL_DEBUG_PRINT(("%s: copying %d bytes to user\n", - __func__, len_to_copy)); - CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__, - kern_ptr, ext_ptr)); - memcpy(ext_ptr, kern_ptr, len_to_copy); - } else { - CTL_DEBUG_PRINT(("%s: copying %d bytes from user\n", - __func__, len_to_copy)); - CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__, - ext_ptr, kern_ptr)); - memcpy(kern_ptr, ext_ptr, len_to_copy); - } - - len_copied += len_to_copy; - - if (ext_sglist[i].len == ext_watermark) { - i++; - ext_watermark = 0; - } - - if (kern_sglist[j].len == kern_watermark) { - j++; - kern_watermark = 0; - } - } - - ctsio->ext_data_filled += len_copied; - - CTL_DEBUG_PRINT(("%s: ext_sg_entries: %d, kern_sg_entries: %d\n", - __func__, ext_sg_entries, kern_sg_entries)); - CTL_DEBUG_PRINT(("%s: ext_data_len = %d, kern_data_len = %d\n", - __func__, ctsio->ext_data_len, ctsio->kern_data_len)); - - - /* XXX KDM set residual?? */ -bailout: - - if (ext_sglist_malloced != 0) - free(ext_sglist, M_CTL_CFI); - - io->scsiio.be_move_done(io); - - return; -} - -/* - * For any sort of check condition, busy, etc., we just retry. We do not - * decrement the retry count for unit attention type errors. These are - * normal, and we want to save the retry count for "real" errors. Otherwise, - * we could end up with situations where a command will succeed in some - * situations and fail in others, depending on whether a unit attention is - * pending. Also, some of our error recovery actions, most notably the - * LUN reset action, will cause a unit attention. - * - * We can add more detail here later if necessary. - */ -static cfi_error_action -cfi_checkcond_parse(union ctl_io *io, struct cfi_lun_io *lun_io) -{ - cfi_error_action error_action; - int error_code, sense_key, asc, ascq; - - /* - * Default to retrying the command. - */ - error_action = CFI_ERR_RETRY; - - scsi_extract_sense_len(&io->scsiio.sense_data, - io->scsiio.sense_len, - &error_code, - &sense_key, - &asc, - &ascq, - /*show_errors*/ 1); - - switch (error_code) { - case SSD_DEFERRED_ERROR: - case SSD_DESC_DEFERRED_ERROR: - error_action |= CFI_ERR_NO_DECREMENT; - break; - case SSD_CURRENT_ERROR: - case SSD_DESC_CURRENT_ERROR: - default: { - switch (sense_key) { - case SSD_KEY_UNIT_ATTENTION: - error_action |= CFI_ERR_NO_DECREMENT; - break; - case SSD_KEY_HARDWARE_ERROR: - /* - * This is our generic "something bad happened" - * error code. It often isn't recoverable. - */ - if ((asc == 0x44) && (ascq == 0x00)) - error_action = CFI_ERR_FAIL; - break; - case SSD_KEY_NOT_READY: - /* - * If the LUN is powered down, there likely isn't - * much point in retrying right now. - */ - if ((asc == 0x04) && (ascq == 0x02)) - error_action = CFI_ERR_FAIL; - /* - * If the LUN is offline, there probably isn't much - * point in retrying, either. 
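
This check-condition parser reduces SCSI sense data to a retry/fail decision plus a "don't charge a retry" flag for deferred errors and unit attentions. Its decision table boils down to the sketch below; the flag layout mirrors the driver's cfi_error_action and the SSD_* constants mirror CAM's sense codes, but the function itself is only an illustration, not the driver's code:

/* Sketch: same flag layout as cfi_error_action above. */
#define	ERR_RETRY		0x000
#define	ERR_FAIL		0x001
#define	ERR_MASK		0x0ff
#define	ERR_NO_DECREMENT	0x100	/* don't charge a retry */

#define	SSD_DEFERRED_ERROR	0x71	/* fixed-format deferred sense */
#define	SSD_DESC_DEFERRED_ERROR	0x73	/* descriptor-format deferred */
#define	SSD_KEY_NOT_READY	0x02
#define	SSD_KEY_HARDWARE_ERROR	0x04
#define	SSD_KEY_UNIT_ATTENTION	0x06

static int
checkcond_action(int error_code, int sense_key, int asc, int ascq)
{
	int action = ERR_RETRY;

	if (error_code == SSD_DEFERRED_ERROR ||
	    error_code == SSD_DESC_DEFERRED_ERROR)
		return (action | ERR_NO_DECREMENT);

	switch (sense_key) {
	case SSD_KEY_UNIT_ATTENTION:
		/* Normal after e.g. a LUN reset; retry for free. */
		action |= ERR_NO_DECREMENT;
		break;
	case SSD_KEY_HARDWARE_ERROR:
		if (asc == 0x44 && ascq == 0x00)	/* internal failure */
			action = ERR_FAIL;
		break;
	case SSD_KEY_NOT_READY:
		/* LUN stopped (04/02) or offline (04/03): don't bother. */
		if (asc == 0x04 && (ascq == 0x02 || ascq == 0x03))
			action = ERR_FAIL;
		break;
	}
	return (action);
}
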
- */ - if ((asc == 0x04) && (ascq == 0x03)) - error_action = CFI_ERR_FAIL; - break; - } - } - } - - return (error_action); -} - -static cfi_error_action -cfi_error_parse(union ctl_io *io, struct cfi_lun_io *lun_io) -{ - cfi_error_action error_action; - - error_action = CFI_ERR_RETRY; - - switch (io->io_hdr.io_type) { - case CTL_IO_SCSI: - switch (io->io_hdr.status & CTL_STATUS_MASK) { - case CTL_SCSI_ERROR: - switch (io->scsiio.scsi_status) { - case SCSI_STATUS_RESERV_CONFLICT: - /* - * For a reservation conflict, we'll usually - * want the hard error recovery policy, so - * we'll reset the LUN. - */ - if (lun_io->policy == CFI_ERR_HARD) - error_action = - CFI_ERR_LUN_RESET; - else - error_action = - CFI_ERR_RETRY; - break; - case SCSI_STATUS_CHECK_COND: - default: - error_action = cfi_checkcond_parse(io, lun_io); - break; - } - break; - default: - error_action = CFI_ERR_RETRY; - break; - } - break; - case CTL_IO_TASK: - /* - * In theory task management commands shouldn't fail... - */ - error_action = CFI_ERR_RETRY; - break; - default: - printf("%s: invalid ctl_io type %d\n", __func__, - io->io_hdr.io_type); - panic("%s: invalid ctl_io type %d\n", __func__, - io->io_hdr.io_type); - break; - } - - return (error_action); -} - -static void -cfi_init_io(union ctl_io *io, struct cfi_lun *lun, - struct cfi_metatask *metatask, cfi_error_policy policy, int retries, - struct cfi_lun_io *orig_lun_io, - void (*done_function)(union ctl_io *io)) -{ - struct cfi_lun_io *lun_io; - - io->io_hdr.nexus.initid.id = 7; - io->io_hdr.nexus.targ_port = lun->softc->port.targ_port; - io->io_hdr.nexus.targ_target.id = 0; - io->io_hdr.nexus.targ_lun = lun->lun_id; - io->io_hdr.retries = retries; - lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv; - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = lun_io; - lun_io->lun = lun; - lun_io->metatask = metatask; - lun_io->ctl_io = io; - lun_io->policy = policy; - lun_io->orig_lun_io = orig_lun_io; - lun_io->done_function = done_function; - /* - * We only set the tag number for SCSI I/Os. For task management - * commands, the tag number is only really needed for aborts, so - * the caller can set it if necessary. - */ - switch (io->io_hdr.io_type) { - case CTL_IO_SCSI: - io->scsiio.tag_num = lun->cur_tag_num++; - break; - case CTL_IO_TASK: - default: - break; - } -} - -static void -cfi_done(union ctl_io *io) -{ - struct cfi_lun_io *lun_io; - struct cfi_softc *softc; - struct cfi_lun *lun; - - lun_io = (struct cfi_lun_io *) - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; - - lun = lun_io->lun; - softc = lun->softc; - - /* - * Very minimal retry logic. We basically retry if we got an error - * back, and the retry count is greater than 0. If we ever want - * more sophisticated initiator type behavior, the CAM error - * recovery code in ../common might be helpful. 
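
cfi_done() then acts on that classification: anything other than a hard FAIL is resubmitted, and the retry counter is charged only when the NO_DECREMENT flag is clear, so unit attentions cannot exhaust the budget reserved for real errors. A minimal sketch of that driver logic, reusing the flag names from the previous sketch and with a hypothetical submit() standing in for ctl_queue():

#define	ERR_FAIL		0x001
#define	ERR_MASK		0x0ff
#define	ERR_NO_DECREMENT	0x100

struct io {
	int	status;		/* nonzero on error */
	int	retries;
};

extern int submit(struct io *io);	/* stands in for ctl_queue() */

static void
io_done(struct io *io, int action)
{
	if (io->status != 0 && io->retries > 0 &&
	    (action & ERR_MASK) != ERR_FAIL) {
		if ((action & ERR_NO_DECREMENT) == 0)
			io->retries--;
		io->status = 0;		/* CTL_STATUS_NONE */
		if (submit(io) == 0)
			return;		/* completion re-enters io_done */
	}
	/* otherwise deliver the final status to the done function */
}
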
- */ - if (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) - && (io->io_hdr.retries > 0)) { - ctl_io_status old_status; - cfi_error_action error_action; - - error_action = cfi_error_parse(io, lun_io); - - switch (error_action & CFI_ERR_MASK) { - case CFI_ERR_FAIL: - goto done; - break; /* NOTREACHED */ - case CFI_ERR_LUN_RESET: { - union ctl_io *new_io; - struct cfi_lun_io *new_lun_io; - - new_io = ctl_alloc_io(softc->port.ctl_pool_ref); - ctl_zero_io(new_io); - - new_io->io_hdr.io_type = CTL_IO_TASK; - new_io->taskio.task_action = CTL_TASK_LUN_RESET; - - cfi_init_io(new_io, - /*lun*/ lun_io->lun, - /*metatask*/ NULL, - /*policy*/ CFI_ERR_SOFT, - /*retries*/ 0, - /*orig_lun_io*/lun_io, - /*done_function*/ cfi_err_recovery_done); - - - new_lun_io = (struct cfi_lun_io *) - new_io->io_hdr.port_priv; - - mtx_lock(&lun->softc->lock); - STAILQ_INSERT_TAIL(&lun->io_list, new_lun_io, links); - mtx_unlock(&lun->softc->lock); - - io = new_io; - break; - } - case CFI_ERR_RETRY: - default: - if ((error_action & CFI_ERR_NO_DECREMENT) == 0) - io->io_hdr.retries--; - break; - } - - old_status = io->io_hdr.status; - io->io_hdr.status = CTL_STATUS_NONE; -#if 0 - io->io_hdr.flags &= ~CTL_FLAG_ALREADY_DONE; -#endif - io->io_hdr.flags &= ~CTL_FLAG_ABORT; - io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; - - if (ctl_queue(io) != CTL_RETVAL_COMPLETE) { - printf("%s: error returned from ctl_queue()!\n", - __func__); - io->io_hdr.status = old_status; - } else - return; - } -done: - lun_io->done_function(io); -} - -static void -cfi_lun_probe_done(union ctl_io *io) -{ - struct cfi_lun *lun; - struct cfi_lun_io *lun_io; - - lun_io = (struct cfi_lun_io *) - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; - lun = lun_io->lun; - - switch (lun->state) { - case CFI_LUN_INQUIRY: { - if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) { - /* print out something here?? 
*/ - printf("%s: LUN %d probe failed because inquiry " - "failed\n", __func__, lun->lun_id); - ctl_io_error_print(io, NULL); - } else { - - if (SID_TYPE(&lun->inq_data) != T_DIRECT) { - char path_str[40]; - - lun->state = CFI_LUN_READY; - ctl_scsi_path_string(io, path_str, - sizeof(path_str)); - printf("%s", path_str); - scsi_print_inquiry(&lun->inq_data); - } else { - lun->state = CFI_LUN_READCAPACITY; - cfi_lun_probe(lun, /*have_lock*/ 0); - } - } - mtx_lock(&lun->softc->lock); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&lun->softc->lock); - ctl_free_io(io); - break; - } - case CFI_LUN_READCAPACITY: - case CFI_LUN_READCAPACITY_16: { - uint64_t maxlba; - uint32_t blocksize; - - maxlba = 0; - blocksize = 0; - - if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) { - printf("%s: LUN %d probe failed because READ CAPACITY " - "failed\n", __func__, lun->lun_id); - ctl_io_error_print(io, NULL); - } else { - - if (lun->state == CFI_LUN_READCAPACITY) { - struct scsi_read_capacity_data *rdcap; - - rdcap = (struct scsi_read_capacity_data *) - io->scsiio.ext_data_ptr; - - maxlba = scsi_4btoul(rdcap->addr); - blocksize = scsi_4btoul(rdcap->length); - if (blocksize == 0) { - printf("%s: LUN %d has invalid " - "blocksize 0, probe aborted\n", - __func__, lun->lun_id); - } else if (maxlba == 0xffffffff) { - lun->state = CFI_LUN_READCAPACITY_16; - cfi_lun_probe(lun, /*have_lock*/ 0); - } else - lun->state = CFI_LUN_READY; - } else { - struct scsi_read_capacity_data_long *rdcap_long; - - rdcap_long = (struct - scsi_read_capacity_data_long *) - io->scsiio.ext_data_ptr; - maxlba = scsi_8btou64(rdcap_long->addr); - blocksize = scsi_4btoul(rdcap_long->length); - - if (blocksize == 0) { - printf("%s: LUN %d has invalid " - "blocksize 0, probe aborted\n", - __func__, lun->lun_id); - } else - lun->state = CFI_LUN_READY; - } - } - - if (lun->state == CFI_LUN_READY) { - char path_str[40]; - - lun->num_blocks = maxlba + 1; - lun->blocksize = blocksize; - - /* - * If this is true, the blocksize is a power of 2. - * We already checked for 0 above. - */ - if (((blocksize - 1) & blocksize) == 0) { - int i; - - for (i = 0; i < 32; i++) { - if ((blocksize & (1 << i)) != 0) { - lun->blocksize_powerof2 = i; - break; - } - } - } - ctl_scsi_path_string(io, path_str,sizeof(path_str)); - printf("%s", path_str); - scsi_print_inquiry(&lun->inq_data); - printf("%s %ju blocks, blocksize %d\n", path_str, - (uintmax_t)maxlba + 1, blocksize); - } - mtx_lock(&lun->softc->lock); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&lun->softc->lock); - free(io->scsiio.ext_data_ptr, M_CTL_CFI); - ctl_free_io(io); - break; - } - case CFI_LUN_READY: - default: - mtx_lock(&lun->softc->lock); - /* How did we get here?? 
*/ - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&lun->softc->lock); - ctl_free_io(io); - break; - } -} - -static void -cfi_lun_probe(struct cfi_lun *lun, int have_lock) -{ - - if (have_lock == 0) - mtx_lock(&lun->softc->lock); - if ((lun->softc->flags & CFI_ONLINE) == 0) { - if (have_lock == 0) - mtx_unlock(&lun->softc->lock); - return; - } - if (have_lock == 0) - mtx_unlock(&lun->softc->lock); - - switch (lun->state) { - case CFI_LUN_INQUIRY: { - struct cfi_lun_io *lun_io; - union ctl_io *io; - - io = ctl_alloc_io(lun->softc->port.ctl_pool_ref); - ctl_scsi_inquiry(io, - /*data_ptr*/(uint8_t *)&lun->inq_data, - /*data_len*/ sizeof(lun->inq_data), - /*byte2*/ 0, - /*page_code*/ 0, - /*tag_type*/ CTL_TAG_SIMPLE, - /*control*/ 0); - - cfi_init_io(io, - /*lun*/ lun, - /*metatask*/ NULL, - /*policy*/ CFI_ERR_SOFT, - /*retries*/ 5, - /*orig_lun_io*/ NULL, - /*done_function*/ - cfi_lun_probe_done); - - lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv; - - if (have_lock == 0) - mtx_lock(&lun->softc->lock); - STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links); - if (have_lock == 0) - mtx_unlock(&lun->softc->lock); - - if (ctl_queue(io) != CTL_RETVAL_COMPLETE) { - printf("%s: error returned from ctl_queue()!\n", - __func__); - STAILQ_REMOVE(&lun->io_list, lun_io, - cfi_lun_io, links); - ctl_free_io(io); - } - break; - } - case CFI_LUN_READCAPACITY: - case CFI_LUN_READCAPACITY_16: { - struct cfi_lun_io *lun_io; - uint8_t *dataptr; - union ctl_io *io; - - io = ctl_alloc_io(lun->softc->port.ctl_pool_ref); - - dataptr = malloc(sizeof(struct scsi_read_capacity_data_long), - M_CTL_CFI, M_NOWAIT); - if (dataptr == NULL) { - printf("%s: unable to allocate SCSI read capacity " - "buffer for lun %d\n", __func__, lun->lun_id); - return; - } - if (lun->state == CFI_LUN_READCAPACITY) { - ctl_scsi_read_capacity(io, - /*data_ptr*/ dataptr, - /*data_len*/ - sizeof(struct scsi_read_capacity_data_long), - /*addr*/ 0, - /*reladr*/ 0, - /*pmi*/ 0, - /*tag_type*/ CTL_TAG_SIMPLE, - /*control*/ 0); - } else { - ctl_scsi_read_capacity_16(io, - /*data_ptr*/ dataptr, - /*data_len*/ - sizeof(struct scsi_read_capacity_data_long), - /*addr*/ 0, - /*reladr*/ 0, - /*pmi*/ 0, - /*tag_type*/ CTL_TAG_SIMPLE, - /*control*/ 0); - } - cfi_init_io(io, - /*lun*/ lun, - /*metatask*/ NULL, - /*policy*/ CFI_ERR_SOFT, - /*retries*/ 7, - /*orig_lun_io*/ NULL, - /*done_function*/ cfi_lun_probe_done); - - lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv; - - if (have_lock == 0) - mtx_lock(&lun->softc->lock); - STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links); - if (have_lock == 0) - mtx_unlock(&lun->softc->lock); - - if (ctl_queue(io) != CTL_RETVAL_COMPLETE) { - printf("%s: error returned from ctl_queue()!\n", - __func__); - STAILQ_REMOVE(&lun->io_list, lun_io, - cfi_lun_io, links); - free(dataptr, M_CTL_CFI); - ctl_free_io(io); - } - break; - } - case CFI_LUN_READY: - default: - /* Why were we called? */ - break; - } -} - -static void -cfi_metatask_done(struct cfi_softc *softc, struct cfi_metatask *metatask) -{ - mtx_lock(&softc->lock); - STAILQ_REMOVE(&softc->metatask_list, metatask, cfi_metatask, links); - mtx_unlock(&softc->lock); - - /* - * Return status to the caller. Caller allocated storage, and is - * responsible for calling cfi_free_metatask to release it once - * they've seen the status. 
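
That callback is the entire completion contract: the caller allocates the metatask, fills in the task type and callback, dispatches it, and frees it from the callback once the status has been recorded. The header removed further down documents exactly this sequence; from the caller's side it would have looked roughly like the sketch below, where struct my_waiter and the use of wakeup(9) are hypothetical caller-side details, not part of the removed API:

struct my_waiter {
	cfi_mt_status	status;		/* hypothetical caller state */
};

static void
shutdown_cb(void *arg, struct cfi_metatask *metatask)
{
	struct my_waiter *w = arg;

	/* Note the status, then release the metatask. */
	w->status = metatask->status;
	cfi_free_metatask(metatask);
	wakeup(w);
}

static int
shutdown_luns(struct my_waiter *w)
{
	struct cfi_metatask *metatask;

	metatask = cfi_alloc_metatask(/*can_wait*/ 1);
	if (metatask == NULL)
		return (ENOMEM);
	metatask->tasktype = CFI_TASK_SHUTDOWN;
	metatask->callback = shutdown_cb;
	metatask->callback_arg = w;
	cfi_action(metatask);	/* completion arrives via shutdown_cb */
	return (0);
}
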
- */ - metatask->callback(metatask->callback_arg, metatask); -} - -static void -cfi_metatask_bbr_errorparse(struct cfi_metatask *metatask, union ctl_io *io) -{ - int error_code, sense_key, asc, ascq; - - if (metatask->tasktype != CFI_TASK_BBRREAD) - return; - - if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) { - metatask->status = CFI_MT_SUCCESS; - metatask->taskinfo.bbrread.status = CFI_BBR_SUCCESS; - return; - } - - if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SCSI_ERROR) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_ERROR; - return; - } - - metatask->taskinfo.bbrread.scsi_status = io->scsiio.scsi_status; - memcpy(&metatask->taskinfo.bbrread.sense_data, &io->scsiio.sense_data, - MIN(sizeof(metatask->taskinfo.bbrread.sense_data), - sizeof(io->scsiio.sense_data))); - - if (io->scsiio.scsi_status == SCSI_STATUS_RESERV_CONFLICT) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_RESERV_CONFLICT; - return; - } - - if (io->scsiio.scsi_status != SCSI_STATUS_CHECK_COND) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR; - return; - } - - scsi_extract_sense_len(&io->scsiio.sense_data, - io->scsiio.sense_len, - &error_code, - &sense_key, - &asc, - &ascq, - /*show_errors*/ 1); - - switch (error_code) { - case SSD_DEFERRED_ERROR: - case SSD_DESC_DEFERRED_ERROR: - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR; - break; - case SSD_CURRENT_ERROR: - case SSD_DESC_CURRENT_ERROR: - default: { - struct scsi_sense_data *sense; - - sense = &io->scsiio.sense_data; - - if ((asc == 0x04) && (ascq == 0x02)) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_LUN_STOPPED; - } else if ((asc == 0x04) && (ascq == 0x03)) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = - CFI_BBR_LUN_OFFLINE_CTL; - } else if ((asc == 0x44) && (ascq == 0x00)) { -#ifdef NEEDTOPORT - if (sense->sense_key_spec[0] & SSD_SCS_VALID) { - uint16_t retry_count; - - retry_count = sense->sense_key_spec[1] << 8 | - sense->sense_key_spec[2]; - if (((retry_count & 0xf000) == CSC_RAIDCORE) - && ((retry_count & 0x0f00) == CSC_SHELF_SW) - && ((retry_count & 0xff) == - RC_STS_DEVICE_OFFLINE)) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = - CFI_BBR_LUN_OFFLINE_RC; - } else { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = - CFI_BBR_SCSI_ERROR; - } - } else { -#endif /* NEEDTOPORT */ - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = - CFI_BBR_SCSI_ERROR; -#ifdef NEEDTOPORT - } -#endif - } else { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR; - } - break; - } - } -} - -static void -cfi_metatask_io_done(union ctl_io *io) -{ - struct cfi_lun_io *lun_io; - struct cfi_metatask *metatask; - struct cfi_softc *softc; - struct cfi_lun *lun; - - lun_io = (struct cfi_lun_io *) - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; - - lun = lun_io->lun; - softc = lun->softc; - - metatask = lun_io->metatask; - - switch (metatask->tasktype) { - case CFI_TASK_STARTUP: - case CFI_TASK_SHUTDOWN: { - int failed, done, is_start; - - failed = 0; - done = 0; - if (metatask->tasktype == CFI_TASK_STARTUP) - is_start = 1; - else - is_start = 0; - - mtx_lock(&softc->lock); - if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) - metatask->taskinfo.startstop.luns_complete++; - else { - metatask->taskinfo.startstop.luns_failed++; - 
failed = 1; - } - if ((metatask->taskinfo.startstop.luns_complete + - metatask->taskinfo.startstop.luns_failed) >= - metatask->taskinfo.startstop.total_luns) - done = 1; - - mtx_unlock(&softc->lock); - - if (failed != 0) { - printf("%s: LUN %d %s request failed\n", __func__, - lun_io->lun->lun_id, (is_start == 1) ? "start" : - "stop"); - ctl_io_error_print(io, &lun_io->lun->inq_data); - } - if (done != 0) { - if (metatask->taskinfo.startstop.luns_failed > 0) - metatask->status = CFI_MT_ERROR; - else - metatask->status = CFI_MT_SUCCESS; - cfi_metatask_done(softc, metatask); - } - mtx_lock(&softc->lock); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&softc->lock); - - ctl_free_io(io); - break; - } - case CFI_TASK_BBRREAD: { - /* - * Translate the SCSI error into an enumeration. - */ - cfi_metatask_bbr_errorparse(metatask, io); - - mtx_lock(&softc->lock); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&softc->lock); - - ctl_free_io(io); - - cfi_metatask_done(softc, metatask); - break; - } - default: - /* - * This shouldn't happen. - */ - mtx_lock(&softc->lock); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&softc->lock); - - ctl_free_io(io); - break; - } -} - -static void -cfi_err_recovery_done(union ctl_io *io) -{ - struct cfi_lun_io *lun_io, *orig_lun_io; - struct cfi_lun *lun; - union ctl_io *orig_io; - - lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv; - orig_lun_io = lun_io->orig_lun_io; - orig_io = orig_lun_io->ctl_io; - lun = lun_io->lun; - - if (io->io_hdr.status != CTL_SUCCESS) { - printf("%s: error recovery action failed. Original " - "error:\n", __func__); - - ctl_io_error_print(orig_lun_io->ctl_io, &lun->inq_data); - - printf("%s: error from error recovery action:\n", __func__); - - ctl_io_error_print(io, &lun->inq_data); - - printf("%s: trying original command again...\n", __func__); - } - - mtx_lock(&lun->softc->lock); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - mtx_unlock(&lun->softc->lock); - ctl_free_io(io); - - orig_io->io_hdr.retries--; - orig_io->io_hdr.status = CTL_STATUS_NONE; - - if (ctl_queue(orig_io) != CTL_RETVAL_COMPLETE) { - printf("%s: error returned from ctl_queue()!\n", __func__); - STAILQ_REMOVE(&lun->io_list, orig_lun_io, - cfi_lun_io, links); - ctl_free_io(orig_io); - } -} - -static void -cfi_lun_io_done(union ctl_io *io) -{ - struct cfi_lun *lun; - struct cfi_lun_io *lun_io; - - lun_io = (struct cfi_lun_io *) - io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; - lun = lun_io->lun; - - if (lun_io->metatask == NULL) { - printf("%s: I/O has no metatask pointer, discarding\n", - __func__); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - ctl_free_io(io); - return; - } - cfi_metatask_io_done(io); -} - -void -cfi_action(struct cfi_metatask *metatask) -{ - struct cfi_softc *softc; - - softc = &fetd_internal_softc; - - mtx_lock(&softc->lock); - - STAILQ_INSERT_TAIL(&softc->metatask_list, metatask, links); - - if ((softc->flags & CFI_ONLINE) == 0) { - mtx_unlock(&softc->lock); - metatask->status = CFI_MT_PORT_OFFLINE; - cfi_metatask_done(softc, metatask); - return; - } else - mtx_unlock(&softc->lock); - - switch (metatask->tasktype) { - case CFI_TASK_STARTUP: - case CFI_TASK_SHUTDOWN: { - union ctl_io *io; - int da_luns, ios_allocated, do_start; - struct cfi_lun *lun; - STAILQ_HEAD(, ctl_io_hdr) tmp_io_list; - - da_luns = 0; - ios_allocated = 0; - STAILQ_INIT(&tmp_io_list); - - if (metatask->tasktype == CFI_TASK_STARTUP) - do_start = 1; - else - do_start = 0; - - 
mtx_lock(&softc->lock); - STAILQ_FOREACH(lun, &softc->lun_list, links) { - if (lun->state != CFI_LUN_READY) - continue; - - if (SID_TYPE(&lun->inq_data) != T_DIRECT) - continue; - da_luns++; - io = ctl_alloc_io_nowait(softc->port.ctl_pool_ref); - if (io != NULL) { - ios_allocated++; - STAILQ_INSERT_TAIL(&tmp_io_list, &io->io_hdr, - links); - } - } - - if (ios_allocated < da_luns) { - printf("%s: error allocating ctl_io for %s\n", - __func__, (do_start == 1) ? "startup" : - "shutdown"); - da_luns = ios_allocated; - } - - metatask->taskinfo.startstop.total_luns = da_luns; - - STAILQ_FOREACH(lun, &softc->lun_list, links) { - struct cfi_lun_io *lun_io; - - if (lun->state != CFI_LUN_READY) - continue; - - if (SID_TYPE(&lun->inq_data) != T_DIRECT) - continue; - - io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list); - if (io == NULL) - break; - - STAILQ_REMOVE(&tmp_io_list, &io->io_hdr, ctl_io_hdr, - links); - - ctl_scsi_start_stop(io, - /*start*/ do_start, - /*load_eject*/ 0, - /*immediate*/ 0, - /*power_conditions*/ - SSS_PC_START_VALID, - /*onoffline*/ 1, - /*ctl_tag_type*/ CTL_TAG_ORDERED, - /*control*/ 0); - - cfi_init_io(io, - /*lun*/ lun, - /*metatask*/ metatask, - /*policy*/ CFI_ERR_HARD, - /*retries*/ 3, - /*orig_lun_io*/ NULL, - /*done_function*/ cfi_lun_io_done); - - lun_io = (struct cfi_lun_io *) io->io_hdr.port_priv; - - STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links); - - if (ctl_queue(io) != CTL_RETVAL_COMPLETE) { - printf("%s: error returned from ctl_queue()!\n", - __func__); - STAILQ_REMOVE(&lun->io_list, lun_io, - cfi_lun_io, links); - ctl_free_io(io); - metatask->taskinfo.startstop.total_luns--; - } - } - - if (STAILQ_FIRST(&tmp_io_list) != NULL) { - printf("%s: error: tmp_io_list != NULL\n", __func__); - for (io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list); - io != NULL; - io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list)) { - STAILQ_REMOVE(&tmp_io_list, &io->io_hdr, - ctl_io_hdr, links); - ctl_free_io(io); - } - } - mtx_unlock(&softc->lock); - - break; - } - case CFI_TASK_BBRREAD: { - union ctl_io *io; - struct cfi_lun *lun; - struct cfi_lun_io *lun_io; - cfi_bbrread_status status; - int req_lun_num; - uint32_t num_blocks; - - status = CFI_BBR_SUCCESS; - - req_lun_num = metatask->taskinfo.bbrread.lun_num; - - mtx_lock(&softc->lock); - STAILQ_FOREACH(lun, &softc->lun_list, links) { - if (lun->lun_id != req_lun_num) - continue; - if (lun->state != CFI_LUN_READY) { - status = CFI_BBR_LUN_UNCONFIG; - break; - } else - break; - } - - if (lun == NULL) - status = CFI_BBR_NO_LUN; - - if (status != CFI_BBR_SUCCESS) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = status; - mtx_unlock(&softc->lock); - cfi_metatask_done(softc, metatask); - break; - } - - /* - * Convert the number of bytes given into blocks and check - * that the number of bytes is a multiple of the blocksize. - * CTL will verify that the LBA is okay. - */ - if (lun->blocksize_powerof2 != 0) { - if ((metatask->taskinfo.bbrread.len & - (lun->blocksize - 1)) != 0) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = - CFI_BBR_BAD_LEN; - cfi_metatask_done(softc, metatask); - break; - } - - num_blocks = metatask->taskinfo.bbrread.len >> - lun->blocksize_powerof2; - } else { - /* - * XXX KDM this could result in floating point - * division, which isn't supported in the kernel on - * x86 at least. 
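
The special-casing around this comment exists to dodge 64-bit division in the kernel: when a LUN's blocksize is a power of two, the probe code records the exponent once and later converts byte counts to block counts with a shift. The test and exponent derivation reduce to this small standalone sketch:

#include <stdint.h>

/*
 * Derive a shift for a power-of-two blocksize so byte counts
 * convert to block counts without division.
 */
static int
blocksize_shift(uint32_t bs)
{
	int shift;

	if (bs == 0 || ((bs - 1) & bs) != 0)
		return (-1);	/* zero, or not a power of two */
	for (shift = 0; (bs & (1u << shift)) == 0; shift++)
		;
	return (shift);
}

For example, blocksize_shift(512) returns 9, so len >> 9 equals len / 512.
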
- */ - if ((metatask->taskinfo.bbrread.len % - lun->blocksize) != 0) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = - CFI_BBR_BAD_LEN; - cfi_metatask_done(softc, metatask); - break; - } - - /* - * XXX KDM this could result in floating point - * division in some cases. - */ - num_blocks = metatask->taskinfo.bbrread.len / - lun->blocksize; - - } - - io = ctl_alloc_io_nowait(softc->port.ctl_pool_ref); - if (io == NULL) { - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_NO_MEM; - mtx_unlock(&softc->lock); - cfi_metatask_done(softc, metatask); - break; - } - - /* - * XXX KDM need to do a read capacity to get the blocksize - * for this device. - */ - ctl_scsi_read_write(io, - /*data_ptr*/ NULL, - /*data_len*/ metatask->taskinfo.bbrread.len, - /*read_op*/ 1, - /*byte2*/ 0, - /*minimum_cdb_size*/ 0, - /*lba*/ metatask->taskinfo.bbrread.lba, - /*num_blocks*/ num_blocks, - /*tag_type*/ CTL_TAG_SIMPLE, - /*control*/ 0); - - cfi_init_io(io, - /*lun*/ lun, - /*metatask*/ metatask, - /*policy*/ CFI_ERR_SOFT, - /*retries*/ 3, - /*orig_lun_io*/ NULL, - /*done_function*/ cfi_lun_io_done); - - lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv; - - STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links); - - if (ctl_queue(io) != CTL_RETVAL_COMPLETE) { - printf("%s: error returned from ctl_queue()!\n", - __func__); - STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links); - ctl_free_io(io); - metatask->status = CFI_MT_ERROR; - metatask->taskinfo.bbrread.status = CFI_BBR_ERROR; - mtx_unlock(&softc->lock); - cfi_metatask_done(softc, metatask); - break; - } - - mtx_unlock(&softc->lock); - break; - } - default: - panic("invalid metatask type %d", metatask->tasktype); - break; /* NOTREACHED */ - } -} - -struct cfi_metatask * -cfi_alloc_metatask(int can_wait) -{ - struct cfi_metatask *metatask; - struct cfi_softc *softc; - - softc = &fetd_internal_softc; - - metatask = uma_zalloc(cfi_metatask_zone, - (can_wait ? M_WAITOK : M_NOWAIT) | M_ZERO); - if (metatask == NULL) - return (NULL); - - metatask->status = CFI_MT_NONE; - - return (metatask); -} - -void -cfi_free_metatask(struct cfi_metatask *metatask) -{ - - uma_zfree(cfi_metatask_zone, metatask); -} - -/* - * vim: ts=8 - */ diff --git a/sys/cam/ctl/ctl_frontend_internal.h b/sys/cam/ctl/ctl_frontend_internal.h deleted file mode 100644 index cb00dc6..0000000 --- a/sys/cam/ctl/ctl_frontend_internal.h +++ /dev/null @@ -1,154 +0,0 @@ -/*- - * Copyright (c) 2004 Silicon Graphics International Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * substantially similar to the "NO WARRANTY" disclaimer below - * ("Disclaimer") and any redistribution must be conditioned upon - * including a substantially similar Disclaimer requirement for further - * binary redistribution. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGES. - * - * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_frontend_internal.h#1 $ - * $FreeBSD$ - */ -/* - * CTL kernel internal frontend target driver. This allows kernel-level - * clients to send commands into CTL. - * - * Author: Ken Merry <ken@FreeBSD.org> - */ - -#ifndef _CTL_FRONTEND_INTERNAL_H_ -#define _CTL_FRONTEND_INTERNAL_H_ - -/* - * These are general metatask error codes. If the error code is CFI_MT_ERROR, - * check any metatask-specific status codes for more detail on the problem. - */ -typedef enum { - CFI_MT_NONE, - CFI_MT_PORT_OFFLINE, - CFI_MT_ERROR, - CFI_MT_SUCCESS -} cfi_mt_status; - -typedef enum { - CFI_TASK_NONE, - CFI_TASK_SHUTDOWN, - CFI_TASK_STARTUP, - CFI_TASK_BBRREAD -} cfi_tasktype; - -struct cfi_task_startstop { - int total_luns; - int luns_complete; - int luns_failed; -}; - -/* - * Error code description: - * CFI_BBR_SUCCESS - the read was successful - * CFI_BBR_LUN_UNCONFIG - CFI probe for this lun hasn't completed - * CFI_BBR_NO_LUN - this lun doesn't exist, as far as CFI knows - * CFI_BBR_NO_MEM - memory allocation error - * CFI_BBR_BAD_LEN - data length isn't a multiple of the blocksize - * CFI_BBR_RESERV_CONFLICT - another initiator has this lun reserved, so - * we can't issue I/O at all. - * CFI_BBR_LUN_STOPPED - the lun is powered off. - * CFI_BBR_LUN_OFFLINE_CTL - the lun is offline from a CTL standpoint - * CFI_BBR_LUN_OFFLINE_RC - the lun is offline from a RAIDCore standpoint. - * This is bad, because it basically means we've - * had a double failure on the LUN. - * CFI_BBR_SCSI_ERROR - generic SCSI error, see status byte and sense - * data for more resolution if you want it. - * CFI_BBR_ERROR - the catch-all error code. 
- */ -typedef enum { - CFI_BBR_SUCCESS, - CFI_BBR_LUN_UNCONFIG, - CFI_BBR_NO_LUN, - CFI_BBR_NO_MEM, - CFI_BBR_BAD_LEN, - CFI_BBR_RESERV_CONFLICT, - CFI_BBR_LUN_STOPPED, - CFI_BBR_LUN_OFFLINE_CTL, - CFI_BBR_LUN_OFFLINE_RC, - CFI_BBR_SCSI_ERROR, - CFI_BBR_ERROR, -} cfi_bbrread_status; - -struct cfi_task_bbrread { - int lun_num; /* lun number */ - uint64_t lba; /* logical block address */ - int len; /* length in bytes */ - cfi_bbrread_status status; /* BBR status */ - uint8_t scsi_status; /* SCSI status */ - struct scsi_sense_data sense_data; /* SCSI sense data */ -}; - -union cfi_taskinfo { - struct cfi_task_startstop startstop; - struct cfi_task_bbrread bbrread; -}; - -struct cfi_metatask; - -typedef void (*cfi_cb_t)(void *arg, struct cfi_metatask *metatask); - -struct cfi_metatask { - cfi_tasktype tasktype; /* passed to CFI */ - cfi_mt_status status; /* returned from CFI */ - union cfi_taskinfo taskinfo; /* returned from CFI */ - struct ctl_mem_element *element; /* used by CFI, don't touch*/ - cfi_cb_t callback; /* passed to CFI */ - void *callback_arg; /* passed to CFI */ - STAILQ_ENTRY(cfi_metatask) links; /* used by CFI, don't touch*/ -}; - -#ifdef _KERNEL - -MALLOC_DECLARE(M_CTL_CFI); - -/* - * This is the API for sending meta commands (commands that are sent to more - * than one LUN) to the internal frontend: - * - Allocate a metatask using cfi_alloc_metatask(). can_wait == 0 means - * that you're calling from an interrupt context. can_wait == 1 means - * that you're calling from a thread context and don't mind waiting to - * allocate memory. - * - Setup the task type, callback and callback argument. - * - Call cfi_action(). - * - When the callback comes, note the status and any per-command status - * (see the taskinfo union) and then free the metatask with - * cfi_free_metatask(). - */ -struct cfi_metatask *cfi_alloc_metatask(int can_wait); -void cfi_free_metatask(struct cfi_metatask *metatask); -void cfi_action(struct cfi_metatask *metatask); - -#endif /* _KERNEL */ - -#endif /* _CTL_FRONTEND_INTERNAL_H_ */ - -/* - * vim: ts=8 - */ diff --git a/sys/cam/ctl/ctl_frontend_ioctl.c b/sys/cam/ctl/ctl_frontend_ioctl.c new file mode 100644 index 0000000..7d57314 --- /dev/null +++ b/sys/cam/ctl/ctl_frontend_ioctl.c @@ -0,0 +1,470 @@ +/*- + * Copyright (c) 2003-2009 Silicon Graphics International Corp. + * Copyright (c) 2012 The FreeBSD Foundation + * Copyright (c) 2015 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/types.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/condvar.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/queue.h> +#include <sys/sysctl.h> + +#include <cam/cam.h> +#include <cam/scsi/scsi_all.h> +#include <cam/scsi/scsi_da.h> +#include <cam/ctl/ctl_io.h> +#include <cam/ctl/ctl.h> +#include <cam/ctl/ctl_frontend.h> +#include <cam/ctl/ctl_util.h> +#include <cam/ctl/ctl_backend.h> +#include <cam/ctl/ctl_ioctl.h> +#include <cam/ctl/ctl_ha.h> +#include <cam/ctl/ctl_private.h> +#include <cam/ctl/ctl_debug.h> +#include <cam/ctl/ctl_error.h> + +struct cfi_softc { + uint32_t cur_tag_num; + struct ctl_port port; +}; + +static struct cfi_softc cfi_softc; + +static int cfi_init(void); +static void cfi_shutdown(void); +static void cfi_online(void *arg); +static void cfi_offline(void *arg); +static int cfi_lun_enable(void *arg, int lun_id); +static int cfi_lun_disable(void *arg, int lun_id); +static void cfi_datamove(union ctl_io *io); +static void cfi_done(union ctl_io *io); + +static struct ctl_frontend cfi_frontend = +{ + .name = "ioctl", + .init = cfi_init, + .shutdown = cfi_shutdown, +}; +CTL_FRONTEND_DECLARE(ctlioctl, cfi_frontend); + +static int +cfi_init(void) +{ + struct cfi_softc *isoftc = &cfi_softc; + struct ctl_port *port; + + memset(isoftc, 0, sizeof(*isoftc)); + + port = &isoftc->port; + port->frontend = &cfi_frontend; + port->port_type = CTL_PORT_IOCTL; + port->num_requested_ctl_io = 100; + port->port_name = "ioctl"; + port->port_online = cfi_online; + port->port_offline = cfi_offline; + port->onoff_arg = &isoftc; + port->lun_enable = cfi_lun_enable; + port->lun_disable = cfi_lun_disable; + port->targ_lun_arg = &isoftc; + port->fe_datamove = cfi_datamove; + port->fe_done = cfi_done; + port->max_targets = 1; + port->max_target_id = 0; + port->max_initiators = 1; + + if (ctl_port_register(port) != 0) { + printf("%s: ioctl port registration failed\n", __func__); + return (0); + } + ctl_port_online(port); + return (0); +} + +void +cfi_shutdown(void) +{ + struct cfi_softc *isoftc = &cfi_softc; + struct ctl_port *port; + + port = &isoftc->port; + ctl_port_offline(port); + if (ctl_port_deregister(&isoftc->port) != 0) + printf("%s: ctl_frontend_deregister() failed\n", __func__); +} + +static void +cfi_online(void *arg) +{ +} + +static void +cfi_offline(void *arg) +{ +} + +static int +cfi_lun_enable(void *arg, int lun_id) +{ + + return (0); +} + +static int +cfi_lun_disable(void *arg, int lun_id) +{ + + return (0); +} + +/* + * Data movement routine for the CTL ioctl frontend port. 
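
The routine that follows walks the external (user-supplied) scatter/gather list and the kernel S/G list in lock-step: each step copies MIN(bytes left in the current external entry, bytes left in the current kernel entry), advances a per-list watermark, and bumps whichever index just drained. Stripped of the copyin/copyout handling and CTL bookkeeping, the core loop looks like this sketch (memcpy stands in for the user/kernel copies; MIN() comes from <sys/param.h>):

#include <stddef.h>
#include <string.h>
#include <sys/param.h>	/* MIN() */

struct sg_entry {
	void	*addr;
	size_t	 len;
};

/* Copy from src[] to dst[] until either list is exhausted. */
static size_t
sg_copy(struct sg_entry *dst, int ndst, struct sg_entry *src, int nsrc)
{
	size_t dmark = 0, smark = 0, copied = 0, n;
	int i = 0, j = 0;

	while (i < ndst && j < nsrc) {
		n = MIN(dst[i].len - dmark, src[j].len - smark);
		memcpy((char *)dst[i].addr + dmark,
		    (char *)src[j].addr + smark, n);
		dmark += n;
		smark += n;
		copied += n;
		if (dmark == dst[i].len) {	/* dst entry full */
			i++;
			dmark = 0;
		}
		if (smark == src[j].len) {	/* src entry drained */
			j++;
			smark = 0;
		}
	}
	return (copied);
}

The real routine additionally resumes mid-entry via ext_data_filled, which is what the ext_sg_start/ext_offset computation ahead of its loop handles.
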
+ */ +static int +ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio) +{ + struct ctl_sg_entry *ext_sglist, *kern_sglist; + struct ctl_sg_entry ext_entry, kern_entry; + int ext_sglen, ext_sg_entries, kern_sg_entries; + int ext_sg_start, ext_offset; + int len_to_copy, len_copied; + int kern_watermark, ext_watermark; + int ext_sglist_malloced; + int i, j; + + ext_sglist_malloced = 0; + ext_sg_start = 0; + ext_offset = 0; + + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n")); + + /* + * If this flag is set, fake the data transfer. + */ + if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) { + ctsio->ext_data_filled = ctsio->ext_data_len; + goto bailout; + } + + /* + * To simplify things here, if we have a single buffer, stick it in + * a S/G entry and just make it a single entry S/G list. + */ + if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) { + int len_seen; + + ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist); + + ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL, + M_WAITOK); + ext_sglist_malloced = 1; + if (copyin(ctsio->ext_data_ptr, ext_sglist, + ext_sglen) != 0) { + ctl_set_internal_failure(ctsio, + /*sks_valid*/ 0, + /*retry_count*/ 0); + goto bailout; + } + ext_sg_entries = ctsio->ext_sg_entries; + len_seen = 0; + for (i = 0; i < ext_sg_entries; i++) { + if ((len_seen + ext_sglist[i].len) >= + ctsio->ext_data_filled) { + ext_sg_start = i; + ext_offset = ctsio->ext_data_filled - len_seen; + break; + } + len_seen += ext_sglist[i].len; + } + } else { + ext_sglist = &ext_entry; + ext_sglist->addr = ctsio->ext_data_ptr; + ext_sglist->len = ctsio->ext_data_len; + ext_sg_entries = 1; + ext_sg_start = 0; + ext_offset = ctsio->ext_data_filled; + } + + if (ctsio->kern_sg_entries > 0) { + kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr; + kern_sg_entries = ctsio->kern_sg_entries; + } else { + kern_sglist = &kern_entry; + kern_sglist->addr = ctsio->kern_data_ptr; + kern_sglist->len = ctsio->kern_data_len; + kern_sg_entries = 1; + } + + + kern_watermark = 0; + ext_watermark = ext_offset; + len_copied = 0; + for (i = ext_sg_start, j = 0; + i < ext_sg_entries && j < kern_sg_entries;) { + uint8_t *ext_ptr, *kern_ptr; + + len_to_copy = MIN(ext_sglist[i].len - ext_watermark, + kern_sglist[j].len - kern_watermark); + + ext_ptr = (uint8_t *)ext_sglist[i].addr; + ext_ptr = ext_ptr + ext_watermark; + if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) { + /* + * XXX KDM fix this! 
+ */ + panic("need to implement bus address support"); +#if 0 + kern_ptr = bus_to_virt(kern_sglist[j].addr); +#endif + } else + kern_ptr = (uint8_t *)kern_sglist[j].addr; + kern_ptr = kern_ptr + kern_watermark; + + kern_watermark += len_to_copy; + ext_watermark += len_to_copy; + + if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) == + CTL_FLAG_DATA_IN) { + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d " + "bytes to user\n", len_to_copy)); + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p " + "to %p\n", kern_ptr, ext_ptr)); + if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) { + ctl_set_internal_failure(ctsio, + /*sks_valid*/ 0, + /*retry_count*/ 0); + goto bailout; + } + } else { + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d " + "bytes from user\n", len_to_copy)); + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p " + "to %p\n", ext_ptr, kern_ptr)); + if (copyin(ext_ptr, kern_ptr, len_to_copy)!= 0){ + ctl_set_internal_failure(ctsio, + /*sks_valid*/ 0, + /*retry_count*/0); + goto bailout; + } + } + + len_copied += len_to_copy; + + if (ext_sglist[i].len == ext_watermark) { + i++; + ext_watermark = 0; + } + + if (kern_sglist[j].len == kern_watermark) { + j++; + kern_watermark = 0; + } + } + + ctsio->ext_data_filled += len_copied; + + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, " + "kern_sg_entries: %d\n", ext_sg_entries, + kern_sg_entries)); + CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, " + "kern_data_len = %d\n", ctsio->ext_data_len, + ctsio->kern_data_len)); + + + /* XXX KDM set residual?? */ +bailout: + + if (ext_sglist_malloced != 0) + free(ext_sglist, M_CTL); + + return (CTL_RETVAL_COMPLETE); +} + +static void +cfi_datamove(union ctl_io *io) +{ + struct ctl_fe_ioctl_params *params; + + params = (struct ctl_fe_ioctl_params *) + io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; + + mtx_lock(¶ms->ioctl_mtx); + params->state = CTL_IOCTL_DATAMOVE; + cv_broadcast(¶ms->sem); + mtx_unlock(¶ms->ioctl_mtx); +} + +static void +cfi_done(union ctl_io *io) +{ + struct ctl_fe_ioctl_params *params; + + params = (struct ctl_fe_ioctl_params *) + io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; + + mtx_lock(¶ms->ioctl_mtx); + params->state = CTL_IOCTL_DONE; + cv_broadcast(¶ms->sem); + mtx_unlock(¶ms->ioctl_mtx); +} + +static int +cfi_submit_wait(union ctl_io *io) +{ + struct ctl_fe_ioctl_params params; + ctl_fe_ioctl_state last_state; + int done, retval; + + retval = 0; + + bzero(¶ms, sizeof(params)); + + mtx_init(¶ms.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF); + cv_init(¶ms.sem, "ctlioccv"); + params.state = CTL_IOCTL_INPROG; + last_state = params.state; + + io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = ¶ms; + + CTL_DEBUG_PRINT(("cfi_submit_wait\n")); + + /* This shouldn't happen */ + if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE) + return (retval); + + done = 0; + + do { + mtx_lock(¶ms.ioctl_mtx); + /* + * Check the state here, and don't sleep if the state has + * already changed (i.e. wakeup has already occured, but we + * weren't waiting yet). + */ + if (params.state == last_state) { + /* XXX KDM cv_wait_sig instead? */ + cv_wait(¶ms.sem, ¶ms.ioctl_mtx); + } + last_state = params.state; + + switch (params.state) { + case CTL_IOCTL_INPROG: + /* Why did we wake up? */ + /* XXX KDM error here? */ + mtx_unlock(¶ms.ioctl_mtx); + break; + case CTL_IOCTL_DATAMOVE: + CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n")); + + /* + * change last_state back to INPROG to avoid + * deadlock on subsequent data moves. 
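
cfi_submit_wait() parks the submitting thread on a condition variable and reacts to states posted by cfi_datamove() and cfi_done(); checking the state against last_state before sleeping makes a wakeup that lands before the sleep harmless, and resetting both to INPROG after servicing a datamove is what keeps a second datamove from deadlocking. The handshake in miniature, with pthreads standing in for mtx(9)/cv(9):

#include <pthread.h>

enum state { INPROG, DATAMOVE, DONE };

struct handshake {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	enum state	state;
};

/* Completion side: post a state and wake the waiter. */
static void
post(struct handshake *h, enum state s)
{
	pthread_mutex_lock(&h->mtx);
	h->state = s;
	pthread_cond_broadcast(&h->cv);
	pthread_mutex_unlock(&h->mtx);
}

/* Submitting side: wait for DONE, servicing DATAMOVE requests. */
static void
wait_done(struct handshake *h)
{
	enum state last = INPROG, cur;

	do {
		pthread_mutex_lock(&h->mtx);
		/* Don't sleep if a wakeup already changed the state. */
		if (h->state == last)
			pthread_cond_wait(&h->cv, &h->mtx);
		cur = h->state;
		if (cur == DATAMOVE)
			/* Reset so the next datamove reads as a change. */
			h->state = last = INPROG;
		else
			last = cur;
		pthread_mutex_unlock(&h->mtx);

		if (cur == DATAMOVE) {
			/* ... copy the data, then signal completion ... */
		}
	} while (cur != DONE);
}

In these terms, cfi_done() corresponds to post(h, DONE) and cfi_datamove() to post(h, DATAMOVE).
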
+ +static void +cfi_datamove(union ctl_io *io) +{ + struct ctl_fe_ioctl_params *params; + + params = (struct ctl_fe_ioctl_params *) + io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; + + mtx_lock(&params->ioctl_mtx); + params->state = CTL_IOCTL_DATAMOVE; + cv_broadcast(&params->sem); + mtx_unlock(&params->ioctl_mtx); +} + +static void +cfi_done(union ctl_io *io) +{ + struct ctl_fe_ioctl_params *params; + + params = (struct ctl_fe_ioctl_params *) + io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; + + mtx_lock(&params->ioctl_mtx); + params->state = CTL_IOCTL_DONE; + cv_broadcast(&params->sem); + mtx_unlock(&params->ioctl_mtx); +} + +static int +cfi_submit_wait(union ctl_io *io) +{ + struct ctl_fe_ioctl_params params; + ctl_fe_ioctl_state last_state; + int done, retval; + + retval = 0; + + bzero(&params, sizeof(params)); + + mtx_init(&params.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF); + cv_init(&params.sem, "ctlioccv"); + params.state = CTL_IOCTL_INPROG; + last_state = params.state; + + io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = &params; + + CTL_DEBUG_PRINT(("cfi_submit_wait\n")); + + /* This shouldn't happen */ + if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE) + return (retval); + + done = 0; + + do { + mtx_lock(&params.ioctl_mtx); + /* + * Check the state here, and don't sleep if the state has + * already changed (i.e. wakeup has already occurred, but we + * weren't waiting yet). + */ + if (params.state == last_state) { + /* XXX KDM cv_wait_sig instead? */ + cv_wait(&params.sem, &params.ioctl_mtx); + } + last_state = params.state; + + switch (params.state) { + case CTL_IOCTL_INPROG: + /* Why did we wake up? */ + /* XXX KDM error here? */ + mtx_unlock(&params.ioctl_mtx); + break; + case CTL_IOCTL_DATAMOVE: + CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n")); + + /* + * Change last_state back to INPROG to avoid + * deadlock on subsequent data moves. + */ + params.state = last_state = CTL_IOCTL_INPROG; + + mtx_unlock(&params.ioctl_mtx); + ctl_ioctl_do_datamove(&io->scsiio); + /* + * Note that in some cases, most notably writes, + * this will queue the I/O and call us back later. + * In other cases, generally reads, this routine + * will immediately call back and wake us up, + * probably using our own context. + */ + io->scsiio.be_move_done(io); + break; + case CTL_IOCTL_DONE: + mtx_unlock(&params.ioctl_mtx); + CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n")); + done = 1; + break; + default: + mtx_unlock(&params.ioctl_mtx); + /* XXX KDM error here? */ + break; + } + } while (done == 0); + + mtx_destroy(&params.ioctl_mtx); + cv_destroy(&params.sem); + + return (CTL_RETVAL_COMPLETE); +}
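Aside: cfi_submit_wait() above is a small condition-variable state machine. The ioctl thread parks on params.sem while cfi_datamove() and cfi_done(), invoked from CTL's completion path, flip params.state and broadcast. The "params.state == last_state" test before cv_wait() is what prevents a lost wakeup when the broadcast lands between ctl_queue() returning and the first mtx_lock(); a spurious wakeup simply surfaces as an unchanged state and the outer loop retries, which is what the CTL_IOCTL_INPROG case handles. A stripped-down userland sketch of that guard, under hypothetical names (pthreads is used here only to keep the sketch self-contained):

    #include <pthread.h>

    enum step { INPROG, DATAMOVE, DONE };

    struct waiter {
            pthread_mutex_t mtx;
            pthread_cond_t  cv;
            enum step       state;
    };

    /* Consumer side: sleep only if nothing changed since we last looked. */
    static enum step
    wait_for_change(struct waiter *w, enum step last_state)
    {
            enum step cur;

            pthread_mutex_lock(&w->mtx);
            if (w->state == last_state)     /* wakeup may have already happened */
                    pthread_cond_wait(&w->cv, &w->mtx);
            cur = w->state;                 /* spurious wakeup: caller re-loops */
            pthread_mutex_unlock(&w->mtx);
            return (cur);
    }

    /* Producer side: mirror of cfi_datamove()/cfi_done(). */
    static void
    post_state(struct waiter *w, enum step s)
    {
            pthread_mutex_lock(&w->mtx);
            w->state = s;
            pthread_cond_broadcast(&w->cv);
            pthread_mutex_unlock(&w->mtx);
    }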
+ +int +ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag, + struct thread *td) +{ + union ctl_io *io; + void *pool_tmp; + int retval = 0; + + /* + * If we haven't been "enabled", don't allow any SCSI I/O + * to this FETD. + */ + if ((cfi_softc.port.status & CTL_PORT_STATUS_ONLINE) == 0) + return (EPERM); + + io = ctl_alloc_io(cfi_softc.port.ctl_pool_ref); + + /* + * Need to save the pool reference so it doesn't get + * spammed by the user's ctl_io. + */ + pool_tmp = io->io_hdr.pool; + memcpy(io, (void *)addr, sizeof(*io)); + io->io_hdr.pool = pool_tmp; + + /* + * No status yet, so make sure the status is set properly. + */ + io->io_hdr.status = CTL_STATUS_NONE; + + /* + * The user sets the initiator ID, target and LUN IDs. + */ + io->io_hdr.nexus.targ_port = cfi_softc.port.targ_port; + io->io_hdr.flags |= CTL_FLAG_USER_REQ; + if ((io->io_hdr.io_type == CTL_IO_SCSI) && + (io->scsiio.tag_type != CTL_TAG_UNTAGGED)) + io->scsiio.tag_num = cfi_softc.cur_tag_num++; + + retval = cfi_submit_wait(io); + if (retval == 0) + memcpy((void *)addr, io, sizeof(*io)); + ctl_free_io(io); + return (retval); +} diff --git a/sys/cam/ctl/ctl_frontend_iscsi.c b/sys/cam/ctl/ctl_frontend_iscsi.c index 652c961..7f8f8a8 100644 --- a/sys/cam/ctl/ctl_frontend_iscsi.c +++ b/sys/cam/ctl/ctl_frontend_iscsi.c @@ -61,7 +61,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_backend.h> #include <cam/ctl/ctl_error.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_debug.h> #include <cam/ctl/ctl_ha.h> #include <cam/ctl/ctl_ioctl.h> diff --git a/sys/cam/ctl/ctl_ioctl.h b/sys/cam/ctl/ctl_ioctl.h index c7a3c29..f62bbe1 100644 --- a/sys/cam/ctl/ctl_ioctl.h +++ b/sys/cam/ctl/ctl_ioctl.h @@ -92,23 +92,6 @@ struct ctl_ooa_info { ctl_ooa_status status; /* Returned from CTL */ }; -struct ctl_hard_startstop_info { - cfi_mt_status status; - int total_luns; - int luns_complete; - int luns_failed; -}; - -struct ctl_bbrread_info { - int lun_num; /* Passed in to CTL */ - uint64_t lba; /* Passed in to CTL */ - int len; /* Passed in to CTL */ - cfi_mt_status status; /* Returned from CTL */ - cfi_bbrread_status bbr_status; /* Returned from CTL */ - uint8_t scsi_status; /* Returned from CTL */ - struct scsi_sense_data sense_data; /* Returned from CTL */ -}; - typedef enum { CTL_DELAY_TYPE_NONE, CTL_DELAY_TYPE_CONT, @@ -828,10 +811,6 @@ struct ctl_lun_map { #define CTL_DISABLE_PORT _IOW(CTL_MINOR, 0x05, struct ctl_port_entry) #define CTL_DUMP_OOA _IO(CTL_MINOR, 0x06) #define CTL_CHECK_OOA _IOWR(CTL_MINOR, 0x07, struct ctl_ooa_info) -#define CTL_HARD_STOP _IOR(CTL_MINOR, 0x08, \ struct ctl_hard_startstop_info) -#define CTL_HARD_START _IOR(CTL_MINOR, 0x09, \ struct ctl_hard_startstop_info) #define CTL_DELAY_IO _IOWR(CTL_MINOR, 0x10, struct ctl_io_delay_info) #define CTL_REALSYNC_GET _IOR(CTL_MINOR, 0x11, int) #define CTL_REALSYNC_SET _IOW(CTL_MINOR, 0x12, int) @@ -839,7 +818,6 @@ struct ctl_lun_map { #define CTL_GETSYNC _IOWR(CTL_MINOR, 0x14, struct ctl_sync_info) #define CTL_GETSTATS _IOWR(CTL_MINOR, 0x15, struct ctl_stats) #define CTL_ERROR_INJECT _IOWR(CTL_MINOR, 0x16, struct ctl_error_desc) -#define CTL_BBRREAD _IOWR(CTL_MINOR, 0x17, struct ctl_bbrread_info) #define CTL_GET_OOA _IOWR(CTL_MINOR, 0x18, struct ctl_ooa) #define CTL_DUMP_STRUCTS _IO(CTL_MINOR, 0x19) #define CTL_GET_PORT_LIST _IOWR(CTL_MINOR, 0x20, struct ctl_port_list) diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h index a038552..6f7379a 100644 --- a/sys/cam/ctl/ctl_private.h +++ b/sys/cam/ctl/ctl_private.h @@ -47,18 +47,6 @@ #define CTL_PROCESSOR_PRODUCT "CTLPROCESSOR " #define CTL_UNKNOWN_PRODUCT "CTLDEVICE " -struct ctl_fe_ioctl_startstop_info { - struct cv sem; - struct ctl_hard_startstop_info hs_info; -}; - -struct ctl_fe_ioctl_bbrread_info { - struct cv sem; - struct ctl_bbrread_info *bbr_info; - int wakeup_done; - struct mtx *lock; -}; - typedef enum { CTL_IOCTL_INPROG, CTL_IOCTL_DATAMOVE, @@ -81,18 +69,6 @@ struct ctl_io_pool { }; typedef enum { - CTL_IOCTL_FLAG_NONE = 0x00, - CTL_IOCTL_FLAG_ENABLED = 0x01 -} ctl_ioctl_flags; - -struct ctl_ioctl_info { - ctl_ioctl_flags flags; - uint32_t cur_tag_num; - struct ctl_port port; - char port_name[24]; -}; - -typedef enum { CTL_SER_BLOCK, CTL_SER_BLOCKOPT, CTL_SER_EXTENT, @@ -472,7 +448,6 @@ struct ctl_softc { int inquiry_pq_no_lun; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; - struct ctl_ioctl_info ioctl_info; void *othersc_pool; struct proc *ctl_proc; int targ_online; diff --git a/sys/cam/ctl/ctl_tpc.c b/sys/cam/ctl/ctl_tpc.c index 662ee3d..b1b674f 100644 --- a/sys/cam/ctl/ctl_tpc.c +++ b/sys/cam/ctl/ctl_tpc.c @@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_util.h> #include <cam/ctl/ctl_backend.h> #include <cam/ctl/ctl_ioctl.h> diff --git a/sys/cam/ctl/ctl_tpc_local.c b/sys/cam/ctl/ctl_tpc_local.c index d0319ee..fb1f2ac 100644 --- a/sys/cam/ctl/ctl_tpc_local.c +++ b/sys/cam/ctl/ctl_tpc_local.c @@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$"); #include <cam/ctl/ctl_io.h> #include <cam/ctl/ctl.h> #include <cam/ctl/ctl_frontend.h> -#include <cam/ctl/ctl_frontend_internal.h> #include <cam/ctl/ctl_util.h> #include <cam/ctl/ctl_backend.h> #include <cam/ctl/ctl_ioctl.h> diff --git a/sys/cddl/compat/opensolaris/sys/nvpair.h b/sys/cddl/compat/opensolaris/sys/nvpair.h index c90ab70..33b62cb 100644 --- a/sys/cddl/compat/opensolaris/sys/nvpair.h +++ b/sys/cddl/compat/opensolaris/sys/nvpair.h @@ -42,29 +42,19 @@ */ #define nvlist_add_binary illumos_nvlist_add_binary #define nvlist_add_bool illumos_nvlist_add_bool +#define nvlist_add_bool_array illumos_nvlist_add_bool_array #define nvlist_add_descriptor illumos_nvlist_add_descriptor +#define nvlist_add_descriptor_array illumos_nvlist_add_descriptor_array #define nvlist_add_null illumos_nvlist_add_null #define nvlist_add_number illumos_nvlist_add_number +#define nvlist_add_number_array illumos_nvlist_add_number_array #define nvlist_add_nvlist illumos_nvlist_add_nvlist +#define nvlist_add_nvlist_array illumos_nvlist_add_nvlist_array #define nvlist_add_nvpair illumos_nvlist_add_nvpair #define nvlist_add_string illumos_nvlist_add_string +#define nvlist_add_string_array illumos_nvlist_add_string_array #define
nvlist_add_stringf illumos_nvlist_add_stringf #define nvlist_add_stringv illumos_nvlist_add_stringv -#define nvlist_addf_binary illumos_nvlist_addf_binary -#define nvlist_addf_bool illumos_nvlist_addf_bool -#define nvlist_addf_descriptor illumos_nvlist_addf_descriptor -#define nvlist_addf_null illumos_nvlist_addf_null -#define nvlist_addf_number illumos_nvlist_addf_number -#define nvlist_addf_nvlist illumos_nvlist_addf_nvlist -#define nvlist_addf_string illumos_nvlist_addf_string -#define nvlist_addv_binary illumos_nvlist_addv_binary -#define nvlist_addv_bool illumos_nvlist_addv_bool -#define nvlist_addv_descriptor illumos_nvlist_addv_descriptor -#define nvlist_addv_null illumos_nvlist_addv_null -#define nvlist_addv_number illumos_nvlist_addv_number -#define nvlist_addv_nvlist illumos_nvlist_addv_nvlist -#define nvlist_addv_string illumos_nvlist_addv_string -#define nvlist_check_header illumos_nvlist_check_header #define nvlist_clone illumos_nvlist_clone #define nvlist_create illumos_nvlist_create #define nvlist_descriptors illumos_nvlist_descriptors @@ -75,92 +65,61 @@ #define nvlist_exists illumos_nvlist_exists #define nvlist_exists_binary illumos_nvlist_exists_binary #define nvlist_exists_bool illumos_nvlist_exists_bool +#define nvlist_exists_bool_array illumos_nvlist_exists_bool_array #define nvlist_exists_descriptor illumos_nvlist_exists_descriptor +#define nvlist_exists_descriptor_array illumos_nvlist_exists_descriptor_array #define nvlist_exists_null illumos_nvlist_exists_null #define nvlist_exists_number illumos_nvlist_exists_number +#define nvlist_exists_number_array illumos_nvlist_exists_number_array #define nvlist_exists_nvlist illumos_nvlist_exists_nvlist +#define nvlist_exists_nvlist_array illumos_nvlist_exists_nvlist_array #define nvlist_exists_string illumos_nvlist_exists_string +#define nvlist_exists_string_array illumos_nvlist_exists_string_array #define nvlist_exists_type illumos_nvlist_exists_type -#define nvlist_existsf illumos_nvlist_existsf -#define nvlist_existsf_binary illumos_nvlist_existsf_binary -#define nvlist_existsf_bool illumos_nvlist_existsf_bool -#define nvlist_existsf_descriptor illumos_nvlist_existsf_descriptor -#define nvlist_existsf_null illumos_nvlist_existsf_null -#define nvlist_existsf_number illumos_nvlist_existsf_number -#define nvlist_existsf_nvlist illumos_nvlist_existsf_nvlist -#define nvlist_existsf_string illumos_nvlist_existsf_string -#define nvlist_existsf_type illumos_nvlist_existsf_type -#define nvlist_existsv illumos_nvlist_existsv -#define nvlist_existsv_binary illumos_nvlist_existsv_binary -#define nvlist_existsv_bool illumos_nvlist_existsv_bool -#define nvlist_existsv_descriptor illumos_nvlist_existsv_descriptor -#define nvlist_existsv_null illumos_nvlist_existsv_null -#define nvlist_existsv_number illumos_nvlist_existsv_number -#define nvlist_existsv_nvlist illumos_nvlist_existsv_nvlist -#define nvlist_existsv_string illumos_nvlist_existsv_string -#define nvlist_existsv_type illumos_nvlist_existsv_type #define nvlist_fdump illumos_nvlist_fdump #define nvlist_first_nvpair illumos_nvlist_first_nvpair +#define nvlist_flags illumos_nvlist_flags #define nvlist_free illumos_nvlist_free #define nvlist_free_binary illumos_nvlist_free_binary +#define nvlist_free_binary_array illumos_nvlist_free_binary_array #define nvlist_free_bool illumos_nvlist_free_bool +#define nvlist_free_bool_array illumos_nvlist_free_bool_array #define nvlist_free_descriptor illumos_nvlist_free_descriptor +#define nvlist_free_descriptor_array 
illumos_nvlist_free_descriptor_array #define nvlist_free_null illumos_nvlist_free_null #define nvlist_free_number illumos_nvlist_free_number +#define nvlist_free_number_array illumos_nvlist_free_number_array #define nvlist_free_nvlist illumos_nvlist_free_nvlist +#define nvlist_free_nvlist_array illumos_nvlist_free_nvlist_array #define nvlist_free_nvpair illumos_nvlist_free_nvpair #define nvlist_free_string illumos_nvlist_free_string +#define nvlist_free_string_array illumos_nvlist_free_string_array #define nvlist_free_type illumos_nvlist_free_type -#define nvlist_freef illumos_nvlist_freef -#define nvlist_freef_binary illumos_nvlist_freef_binary -#define nvlist_freef_bool illumos_nvlist_freef_bool -#define nvlist_freef_descriptor illumos_nvlist_freef_descriptor -#define nvlist_freef_null illumos_nvlist_freef_null -#define nvlist_freef_number illumos_nvlist_freef_number -#define nvlist_freef_nvlist illumos_nvlist_freef_nvlist -#define nvlist_freef_string illumos_nvlist_freef_string -#define nvlist_freef_type illumos_nvlist_freef_type -#define nvlist_freev illumos_nvlist_freev -#define nvlist_freev_binary illumos_nvlist_freev_binary -#define nvlist_freev_bool illumos_nvlist_freev_bool -#define nvlist_freev_descriptor illumos_nvlist_freev_descriptor -#define nvlist_freev_null illumos_nvlist_freev_null -#define nvlist_freev_number illumos_nvlist_freev_number -#define nvlist_freev_nvlist illumos_nvlist_freev_nvlist -#define nvlist_freev_string illumos_nvlist_freev_string -#define nvlist_freev_type illumos_nvlist_freev_type +#define nvlist_get_array_next illumos_nvlist_get_array_next #define nvlist_get_binary illumos_nvlist_get_binary #define nvlist_get_bool illumos_nvlist_get_bool +#define nvlist_get_bool_array illumos_nvlist_get_bool_array #define nvlist_get_descriptor illumos_nvlist_get_descriptor +#define nvlist_get_descriptor_array illumos_nvlist_get_descriptor_array #define nvlist_get_number illumos_nvlist_get_number +#define nvlist_get_number_array illumos_nvlist_get_number_array #define nvlist_get_nvlist illumos_nvlist_get_nvlist #define nvlist_get_nvpair illumos_nvlist_get_nvpair +#define nvlist_get_nvpair_parent illumos_nvlist_get_nvpair_parent +#define nvlist_get_pararr illumos_nvlist_get_pararr +#define nvlist_get_parent illumos_nvlist_get_parent #define nvlist_get_string illumos_nvlist_get_string -#define nvlist_getf_binary illumos_nvlist_getf_binary -#define nvlist_getf_bool illumos_nvlist_getf_bool -#define nvlist_getf_descriptor illumos_nvlist_getf_descriptor -#define nvlist_getf_number illumos_nvlist_getf_number -#define nvlist_getf_nvlist illumos_nvlist_getf_nvlist -#define nvlist_getf_string illumos_nvlist_getf_string -#define nvlist_getv_binary illumos_nvlist_getv_binary -#define nvlist_getv_bool illumos_nvlist_getv_bool -#define nvlist_getv_descriptor illumos_nvlist_getv_descriptor -#define nvlist_getv_number illumos_nvlist_getv_number -#define nvlist_getv_nvlist illumos_nvlist_getv_nvlist -#define nvlist_getv_string illumos_nvlist_getv_string +#define nvlist_in_array illumos_nvlist_in_array #define nvlist_move_binary illumos_nvlist_move_binary +#define nvlist_move_bool_array illumos_nvlist_move_bool_array #define nvlist_move_descriptor illumos_nvlist_move_descriptor +#define nvlist_move_descriptor_array illumos_nvlist_move_descriptor_array +#define nvlist_move_number_array illumos_nvlist_move_number_array #define nvlist_move_nvlist illumos_nvlist_move_nvlist +#define nvlist_move_nvlist_array illumos_nvlist_move_nvlist_array #define nvlist_move_nvpair 
illumos_nvlist_move_nvpair #define nvlist_move_string illumos_nvlist_move_string -#define nvlist_movef_binary illumos_nvlist_movef_binary -#define nvlist_movef_descriptor illumos_nvlist_movef_descriptor -#define nvlist_movef_nvlist illumos_nvlist_movef_nvlist -#define nvlist_movef_string illumos_nvlist_movef_string -#define nvlist_movev_binary illumos_nvlist_movev_binary -#define nvlist_movev_descriptor illumos_nvlist_movev_descriptor -#define nvlist_movev_nvlist illumos_nvlist_movev_nvlist -#define nvlist_movev_string illumos_nvlist_movev_string +#define nvlist_move_string_array illumos_nvlist_move_string_array #define nvlist_ndescriptors illumos_nvlist_ndescriptors #define nvlist_next illumos_nvlist_next #define nvlist_next_nvpair illumos_nvlist_next_nvpair @@ -168,93 +127,101 @@ #define nvlist_prev_nvpair illumos_nvlist_prev_nvpair #define nvlist_recv illumos_nvlist_recv #define nvlist_remove_nvpair illumos_nvlist_remove_nvpair -#define nvlist_report_missing illumos_nvlist_report_missing #define nvlist_send illumos_nvlist_send +#define nvlist_set_array_next illumos_nvlist_set_array_next #define nvlist_set_error illumos_nvlist_set_error +#define nvlist_set_flags illumos_nvlist_set_flags +#define nvlist_set_parent illumos_nvlist_set_parent #define nvlist_size illumos_nvlist_size #define nvlist_take_binary illumos_nvlist_take_binary #define nvlist_take_bool illumos_nvlist_take_bool +#define nvlist_take_bool_array illumos_nvlist_take_bool_array #define nvlist_take_descriptor illumos_nvlist_take_descriptor +#define nvlist_take_descriptor_array illumos_nvlist_take_descriptor_array #define nvlist_take_number illumos_nvlist_take_number +#define nvlist_take_number_array illumos_nvlist_take_number_array #define nvlist_take_nvlist illumos_nvlist_take_nvlist +#define nvlist_take_nvlist_array illumos_nvlist_take_nvlist_array #define nvlist_take_nvpair illumos_nvlist_take_nvpair #define nvlist_take_string illumos_nvlist_take_string -#define nvlist_takef_binary illumos_nvlist_takef_binary -#define nvlist_takef_bool illumos_nvlist_takef_bool -#define nvlist_takef_descriptor illumos_nvlist_takef_descriptor -#define nvlist_takef_number illumos_nvlist_takef_number -#define nvlist_takef_nvlist illumos_nvlist_takef_nvlist -#define nvlist_takef_string illumos_nvlist_takef_string -#define nvlist_takev_binary illumos_nvlist_takev_binary -#define nvlist_takev_bool illumos_nvlist_takev_bool -#define nvlist_takev_descriptor illumos_nvlist_takev_descriptor -#define nvlist_takev_number illumos_nvlist_takev_number -#define nvlist_takev_nvlist illumos_nvlist_takev_nvlist -#define nvlist_takev_string illumos_nvlist_takev_string +#define nvlist_take_string_array illumos_nvlist_take_string_array #define nvlist_unpack illumos_nvlist_unpack +#define nvlist_unpack_header illumos_nvlist_unpack_header #define nvlist_xfer illumos_nvlist_xfer -#define nvlist_xpack illumos_nvlist_xpack -#define nvlist_xunpack illumos_nvlist_xunpack -#define nvpair_allocv illumos_nvpair_allocv #define nvpair_assert illumos_nvpair_assert #define nvpair_clone illumos_nvpair_clone #define nvpair_create_binary illumos_nvpair_create_binary #define nvpair_create_bool illumos_nvpair_create_bool +#define nvpair_create_bool_array illumos_nvpair_create_bool_array #define nvpair_create_descriptor illumos_nvpair_create_descriptor +#define nvpair_create_descriptor_array illumos_nvpair_create_descriptor_array #define nvpair_create_null illumos_nvpair_create_null #define nvpair_create_number illumos_nvpair_create_number +#define nvpair_create_number_array 
illumos_nvpair_create_number_array #define nvpair_create_nvlist illumos_nvpair_create_nvlist +#define nvpair_create_nvlist_array illumos_nvpair_create_nvlist_array #define nvpair_create_string illumos_nvpair_create_string +#define nvpair_create_string_array illumos_nvpair_create_string_array #define nvpair_create_stringf illumos_nvpair_create_stringf #define nvpair_create_stringv illumos_nvpair_create_stringv -#define nvpair_createf_binary illumos_nvpair_createf_binary -#define nvpair_createf_bool illumos_nvpair_createf_bool -#define nvpair_createf_descriptor illumos_nvpair_createf_descriptor -#define nvpair_createf_null illumos_nvpair_createf_null -#define nvpair_createf_number illumos_nvpair_createf_number -#define nvpair_createf_nvlist illumos_nvpair_createf_nvlist -#define nvpair_createf_string illumos_nvpair_createf_string -#define nvpair_createv_binary illumos_nvpair_createv_binary -#define nvpair_createv_bool illumos_nvpair_createv_bool -#define nvpair_createv_descriptor illumos_nvpair_createv_descriptor -#define nvpair_createv_null illumos_nvpair_createv_null -#define nvpair_createv_number illumos_nvpair_createv_number -#define nvpair_createv_nvlist illumos_nvpair_createv_nvlist -#define nvpair_createv_string illumos_nvpair_createv_string #define nvpair_free illumos_nvpair_free #define nvpair_free_structure illumos_nvpair_free_structure #define nvpair_get_binary illumos_nvpair_get_binary #define nvpair_get_bool illumos_nvpair_get_bool +#define nvpair_get_bool_array illumos_nvpair_get_bool_array #define nvpair_get_descriptor illumos_nvpair_get_descriptor +#define nvpair_get_descriptor_array illumos_nvpair_get_descriptor_array #define nvpair_get_number illumos_nvpair_get_number +#define nvpair_get_number_array illumos_nvpair_get_number_array #define nvpair_get_nvlist illumos_nvpair_get_nvlist #define nvpair_get_string illumos_nvpair_get_string #define nvpair_header_size illumos_nvpair_header_size +#define nvpair_init_datasize illumos_nvpair_init_datasize #define nvpair_insert illumos_nvpair_insert #define nvpair_move_binary illumos_nvpair_move_binary +#define nvpair_move_bool_array illumos_nvpair_move_bool_array #define nvpair_move_descriptor illumos_nvpair_move_descriptor +#define nvpair_move_descriptor_array illumos_nvpair_move_descriptor_array +#define nvpair_move_number_array illumos_nvpair_move_number_array #define nvpair_move_nvlist illumos_nvpair_move_nvlist +#define nvpair_move_nvlist_array illumos_nvpair_move_nvlist_array #define nvpair_move_string illumos_nvpair_move_string -#define nvpair_movef_binary illumos_nvpair_movef_binary -#define nvpair_movef_descriptor illumos_nvpair_movef_descriptor -#define nvpair_movef_nvlist illumos_nvpair_movef_nvlist -#define nvpair_movef_string illumos_nvpair_movef_string -#define nvpair_movev_binary illumos_nvpair_movev_binary -#define nvpair_movev_descriptor illumos_nvpair_movev_descriptor -#define nvpair_movev_nvlist illumos_nvpair_movev_nvlist -#define nvpair_movev_string illumos_nvpair_movev_string +#define nvpair_move_string_array illumos_nvpair_move_string_array #define nvpair_name illumos_nvpair_name #define nvpair_next illumos_nvpair_next #define nvpair_nvlist illumos_nvpair_nvlist -#define nvpair_pack illumos_nvpair_pack +#define nvpair_pack_binary illumos_nvpair_pack_binary +#define nvpair_pack_bool illumos_nvpair_pack_bool +#define nvpair_pack_bool_array illumos_nvpair_pack_bool_array #define nvpair_pack_descriptor illumos_nvpair_pack_descriptor +#define nvpair_pack_descriptor_array illumos_nvpair_pack_descriptor_array +#define 
nvpair_pack_header illumos_nvpair_pack_header +#define nvpair_pack_null illumos_nvpair_pack_null +#define nvpair_pack_number illumos_nvpair_pack_number +#define nvpair_pack_number_array illumos_nvpair_pack_number_array +#define nvpair_pack_nvlist_array_next illumos_nvpair_pack_nvlist_array_next +#define nvpair_pack_nvlist_up illumos_nvpair_pack_nvlist_up +#define nvpair_pack_string illumos_nvpair_pack_string +#define nvpair_pack_string_array illumos_nvpair_pack_string_array #define nvpair_prev illumos_nvpair_prev #define nvpair_remove illumos_nvpair_remove #define nvpair_size illumos_nvpair_size #define nvpair_type illumos_nvpair_type #define nvpair_type_string illumos_nvpair_type_string #define nvpair_unpack illumos_nvpair_unpack +#define nvpair_unpack_binary illumos_nvpair_unpack_binary +#define nvpair_unpack_bool illumos_nvpair_unpack_bool +#define nvpair_unpack_bool_array illumos_nvpair_unpack_bool_array #define nvpair_unpack_descriptor illumos_nvpair_unpack_descriptor +#define nvpair_unpack_descriptor_array illumos_nvpair_unpack_descriptor_array +#define nvpair_unpack_header illumos_nvpair_unpack_header +#define nvpair_unpack_null illumos_nvpair_unpack_null +#define nvpair_unpack_number illumos_nvpair_unpack_number +#define nvpair_unpack_number_array illumos_nvpair_unpack_number_array +#define nvpair_unpack_nvlist illumos_nvpair_unpack_nvlist +#define nvpair_unpack_nvlist_array illumos_nvpair_unpack_nvlist_array +#define nvpair_unpack_string illumos_nvpair_unpack_string +#define nvpair_unpack_string_array illumos_nvpair_unpack_string_array #endif /* _KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c index 52a355d..d59fbf0 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. 
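Aside: the sys/cddl/compat/opensolaris/sys/nvpair.h churn that ends above is pure symbol namespacing. The nvlist_*/nvpair_* names are used both by FreeBSD's native nv(9) library and by the Solaris-derived nvpair code that ZFS carries, so this compat header maps one set of bare names to illumos_-prefixed aliases at preprocessing time; the rename table has to track the API one-for-one, which is why new entry points (the *_array getters/setters and the pack/unpack helpers) are added here while entries for removed functions (the addf/addv/existsf/freef variants) are dropped. A toy illustration of the same technique, with hypothetical vendor_* names:

    /* vendor_shim.h -- rename our symbols out of the way of a bundled library. */
    #ifndef VENDOR_SHIM_H
    #define VENDOR_SHIM_H

    /*
     * Both "our" library and a bundled third-party one define list_create().
     * Prefixing ours at compile time lets both link into one kernel; callers
     * keep writing list_create() and the preprocessor selects the renamed
     * symbol.  Every public entry point must appear in this table, or the
     * unprefixed name leaks out and collides at link time.
     */
    #define list_create     vendor_list_create
    #define list_destroy    vendor_list_destroy

    struct vlist;
    struct vlist *list_create(void);        /* really vendor_list_create() */
    void list_destroy(struct vlist *);      /* really vendor_list_destroy() */

    #endif /* VENDOR_SHIM_H */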
@@ -129,15 +129,15 @@ zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, - const char *desc, boolean_t readonly, boolean_t mos, - boolean_t activate_on_enable, const spa_feature_t *deps) + const char *desc, zfeature_flags_t flags, const spa_feature_t *deps) { zfeature_info_t *feature = &spa_feature_table[fid]; static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; ASSERT(name != NULL); ASSERT(desc != NULL); - ASSERT(!readonly || !mos); + ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || + (flags & ZFEATURE_FLAG_MOS) == 0); ASSERT3U(fid, <, SPA_FEATURES); ASSERT(zfeature_is_valid_guid(guid)); @@ -148,9 +148,7 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name, feature->fi_guid = guid; feature->fi_uname = name; feature->fi_desc = desc; - feature->fi_can_readonly = readonly; - feature->fi_mos = mos; - feature->fi_activate_on_enable = activate_on_enable; + feature->fi_flags = flags; feature->fi_depends = deps; } @@ -159,45 +157,46 @@ zpool_feature_init(void) { zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", - "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, - B_FALSE, NULL); + "Destroy filesystems asynchronously.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", - "Snapshots use less space.", B_TRUE, B_FALSE, - B_FALSE, NULL); + "Snapshots use less space.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", - "LZ4 compression algorithm support.", B_FALSE, B_FALSE, - B_TRUE, NULL); + "LZ4 compression algorithm support.", + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL); zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", - "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, - B_FALSE, NULL); + "Crash dumps to multiple vdev pools.", + 0, NULL); zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", - "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, - B_FALSE, NULL); + "Spacemaps maintain space histograms.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); zfeature_register(SPA_FEATURE_ENABLED_TXG, "com.delphix:enabled_txg", "enabled_txg", - "Record txg at which a feature is enabled", B_TRUE, B_FALSE, - B_FALSE, NULL); + "Record txg at which a feature is enabled", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_HOLE_BIRTH, "com.delphix:hole_birth", "hole_birth", "Retain hole birth txg for more precise zfs send", - B_FALSE, B_TRUE, B_TRUE, hole_birth_deps); + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + hole_birth_deps); zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", - B_FALSE, B_FALSE, B_FALSE, NULL); + 0, NULL); static const spa_feature_t bookmarks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -206,7 +205,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_BOOKMARKS, "com.delphix:bookmarks", "bookmarks", "\"zfs bookmark\" command", - B_TRUE, B_FALSE, B_FALSE, bookmarks_deps); + ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps); static const spa_feature_t filesystem_limits_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -214,13 
+213,14 @@ zpool_feature_init(void) }; zfeature_register(SPA_FEATURE_FS_SS_LIMIT, "com.joyent:filesystem_limits", "filesystem_limits", - "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE, - filesystem_limits_deps); + "Filesystem and snapshot limits.", + ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps); zfeature_register(SPA_FEATURE_EMBEDDED_DATA, "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", - B_FALSE, B_TRUE, B_TRUE, NULL); + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + NULL); static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, @@ -228,6 +228,6 @@ zpool_feature_init(void) }; zfeature_register(SPA_FEATURE_LARGE_BLOCKS, "org.open-zfs:large_blocks", "large_blocks", - "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE, - large_blocks_deps); + "Support for blocks larger than 128KB.", + ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h index 4ffe435..0e88a9a 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -56,15 +56,23 @@ typedef enum spa_feature { #define SPA_FEATURE_DISABLED (-1ULL) +typedef enum zfeature_flags { + /* Can open pool readonly even if this feature is not supported. */ + ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), + /* Is this feature necessary to read the MOS? */ + ZFEATURE_FLAG_MOS = (1 << 1), + /* Activate this feature at the same time it is enabled. */ + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), + /* Each dataset has a field set if it has ever used this feature. */ + ZFEATURE_FLAG_PER_DATASET = (1 << 3) +} zfeature_flags_t; + typedef struct zfeature_info { spa_feature_t fi_feature; const char *fi_uname; /* User-facing feature name */ const char *fi_guid; /* On-disk feature identifier */ const char *fi_desc; /* Feature description */ - boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */ - boolean_t fi_mos; /* Is the feature necessary to read the MOS? */ - /* Activate this feature at the same time it is enabled */ - boolean_t fi_activate_on_enable; + zfeature_flags_t fi_flags; /* array of dependencies, terminated by SPA_FEATURE_NONE */ const spa_feature_t *fi_depends; } zfeature_info_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 4c7e225..77c7b1d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -22,7 +22,9 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. -# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2012 Joyent, Inc. All rights reserved. +# Copyright (c) 2011, 2014 by Delphix. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
# # @@ -36,6 +38,7 @@ ZFS_COMMON_OBJS += \ blkptr.o \ bpobj.o \ bptree.o \ + bqueue.o \ dbuf.o \ ddt.o \ ddt_zap.o \ @@ -65,6 +68,7 @@ ZFS_COMMON_OBJS += \ lz4.o \ lzjb.o \ metaslab.o \ + multilist.o \ range_tree.o \ refcount.o \ rrwlock.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 6de36f2..07fcb51 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -21,9 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* @@ -82,9 +82,9 @@ * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * - * Buffers do not have their own mutexs, rather they rely on the - * hash table mutexs for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexs). + * Buffers do not have their own mutexes, rather they rely on the + * hash table mutexes for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns @@ -129,6 +129,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/multilist.h> #ifdef _KERNEL #include <sys/dnlc.h> #endif @@ -149,21 +150,39 @@ int arc_procfd; #endif #endif /* illumos */ -static kmutex_t arc_reclaim_thr_lock; -static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ -static uint8_t arc_thread_exit; +static kmutex_t arc_reclaim_lock; +static kcondvar_t arc_reclaim_thread_cv; +static boolean_t arc_reclaim_thread_exit; +static kcondvar_t arc_reclaim_waiters_cv; + +static kmutex_t arc_user_evicts_lock; +static kcondvar_t arc_user_evicts_cv; +static boolean_t arc_user_evicts_thread_exit; uint_t arc_reduce_dnlc_percent = 3; /* - * The number of iterations through arc_evict_*() before we - * drop & reacquire the lock. + * The number of headers to evict in arc_evict_state_impl() before + * dropping the sublist lock and evicting from another sublist. A lower + * value means we're more likely to evict the "correct" header (i.e. the + * oldest header in the arc state), but comes with higher overhead + * (i.e. more invocations of arc_evict_state_impl()). */ -int arc_evict_iterations = 100; +int zfs_arc_evict_batch_limit = 10; + +/* + * The number of sublists used for each of the arc state lists. If this + * is not set to a suitable value by the user, it will be configured to + * the number of CPUs on the system in arc_init(). + */ +int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; +/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +int zfs_arc_overflow_shift = 8; + /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; @@ -201,6 +220,9 @@ extern int zfs_prefetch_disable; */ static boolean_t arc_warm; +/* + * These tunables are for performance analysis. 
+ */ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; @@ -312,31 +334,22 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, * second level ARC benefit from these fast lookups. */ -#define ARCS_LOCK_PAD CACHE_LINE_SIZE -struct arcs_lock { - kmutex_t arcs_lock; -#ifdef _KERNEL - unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; -#endif -}; - -/* - * must be power of two for mask use to work - * - */ -#define ARC_BUFC_NUMDATALISTS 16 -#define ARC_BUFC_NUMMETADATALISTS 16 -#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) - typedef struct arc_state { - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ - uint64_t arcs_size; /* total amount of data in this state */ - list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ - struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); + /* + * list of evictable buffers + */ + multilist_t arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + */ + refcount_t arcs_size; } arc_state_t; -#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) - /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -362,8 +375,6 @@ typedef struct arc_stats { kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; - kstat_named_t arcstat_stolen; - kstat_named_t arcstat_recycle_miss; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held @@ -377,9 +388,15 @@ typedef struct arc_stats { * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach it's target amount. 
+ */ + kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -530,7 +547,7 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; @@ -584,13 +601,13 @@ static arc_stats_t arc_stats = { { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, - { "stolen", KSTAT_DATA_UINT64 }, - { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -629,7 +646,7 @@ static arc_stats_t arc_stats = { { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, @@ -806,7 +823,7 @@ typedef struct l1arc_buf_hdr { /* protected by arc state mutex */ arc_state_t *b_state; - list_node_t b_arc_node; + multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; @@ -877,7 +894,6 @@ sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) #endif static arc_buf_t *arc_eviction_list; -static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; #define GHOST_STATE(state) \ @@ -1011,21 +1027,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size, 0, "size of anonymous state"); + &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size, 0, "size of mru state"); + &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); + &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru ghost state"); @@ -1034,14 
+1050,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size, 0, "size of mfu state"); + &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); + &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu ghost state"); @@ -1050,7 +1066,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size, 0, "size of mru state"); + &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); /* * L2ARC Internals @@ -1106,8 +1122,7 @@ static uint8_t l2arc_thread_exit; static void arc_get_data_buf(arc_buf_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); -static int arc_evict_needed(arc_buf_contents_t); -static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); +static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); @@ -1288,6 +1303,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -1332,6 +1348,7 @@ hdr_full_dest(void *vbuf, void *unused) cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } @@ -1368,7 +1385,7 @@ hdr_recl(void *unused) * which is after we do arc_fini(). */ if (!arc_dead) - cv_signal(&arc_reclaim_thr_cv); + cv_signal(&arc_reclaim_thread_cv); } static void @@ -1447,18 +1464,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); } else { ASSERT(hdr->b_l1hdr.b_buf == NULL); ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* - * We might be removing the L1hdr of a buffer which was just - * written out to L2ARC. If such a buffer is compressed then we - * need to free its b_tmp_cdata before destroying the header. + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. 
*/ - if (hdr->b_l1hdr.b_tmp_cdata != NULL && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field + * might try to be accessed, even though it was removed. + */ + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; } /* @@ -1681,23 +1711,6 @@ arc_buf_freeze(arc_buf_t *buf) } static void -get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) -{ - uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - - if (arc_buf_type(hdr) == ARC_BUFC_METADATA) - buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); - else { - buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); - buf_hashid += ARC_BUFC_NUMMETADATALISTS; - } - - *list = &state->arcs_lists[buf_hashid]; - *lock = ARCS_LOCK(state, buf_hashid); -} - - -static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); @@ -1708,16 +1721,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { + arc_buf_contents_t type = arc_buf_type(hdr); uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; - list_t *list; - kmutex_t *lock; - - get_buf_info(hdr, state, &list, &lock); - ASSERT(!MUTEX_HELD(lock)); - mutex_enter(lock); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_remove(list, hdr); + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_remove(list, hdr); + if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_datacnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); @@ -1726,7 +1736,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(lock); } /* remove the prefetch flag if we get a reference */ hdr->b_flags &= ~ARC_FLAG_PREFETCH; @@ -1749,25 +1758,21 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; - list_t *list; - kmutex_t *lock; + arc_buf_contents_t type = arc_buf_type(hdr); + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_insert(list, hdr); - get_buf_info(hdr, state, &list, &lock); - ASSERT(!MUTEX_HELD(lock)); - mutex_enter(lock); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_insert_head(list, hdr); ASSERT(hdr->b_l1hdr.b_datacnt > 0); atomic_add_64(size, hdr->b_size * hdr->b_l1hdr.b_datacnt); - mutex_exit(lock); } return (cnt); } /* - * Move the supplied buffer to the indicated state. The mutex + * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. 
*/ static void @@ -1779,8 +1784,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, uint32_t datacnt; uint64_t from_delta, to_delta; arc_buf_contents_t buftype = arc_buf_type(hdr); - list_t *list; - kmutex_t *lock; /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -1813,17 +1816,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - int use_mutex; uint64_t *size = &old_state->arcs_lsize[buftype]; - get_buf_info(hdr, old_state, &list, &lock); - use_mutex = !MUTEX_HELD(lock); - if (use_mutex) - mutex_enter(lock); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_remove(list, hdr); + multilist_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, @@ -1836,12 +1832,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); - - if (use_mutex) - mutex_exit(lock); } if (new_state != arc_anon && new_state != arc_l2c_only) { - int use_mutex; uint64_t *size = &new_state->arcs_lsize[buftype]; /* @@ -1851,23 +1843,15 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - get_buf_info(hdr, new_state, &list, &lock); - use_mutex = !MUTEX_HELD(lock); - if (use_mutex) - mutex_enter(lock); - - list_insert_head(list, hdr); + multilist_insert(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(datacnt == 0); + ASSERT0(datacnt); ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); - - if (use_mutex) - mutex_exit(lock); } } @@ -1876,12 +1860,73 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ - if (to_delta && new_state != arc_l2c_only) - atomic_add_64(&new_state->arcs_size, to_delta); + + if (to_delta && new_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(new_state)) { + ASSERT0(datacnt); + + /* + * We moving a header to a ghost state, we first + * remove all arc buffers. Thus, we'll have a + * datacnt of zero, and no arc buffer to use for + * the reference. As a result, we use the arc + * header pointer for the reference. + */ + (void) refcount_add_many(&new_state->arcs_size, + hdr->b_size, hdr); + } else { + ASSERT3U(datacnt, !=, 0); + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + (void) refcount_add_many(&new_state->arcs_size, + hdr->b_size, buf); + } + } + } + if (from_delta && old_state != arc_l2c_only) { - ASSERT3U(old_state->arcs_size, >=, from_delta); - atomic_add_64(&old_state->arcs_size, -from_delta); + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(old_state)) { + /* + * When moving a header off of a ghost state, + * there's the possibility for datacnt to be + * non-zero. This is because we first add the + * arc buffer to the header prior to changing + * the header's state. Since we used the header + * for the reference when putting the header on + * the ghost state, we must balance that and use + * the header when removing off the ghost state + * (even though datacnt is non zero). 
+ */ + + IMPLY(datacnt == 0, new_state == arc_anon || + new_state == arc_l2c_only); + + (void) refcount_remove_many(&old_state->arcs_size, + hdr->b_size, hdr); + } else { + ASSERT3P(datacnt, !=, 0); + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + (void) refcount_remove_many( + &old_state->arcs_size, hdr->b_size, buf); + } + } } + if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; @@ -1889,10 +1934,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ -#ifdef illumos - ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); -#endif + ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1985,6 +2028,7 @@ arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_datacnt = 1; + hdr->b_l1hdr.b_tmp_cdata = NULL; arc_get_data_buf(buf); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -2120,7 +2164,7 @@ arc_buf_free_on_write(void *data, size_t size, { l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; @@ -2146,10 +2190,6 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) } } -/* - * Free up buf->b_data and if 'remove' is set, then pull the - * arc_buf_t off of the the arc_buf_hdr_t's list and free it. - */ static void arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) { @@ -2164,19 +2204,53 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) if (!HDR_HAS_L1HDR(hdr)) return; - if (hdr->b_l1hdr.b_tmp_cdata == NULL) + /* + * The header isn't being written to the l2arc device, thus it + * shouldn't have a b_tmp_cdata to free. + */ + if (!HDR_L2_WRITING(hdr)) { + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + return; + } + + /* + * The header does not have compression enabled. This can be due + * to the buffer not being compressible, or because we're + * freeing the buffer before the second phase of + * l2arc_write_buffer() has started (which does the compression + * step). In either case, b_tmp_cdata does not point to a + * separately compressed buffer, so there's nothing to free (it + * points to the same buffer as the arc_buf_t's b_data field). + */ + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + hdr->b_l1hdr.b_tmp_cdata = NULL; + return; + } + + /* + * There's nothing to free since the buffer was all zero's and + * compressed to a zero length buffer. + */ + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) { + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); return; + } - ASSERT(HDR_L2_WRITING(hdr)); - arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, - zio_data_buf_free); + ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))); + + arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, + hdr->b_size, zio_data_buf_free); ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); hdr->b_l1hdr.b_tmp_cdata = NULL; } +/* + * Free up buf->b_data and if 'remove' is set, then pull the + * arc_buf_t off of the the arc_buf_hdr_t's list and free it. 
+ */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) +arc_buf_destroy(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; @@ -2191,17 +2265,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_buf_unwatch(buf); #endif - if (!recycle) { - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); - } + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf, zio_buf_free); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf, zio_data_buf_free); + arc_space_return(size, ARC_SPACE_DATA); } - if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; ASSERT(refcount_is_zero( @@ -2211,8 +2285,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); } - ASSERT3U(state->arcs_size, >=, size); - atomic_add_64(&state->arcs_size, -size); + + (void) refcount_remove_many(&state->arcs_size, size, buf); buf->b_data = NULL; /* @@ -2339,6 +2413,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (!BUF_EMPTY(hdr)) buf_discard_identity(hdr); + if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; @@ -2349,20 +2424,19 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (buf->b_efunc != NULL) { - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, - FALSE); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, - TRUE); + arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); } } #ifdef ZFS_DEBUG @@ -2375,7 +2449,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { @@ -2401,7 +2475,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); @@ -2415,16 +2489,16 @@ arc_buf_free(arc_buf_t *buf, void *tag) * this buffer unless the write completes before we finish * decrementing the reference count. 
*/ - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); (void) remove_reference(hdr, NULL, tag); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); if (destroy_hdr) arc_hdr_destroy(hdr); } else { if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); else arc_hdr_destroy(hdr); } @@ -2453,7 +2527,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { if (no_callback) - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else if (no_callback) { ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); @@ -2514,499 +2588,678 @@ arc_buf_eviction_needed(arc_buf_t *buf) } /* - * Evict buffers from list until we've removed the specified number of - * bytes. Move the removed buffers to the appropriate evict state. - * If the recycle flag is set, then attempt to "recycle" a buffer: - * - look for a buffer to evict that is `bytes' long. - * - return the data block from this buffer rather than freeing it. - * This flag is used by callers that are trying to make space for a - * new buffer in a full arc cache. + * Evict the arc_buf_hdr that is provided as a parameter. The resultant + * state of the header is dependent on it's state prior to entering this + * function. The following transitions are possible: * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so may not catch all candidates. - * It may also return without evicting as much space as requested. + * - arc_mru -> arc_mru_ghost + * - arc_mfu -> arc_mfu_ghost + * - arc_mru_ghost -> arc_l2c_only + * - arc_mru_ghost -> deleted + * - arc_mfu_ghost -> arc_l2c_only + * - arc_mfu_ghost -> deleted */ -static void * -arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - arc_buf_contents_t type) +static int64_t +arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *evicted_state; - uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - int64_t bytes_remaining; - arc_buf_hdr_t *hdr, *hdr_prev = NULL; - list_t *evicted_list, *list, *evicted_list_start, *list_start; - kmutex_t *lock, *evicted_lock; - kmutex_t *hash_lock; - boolean_t have_lock; - void *stolen = NULL; - arc_buf_hdr_t marker = { 0 }; - int count = 0; - static int evict_metadata_offset, evict_data_offset; - int i, idx, offset, list_count, lists; + arc_state_t *evicted_state, *state; + int64_t bytes_evicted = 0; - ASSERT(state == arc_mru || state == arc_mfu); + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + state = hdr->b_l1hdr.b_state; + if (GHOST_STATE(state)) { + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_buf == NULL); - /* - * Decide which "type" (data vs metadata) to recycle from. - * - * If we are over the metadata limit, recycle from metadata. - * If we are under the metadata minimum, recycle from data. - * Otherwise, recycle from whichever type has the oldest (least - * recently accessed) header. This is not yet implemented. 
- */ - if (recycle) { - arc_buf_contents_t realtype; - if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { - realtype = ARC_BUFC_METADATA; - } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { - realtype = ARC_BUFC_DATA; - } else if (arc_meta_used >= arc_meta_limit) { - realtype = ARC_BUFC_METADATA; - } else if (arc_meta_used <= arc_meta_min) { - realtype = ARC_BUFC_DATA; -#ifdef illumos - } else if (HDR_HAS_L1HDR(data_hdr) && - HDR_HAS_L1HDR(metadata_hdr) && - data_hdr->b_l1hdr.b_arc_access < - metadata_hdr->b_l1hdr.b_arc_access) { - realtype = ARC_BUFC_DATA; - } else { - realtype = ARC_BUFC_METADATA; -#else - } else { - /* TODO */ - realtype = type; -#endif + /* + * l2arc_write_buffers() relies on a header's L1 portion + * (i.e. its b_tmp_cdata field) during its write phase. + * Thus, we cannot push a header onto the arc_l2c_only + * state (removing its L1 piece) until the header is + * done being written to the l2arc. + */ + if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { + ARCSTAT_BUMP(arcstat_evict_l2_skip); + return (bytes_evicted); } - if (realtype != type) { + + ARCSTAT_BUMP(arcstat_deleted); + bytes_evicted += hdr->b_size; + + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + + if (HDR_HAS_L2HDR(hdr)) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, hdr, hash_lock); /* - * If we want to evict from a different list, - * we can not recycle, because DATA vs METADATA - * buffers are segregated into different kmem - * caches (and vmem arenas). + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. */ - type = realtype; - recycle = B_FALSE; + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); + } else { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } + return (bytes_evicted); } - if (type == ARC_BUFC_METADATA) { - offset = 0; - list_count = ARC_BUFC_NUMMETADATALISTS; - list_start = &state->arcs_lists[0]; - evicted_list_start = &evicted_state->arcs_lists[0]; - idx = evict_metadata_offset; - } else { - offset = ARC_BUFC_NUMMETADATALISTS; - list_start = &state->arcs_lists[offset]; - evicted_list_start = &evicted_state->arcs_lists[offset]; - list_count = ARC_BUFC_NUMDATALISTS; - idx = evict_data_offset; - } - bytes_remaining = evicted_state->arcs_lsize[type]; - lists = 0; + ASSERT(state == arc_mru || state == arc_mfu); + evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; -evict_start: - list = &list_start[idx]; - evicted_list = &evicted_list_start[idx]; - lock = ARCS_LOCK(state, (offset + idx)); - evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(hdr) || + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < + arc_min_prefetch_lifespan)) { + ARCSTAT_BUMP(arcstat_evict_skip); + return (bytes_evicted); + } - /* - * The ghost list lock must be acquired first in order to prevent - * a 3 party deadlock: - * - * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by - * l2ad_mtx in arc_hdr_realloc - * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx - * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by - * arc_*_ghost->arcs_mtx and forms a deadlock cycle. - * - * This situation is avoided by acquiring the ghost list lock first.
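The state-transition table in the new arc_evict_hdr() comment collapses into a small decision tree. The following standalone model is only an illustration; the enum, struct fields, and helper name are invented here and merely mirror the shape of the kernel logic:

#include <stdbool.h>

enum arc_state_id { MRU, MFU, MRU_GHOST, MFU_GHOST, L2C_ONLY, DELETED };

struct hdr_model {
	enum arc_state_id state;	/* list the header currently lives on */
	bool has_l2hdr;			/* a copy exists in the L2ARC */
	bool l2_writing;		/* that copy is still being written */
};

/* Where does a header go when evicted from its current state? */
static enum arc_state_id
evict_transition(const struct hdr_model *h)
{
	if (h->state == MRU_GHOST || h->state == MFU_GHOST) {
		if (h->has_l2hdr && h->l2_writing)
			return (h->state);	/* skip: L1 data still needed */
		return (h->has_l2hdr ? L2C_ONLY : DELETED);
	}
	/* regular states simply fall onto their ghost list */
	return (h->state == MRU ? MRU_GHOST : MFU_GHOST);
}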
- */ - mutex_enter(evicted_lock); - mutex_enter(lock); - - for (hdr = list_tail(list); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(list, hdr); - if (HDR_HAS_L1HDR(hdr)) { - bytes_remaining -= - (hdr->b_size * hdr->b_l1hdr.b_datacnt); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + ARCSTAT_BUMP(arcstat_mutex_miss); + break; } - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - (spa && hdr->b_spa != spa) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - arc_min_prefetch_lifespan)) { - skipped++; - continue; + if (buf->b_data != NULL) + bytes_evicted += hdr->b_size; + if (buf->b_efunc != NULL) { + mutex_enter(&arc_user_evicts_lock); + arc_buf_destroy(buf, FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); + mutex_exit(&buf->b_evict_lock); + } else { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, TRUE); } - /* "lookahead" for better eviction candidate */ - if (recycle && hdr->b_size != bytes && - hdr_prev && hdr_prev->b_size == bytes) - continue; + } - /* ignore markers */ - if (hdr->b_spa == 0) - continue; + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + } else { + if (l2arc_write_eligible(hdr->b_spa, hdr)) + ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); + else + ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + } + + if (hdr->b_l1hdr.b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } + + return (bytes_evicted); +} + +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) +{ + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + int evict_count = 0; + + ASSERT3P(marker, !=, NULL); + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + mls = multilist_sublist_lock(ml, idx); + + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. - * - * If we are looking for a buffer to recycle, we are in - * the hot code path, so don't sleep. + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). 
*/ - if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, hdr, &marker); - mutex_exit(lock); - mutex_exit(evicted_lock); - kpreempt(KPREEMPT_SYNC); - mutex_enter(evicted_lock); - mutex_enter(lock); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero, is the marker headers inserted by + * arc_evict_state(). It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ + if (hdr->b_spa == 0) + continue; + + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - missed += 1; - break; - } - if (buf->b_data != NULL) { - bytes_evicted += hdr->b_size; - if (recycle && - arc_buf_type(hdr) == type && - hdr->b_size == bytes && - !HDR_L2_WRITING(hdr)) { - stolen = buf->b_data; - recycle = FALSE; - } - } - if (buf->b_efunc != NULL) { - mutex_enter(&arc_eviction_mtx); - arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); - } - } - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, - hdr->b_size); - } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - hdr->b_size); - } else { - ARCSTAT_INCR( - arcstat_evict_l2_ineligible, - hdr->b_size); - } - } + /* + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held). + */ + ASSERT(!MUTEX_HELD(hash_lock)); - if (hdr->b_l1hdr.b_datacnt == 0) { - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); - } - if (!have_lock) - mutex_exit(hash_lock); - if (bytes >= 0 && bytes_evicted >= bytes) - break; - if (bytes_remaining > 0) { - mutex_exit(evicted_lock); - mutex_exit(lock); - idx = ((idx + 1) & (list_count - 1)); - lists++; - goto evict_start; - } - } else { - missed += 1; - } - } + if (mutex_tryenter(hash_lock)) { + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); - mutex_exit(lock); - mutex_exit(evicted_lock); + bytes_evicted += evicted; - idx = ((idx + 1) & (list_count - 1)); - lists++; + /* + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header, don't increment + * evict_count in this case. 
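The marker technique described in the comment above is a general pattern for resumable list scans. A userland sketch with the <sys/queue.h> TAILQ macros, in which is_marker plays the role of the b_spa == 0 convention and scan_with_marker() loosely stands in for arc_evict_state_impl() (both names are invented):

#include <stddef.h>
#include <sys/queue.h>

struct node {
	TAILQ_ENTRY(node) link;
	int is_marker;			/* analogous to b_spa == 0 */
};
TAILQ_HEAD(sublist, node);

/* The caller has already inserted 'marker' at the tail of 'list'. */
static void
scan_with_marker(struct sublist *list, struct node *marker)
{
	struct node *n;

	while ((n = TAILQ_PREV(marker, sublist, link)) != NULL) {
		/* keep our place: slide the marker past 'n' */
		TAILQ_REMOVE(list, marker, link);
		TAILQ_INSERT_BEFORE(n, marker, link);

		if (n->is_marker)	/* another scanner's marker, skip it */
			continue;

		/* ... try-lock 'n' and evict it here ... */
	}
}

Because only the marker is ever removed and re-inserted by the scan itself, the position survives dropping the sublist lock between iterations.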
+ */ + if (evicted != 0) + evict_count++; - if (bytes_evicted < bytes) { - if (lists < list_count) - goto evict_start; - else - dprintf("only evicted %lld bytes from %x", - (longlong_t)bytes_evicted, state); + /* + * If arc_size isn't overflowing, signal any + * threads that might happen to be waiting. + * + * For each header evicted, we wake up a single + * thread. If we used cv_broadcast, we could + * wake up "too many" threads causing arc_size + * to significantly overflow arc_c; since + * arc_get_data_buf() doesn't check for overflow + * when it's woken up (it doesn't because it's + * possible for the ARC to be overflowing while + * full of un-evictable buffers, and the + * function should proceed in this case). + * + * If threads are left sleeping, due to not + * using cv_broadcast, they will be woken up + * just before arc_reclaim_thread() sleeps. + */ + mutex_enter(&arc_reclaim_lock); + if (!arc_is_overflowing()) + cv_signal(&arc_reclaim_waiters_cv); + mutex_exit(&arc_reclaim_lock); + } else { + ARCSTAT_BUMP(arcstat_mutex_miss); + } } - if (type == ARC_BUFC_METADATA) - evict_metadata_offset = idx; - else - evict_data_offset = idx; - if (skipped) - ARCSTAT_INCR(arcstat_evict_skip, skipped); - - if (missed) - ARCSTAT_INCR(arcstat_mutex_miss, missed); - - /* - * Note: we have just evicted some data into the ghost state, - * potentially putting the ghost size over the desired size. Rather - * that evicting from the ghost list in this hot code path, leave - * this chore to the arc_reclaim_thread(). - */ + multilist_sublist_unlock(mls); - if (stolen) - ARCSTAT_BUMP(arcstat_stolen); - return (stolen); + return (bytes_evicted); } /* - * Remove buffers from list until we've removed the specified number of - * bytes. Destroy the buffers that are removed. + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state; this is used by arc_flush(). */ -static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) { - arc_buf_hdr_t *hdr, *hdr_prev; - arc_buf_hdr_t marker = { 0 }; - list_t *list, *list_start; - kmutex_t *hash_lock, *lock; - uint64_t bytes_deleted = 0; - uint64_t bufs_skipped = 0; - int count = 0; - static int evict_offset; - int list_count, idx = evict_offset; - int offset, lists = 0; - - ASSERT(GHOST_STATE(state)); + uint64_t total_evicted = 0; + multilist_t *ml = &state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + num_sublists = multilist_get_num_sublists(ml); /* - * data lists come after metadata lists + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time.
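The evictor side of the single-wakeup policy described above is small enough to isolate. A pthreads sketch; reclaim_lock, waiters_cv, and cache_is_overflowing() are invented stand-ins for arc_reclaim_lock, arc_reclaim_waiters_cv, and arc_is_overflowing():

#include <pthread.h>
#include <stdbool.h>

extern pthread_mutex_t reclaim_lock;
extern pthread_cond_t waiters_cv;
extern bool cache_is_overflowing(void);

/* Called once per header evicted: wake at most one blocked allocator. */
static void
wake_one_waiter(void)
{
	pthread_mutex_lock(&reclaim_lock);
	if (!cache_is_overflowing())
		pthread_cond_signal(&waiters_cv);	/* one thread, not broadcast */
	pthread_mutex_unlock(&reclaim_lock);
}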
*/ - list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; - list_count = ARC_BUFC_NUMDATALISTS; - offset = ARC_BUFC_NUMMETADATALISTS; - -evict_start: - list = &list_start[idx]; - lock = ARCS_LOCK(state, idx + offset); - - mutex_enter(lock); - for (hdr = list_tail(list); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(list, hdr); - if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) - panic("invalid hdr=%p", (void *)hdr); - if (spa && hdr->b_spa != spa) - continue; + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (int i = 0; i < num_sublists; i++) { + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - /* ignore markers */ - if (hdr->b_spa == 0) - continue; + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_adjust_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; - hash_lock = HDR_LOCK(hdr); - /* caller may be trying to modify this buffer, skip it */ - if (MUTEX_HELD(hash_lock)) - continue; + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); + } + /* + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. + */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. + * Start eviction using a randomly selected sublist, + * this is to try and evenly balance eviction across all + * sublists. Always starting at the same sublist + * (e.g. index 0) would cause evictions to favor certain + * sublists over others. */ - if (count++ > arc_evict_iterations) { - list_insert_after(list, hdr, &marker); - mutex_exit(lock); - kpreempt(KPREEMPT_SYNC); - mutex_enter(lock); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; - continue; - } - if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_HAS_L1HDR(hdr) || - hdr->b_l1hdr.b_buf == NULL); - ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += hdr->b_size; + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; - if (HDR_HAS_L2HDR(hdr)) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, hdr, hash_lock); - /* - * dropping from L1+L2 cached to L2-only, - * realloc to remove the L1 header. 
- */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, - hdr_l2only_cache); - mutex_exit(hash_lock); - } else { - arc_change_state(arc_anon, hdr, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(hdr); - } + for (int i = 0; i < num_sublists; i++) { + uint64_t bytes_remaining; + uint64_t bytes_evicted; - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - if (bytes >= 0 && bytes_deleted >= bytes) + if (bytes == ARC_EVICT_ALL) + bytes_remaining = ARC_EVICT_ALL; + else if (total_evicted < bytes) + bytes_remaining = bytes - total_evicted; + else break; - } else if (bytes < 0) { + + bytes_evicted = arc_evict_state_impl(ml, sublist_idx, + markers[sublist_idx], spa, bytes_remaining); + + scan_evicted += bytes_evicted; + total_evicted += bytes_evicted; + + /* we've reached the end, wrap to the beginning */ + if (++sublist_idx >= num_sublists) + sublist_idx = 0; + } + + /* + * If we didn't evict anything during this scan, we have + * no reason to believe we'll evict more during another + * scan, so break the loop. + */ + if (scan_evicted == 0) { + /* This isn't possible, let's make that obvious */ + ASSERT3S(bytes, !=, 0); + /* - * Insert a list marker and then wait for the - * hash lock to become available. Once its - * available, restart from where we left off. + * When bytes is ARC_EVICT_ALL, the only way to + * break the loop is when scan_evicted is zero. + * In that case, we actually have evicted enough, + * so we don't want to increment the kstat. */ - list_insert_after(list, hdr, &marker); - mutex_exit(lock); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - mutex_enter(lock); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - } else { - bufs_skipped += 1; + if (bytes != ARC_EVICT_ALL) { + ASSERT3S(total_evicted, <, bytes); + ARCSTAT_BUMP(arcstat_evict_not_enough); + } + + break; } + } + for (int i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_remove(mls, markers[i]); + multilist_sublist_unlock(mls); + + kmem_cache_free(hdr_full_cache, markers[i]); } - mutex_exit(lock); - idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); - lists++; + kmem_free(markers, sizeof (*markers) * num_sublists); - if (lists < list_count) - goto evict_start; + return (total_evicted); +} - evict_offset = idx; - if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && - (bytes < 0 || bytes_deleted < bytes)) { - list_start = &state->arcs_lists[0]; - list_count = ARC_BUFC_NUMMETADATALISTS; - offset = lists = 0; - goto evict_start; +/* + * Flush all "evictable" data of the given type from the arc state + * specified. This will not evict any "active" buffers (i.e. referenced). + * + * When 'retry' is set to FALSE, the function will make a single pass + * over the state and evict any buffers that it can. Since it doesn't + * continually retry the eviction, it might end up leaving some buffers + * in the ARC due to lock misses. + * + * When 'retry' is set to TRUE, the function will continually retry the + * eviction until *all* evictable buffers have been removed from the + * state. As a result, if concurrent insertions into the state are + * allowed (e.g. if the ARC isn't shutting down), this function might + * wind up in an infinite loop, continually trying to evict buffers. 
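The fan-out loop in arc_evict_state(), with its random starting sublist, wrap-around, and early exit once the target is met, reduces to a few lines. A sketch under the assumption that evict_one() plays the role of arc_evict_state_impl() and EVICT_ALL mirrors ARC_EVICT_ALL:

#include <stdint.h>
#include <stdlib.h>

#define EVICT_ALL (-1)

static uint64_t
evict_round(int64_t target, int num_sublists,
    uint64_t (*evict_one)(int idx, int64_t want))
{
	uint64_t total = 0;
	int idx = rand() % num_sublists;	/* random start spreads the load */

	for (int i = 0; i < num_sublists; i++) {
		int64_t want;

		if (target == EVICT_ALL)
			want = EVICT_ALL;
		else if (total < (uint64_t)target)
			want = target - total;
		else
			break;			/* target already met */

		total += evict_one(idx, want);
		if (++idx >= num_sublists)
			idx = 0;		/* reached the end, wrap around */
	}
	return (total);
}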
+ */ +static uint64_t +arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, + boolean_t retry) +{ + uint64_t evicted = 0; + + while (state->arcs_lsize[type] != 0) { + evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + + if (!retry) + break; } - if (bufs_skipped) { - ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); - ASSERT(bytes >= 0); + return (evicted); +} + +/* + * Evict the specified number of bytes from the state specified, + * restricting eviction to the spa and type given. This function + * prevents us from trying to evict more from a state's list than + * is "evictable", and allows us to skip evicting altogether when passed a + * negative value for "bytes". In contrast, arc_evict_state() will + * evict everything it can, when passed a negative value for "bytes". + */ +static uint64_t +arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + int64_t delta; + + if (bytes > 0 && state->arcs_lsize[type] > 0) { + delta = MIN(state->arcs_lsize[type], bytes); + return (arc_evict_state(state, spa, delta, type)); } - if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p", - (longlong_t)bytes_deleted, state); + return (0); } -static void +/* + * Evict metadata buffers from the cache, such that arc_meta_used is + * capped by the arc_meta_limit tunable. + */ +static uint64_t +arc_adjust_meta(void) +{ + uint64_t total_evicted = 0; + int64_t target; + + /* + * If we're over the meta limit, we want to evict enough + * metadata to get back under the meta limit. We don't want to + * evict so much that we drop the MRU below arc_p, though. If + * we're over the meta limit more than we're over arc_p, we + * evict some from the MRU here, and some from the MFU below. + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(refcount_count(&arc_anon->arcs_size) + + refcount_count(&arc_mru->arcs_size) - arc_p)); + + total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + + /* + * Similar to the above, we want to evict enough bytes to get us + * below the meta limit, but not so much as to drop us below the + * space allotted to the MFU (which is defined as arc_c - arc_p). + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); + + total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +/* + * Return the type of the oldest buffer in the given arc state + * + * This function will select a random sublist of type ARC_BUFC_DATA and + * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist + * is compared, and the type which contains the "older" buffer will be + * returned. + */ +static arc_buf_contents_t +arc_adjust_type(arc_state_t *state) +{ + multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; + int data_idx = multilist_get_random_index(data_ml); + int meta_idx = multilist_get_random_index(meta_ml); + multilist_sublist_t *data_mls; + multilist_sublist_t *meta_mls; + arc_buf_contents_t type; + arc_buf_hdr_t *data_hdr; + arc_buf_hdr_t *meta_hdr; + + /* + * We keep the sublist lock until we're finished, to prevent + * the headers from being destroyed via arc_evict_state().
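The two MIN() targets in arc_adjust_meta() are easier to see with numbers plugged in. A toy computation with invented sizes (in MB):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
	int64_t meta_used = 600, meta_limit = 512;	/* 88 over the limit */
	int64_t anon_plus_mru = 300, arc_p = 250;	/* MRU side 50 over arc_p */

	int64_t target = MIN(meta_used - meta_limit, anon_plus_mru - arc_p);
	printf("MRU metadata eviction target: %jd MB\n", (intmax_t)target);
	/* prints 50: the MRU is trimmed only down to arc_p, and the
	 * remaining 38 MB of excess metadata is left for the MFU pass */
	return (0);
}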
+ */ + data_mls = multilist_sublist_lock(data_ml, data_idx); + meta_mls = multilist_sublist_lock(meta_ml, meta_idx); + + /* + * These two loops are to ensure we skip any markers that + * might be at the tail of the lists due to arc_evict_state(). + */ + + for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; + data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { + if (data_hdr->b_spa != 0) + break; + } + + for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; + meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { + if (meta_hdr->b_spa != 0) + break; + } + + if (data_hdr == NULL && meta_hdr == NULL) { + type = ARC_BUFC_DATA; + } else if (data_hdr == NULL) { + ASSERT3P(meta_hdr, !=, NULL); + type = ARC_BUFC_METADATA; + } else if (meta_hdr == NULL) { + ASSERT3P(data_hdr, !=, NULL); + type = ARC_BUFC_DATA; + } else { + ASSERT3P(data_hdr, !=, NULL); + ASSERT3P(meta_hdr, !=, NULL); + + /* The headers can't be on the sublist without an L1 header */ + ASSERT(HDR_HAS_L1HDR(data_hdr)); + ASSERT(HDR_HAS_L1HDR(meta_hdr)); + + if (data_hdr->b_l1hdr.b_arc_access < + meta_hdr->b_l1hdr.b_arc_access) { + type = ARC_BUFC_DATA; + } else { + type = ARC_BUFC_METADATA; + } + } + + multilist_sublist_unlock(meta_mls); + multilist_sublist_unlock(data_mls); + + return (type); +} + +/* + * Evict buffers from the cache, such that arc_size is capped by arc_c. + */ +static uint64_t arc_adjust(void) { - int64_t adjustment, delta; + uint64_t total_evicted = 0; + uint64_t bytes; + int64_t target; + + /* + * If we're over arc_meta_limit, we want to correct that before + * potentially evicting data buffers below. + */ + total_evicted += arc_adjust_meta(); /* * Adjust MRU size + * + * If we're over the target cache size, we want to evict enough + * from the list to get back to our target size. We don't want + * to evict too much from the MRU, such that it drops below + * arc_p. So, if we're over our target cache size more than + * the MRU is over arc_p, we'll evict enough to get back to + * arc_p here, and then evict more from the MFU below. */ + target = MIN((int64_t)(arc_size - arc_c), + (int64_t)(refcount_count(&arc_anon->arcs_size) + + refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); - adjustment = MIN((int64_t)(arc_size - arc_c), - (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - - arc_p)); + /* + * If we're below arc_meta_min, always prefer to evict data. + * Otherwise, try to satisfy the requested number of bytes to + * evict from the type which contains older buffers; in an + * effort to keep newer buffers in the cache regardless of their + * type. If we cannot satisfy the number of bytes from this + * type, spill over into the next type. + */ + if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. 
+ */ + target -= bytes; - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, - ARC_BUFC_METADATA); + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Adjust MFU size + * + * Now that we've tried to evict enough from the MRU to get its + * size back to arc_p, if we're still above the target cache + * size, we evict the rest from the MFU. */ + target = arc_size - arc_c; - adjustment = arc_size - arc_c; + if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. + */ + target -= bytes; - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t delta = MIN(adjustment, - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, 0, delta, FALSE, - ARC_BUFC_METADATA); + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists + * + * In addition to the above, the ARC also defines target values + * for the ghost lists. The sum of the mru list and mru ghost + * list should never exceed the target size of the cache, and + * the sum of the mru list, mfu list, mru ghost list, and mfu + * ghost list should never exceed twice the target size of the + * cache. The following logic enforces these limits on the ghost + * caches, and evicts from them as needed.
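The evict-preferred-type-then-spill-over step that arc_adjust() repeats for the MRU and the MFU can be factored into one helper. A sketch in which evict() is a hypothetical stand-in for arc_adjust_impl() (recall that a negative target evicts nothing):

#include <stdint.h>

enum buf_type { TYPE_DATA, TYPE_METADATA };

static uint64_t
adjust_with_spillover(int64_t target, enum buf_type preferred,
    uint64_t (*evict)(enum buf_type t, int64_t want))
{
	enum buf_type other =
	    (preferred == TYPE_DATA) ? TYPE_METADATA : TYPE_DATA;
	uint64_t taken = evict(preferred, target);

	/* whatever the preferred type could not satisfy spills over */
	return (taken + evict(other, target - (int64_t)taken));
}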
*/ + target = refcount_count(&arc_mru->arcs_size) + + refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { - delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta); - } + target -= bytes; - adjustment = - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + total_evicted += + arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { - delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta); - } + /* + * We assume the sum of the mru list and mfu list is less than + * or equal to arc_c (we enforced this above), which means we + * can use the simpler of the two equations below: + * + * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c + * mru ghost + mfu ghost <= arc_c + */ + target = refcount_count(&arc_mru_ghost->arcs_size) + + refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; + + bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); } static void arc_do_user_evicts(void) { - static arc_buf_t *tmp_arc_eviction_list; - - /* - * Move list over to avoid LOR - */ -restart: - mutex_enter(&arc_eviction_mtx); - tmp_arc_eviction_list = arc_eviction_list; - arc_eviction_list = NULL; - mutex_exit(&arc_eviction_mtx); - - while (tmp_arc_eviction_list != NULL) { - arc_buf_t *buf = tmp_arc_eviction_list; - tmp_arc_eviction_list = buf->b_next; + mutex_enter(&arc_user_evicts_lock); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_user_evicts_lock); if (buf->b_efunc != NULL) VERIFY0(buf->b_efunc(buf->b_private)); @@ -3014,58 +3267,45 @@ restart: buf->b_efunc = NULL; buf->b_private = NULL; kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_user_evicts_lock); } - - if (arc_eviction_list != NULL) - goto restart; + mutex_exit(&arc_user_evicts_lock); } -/* - * Flush all *evictable* data from the cache for the given spa. - * NOTE: this will not touch "active" (i.e. referenced) data. - */ void -arc_flush(spa_t *spa) +arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; + /* + * If retry is TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. 
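Spelling out the "simpler of the two equations" argument in the comment above: the MRU and MFU passes earlier in arc_adjust() leave

	mru + mfu <= arc_c

and the ghost passes enforce

	mru_ghost + mfu_ghost <= arc_c.

Adding the two inequalities gives

	mru + mfu + mru_ghost + mfu_ghost <= 2 * arc_c,

which is exactly the documented bound on the four lists combined, so the cheaper per-pair check is sufficient.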
+ */ + ASSERT(!retry || spa == 0); + if (spa != NULL) guid = spa_load_guid(spa); - while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa != NULL) - break; - } - while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa != NULL) - break; - } - while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa != NULL) - break; - } - while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa != NULL) - break; - } + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); - arc_evict_ghost(arc_mru_ghost, guid, -1); - arc_evict_ghost(arc_mfu_ghost, guid, -1); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); - mutex_exit(&arc_reclaim_thr_lock); ASSERT(spa || arc_eviction_list == NULL); } void arc_shrink(int64_t to_free) { - if (arc_c > arc_c_min) { DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, arc_c_min, uint64_t, arc_p, uint64_t, to_free); @@ -3090,7 +3330,7 @@ arc_shrink(int64_t to_free) if (arc_size > arc_c) { DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, uint64_t, arc_c); - arc_adjust(); + (void) arc_adjust(); } } @@ -3329,17 +3569,37 @@ arc_kmem_reap_now(void) DTRACE_PROBE(arc__kmem_reap_end); } +/* + * Threads can block in arc_get_data_buf() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_buf() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). + */ static void arc_reclaim_thread(void *dummy __unused) { clock_t growtime = 0; callb_cpr_t cpr; - CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); - mutex_enter(&arc_reclaim_thr_lock); - while (arc_thread_exit == 0) { + mutex_enter(&arc_reclaim_lock); + while (!arc_reclaim_thread_exit) { int64_t free_memory = arc_available_memory(); + uint64_t evicted = 0; + + mutex_exit(&arc_reclaim_lock); + if (free_memory < 0) { arc_no_grow = B_TRUE; @@ -3373,17 +3633,60 @@ arc_reclaim_thread(void *dummy __unused) arc_no_grow = B_FALSE; } - arc_adjust(); + evicted = arc_adjust(); - if (arc_eviction_list != NULL) - arc_do_user_evicts(); + mutex_enter(&arc_reclaim_lock); + /* + * If evicted is zero, we couldn't evict anything via + * arc_adjust(). 
This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. + */ + if (arc_size <= arc_c || evicted == 0) { #ifdef _KERNEL - if (needfree) { needfree = 0; - wakeup(&needfree); - } #endif + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. + */ + cv_broadcast(&arc_reclaim_waiters_cv); + + /* + * Block until signaled, or after one second (we + * might need to perform arc_kmem_reap_now() + * even if we aren't being signalled) + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thread_cv, + &arc_reclaim_lock, hz); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); + } + } + + arc_reclaim_thread_exit = FALSE; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + thread_exit(); +} + +static void +arc_user_evicts_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_user_evicts_lock); + while (!arc_user_evicts_thread_exit) { + mutex_exit(&arc_user_evicts_lock); + + arc_do_user_evicts(); /* * This is necessary in order for the mdb ::arc dcmd to @@ -3399,16 +3702,21 @@ arc_reclaim_thread(void *dummy __unused) if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); - /* block until needed, or one second, whichever is shorter */ + mutex_enter(&arc_user_evicts_lock); + + /* + * Block until signaled, or after one second (we need to + * call the arc's kstat update function regularly). + */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, hz); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + (void) cv_timedwait(&arc_user_evicts_cv, + &arc_user_evicts_lock, hz); + CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); } - arc_thread_exit = 0; - cv_broadcast(&arc_reclaim_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + arc_user_evicts_thread_exit = FALSE; + cv_broadcast(&arc_user_evicts_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ thread_exit(); } @@ -3422,6 +3730,8 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); + int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); + int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); if (state == arc_l2c_only) return; @@ -3436,16 +3746,14 @@ arc_adapt(int bytes, arc_state_t *state) * target size of the MRU list. */ if (state == arc_mru_ghost) { - mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? - 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; - mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? - 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = (mfug_size >= mrug_size) ? 
1 : (mrug_size / mfug_size); mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); @@ -3454,7 +3762,7 @@ arc_adapt(int bytes, arc_state_t *state) ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thr_cv); + cv_signal(&arc_reclaim_thread_cv); return; } @@ -3482,43 +3790,25 @@ arc_adapt(int bytes, arc_state_t *state) } /* - * Check if the cache has reached its limits and eviction is required - * prior to insert. + * Check if arc_size has grown past our upper threshold, determined by + * zfs_arc_overflow_shift. */ -static int -arc_evict_needed(arc_buf_contents_t type) +static boolean_t +arc_is_overflowing(void) { - if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) - return (1); + /* Always allow at least one block of overflow */ + uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, + arc_c >> zfs_arc_overflow_shift); - if (arc_reclaim_needed()) - return (1); - - return (arc_size > arc_c); + return (arc_size >= arc_c + overflow); } /* - * The buffer, supplied as the first argument, needs a data block. - * So, if we are at cache max, determine which cache should be victimized. - * We have the following cases: - * - * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> - * In this situation if we're out of space, but the resident size of the MFU is - * under the limit, victimize the MFU cache to satisfy this insertion request. - * - * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> - * Here, we've used up all of the available space for the MRU, so we need to - * evict from our own cache instead. Evict from the set of resident MRU - * entries. - * - * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> - * c minus p represents the MFU space in the cache, since p is the size of the - * cache that is dedicated to the MRU. In this situation there's still space on - * the MFU side, so the MRU side needs to be victimized. - * - * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> - * MFU's resident set is consuming more space than it has been allotted. In - * this situation, we must victimize our own cache, the MFU, for this insertion. + * The buffer, supplied as the first argument, needs a data block. If we + * are hitting the hard limit for the cache size, we must sleep, waiting + * for the eviction thread to catch up. If we're past the target size + * but below the hard limit, we'll only signal the reclaim thread and + * continue on. */ static void arc_get_data_buf(arc_buf_t *buf) @@ -3530,62 +3820,70 @@ arc_get_data_buf(arc_buf_t *buf) arc_adapt(size, state); /* - * We have not yet reached cache maximum size, - * just allocate a new buffer. + * If arc_size is currently overflowing, and has grown past our + * upper limit, we must be adding data faster than the evict + * thread can evict. Thus, to ensure we don't compound the + * problem by adding more data and forcing arc_size to grow even + * further past its target size, we halt and wait for the + * eviction thread to catch up. + * + * It's also possible that the reclaim thread is unable to evict + * enough buffers to get arc_size below the overflow limit (e.g. + * due to buffers being un-evictable, or hash lock collisions). + * In this case, we want to proceed regardless of whether we're + * overflowing; thus we don't use a while loop here.
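arc_is_overflowing() comes down to one comparison with a floor on the allowed slack. A self-contained version of the same arithmetic; the block-size constant is invented for the sketch. With arc_c = 4 GiB and a shift of 8, for example, the slack works out to max(128 KiB, 16 MiB) = 16 MiB:

#include <stdint.h>

#define MAX_BLOCK_SIZE (128ULL << 10)	/* 128 KiB, for illustration */

static int
is_overflowing(uint64_t arc_size, uint64_t arc_c, unsigned overflow_shift)
{
	uint64_t overflow = arc_c >> overflow_shift;

	if (overflow < MAX_BLOCK_SIZE)
		overflow = MAX_BLOCK_SIZE;	/* at least one block of slack */
	return (arc_size >= arc_c + overflow);
}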
*/ - if (!arc_evict_needed(type)) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); + if (arc_is_overflowing()) { + mutex_enter(&arc_reclaim_lock); + + /* + * Now that we've acquired the lock, we may no longer be + * over the overflow limit; let's check. + * + * We're ignoring the case of spurious wake ups. If that + * were to happen, it'd let this thread consume an ARC + * buffer before it should have (i.e. before we're under + * the overflow limit and were signalled by the reclaim + * thread). As long as that is a rare occurrence, it + * shouldn't cause any harm. + */ + if (arc_is_overflowing()) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } - goto out; + + mutex_exit(&arc_reclaim_lock); } - /* - * If we are prefetching from the mfu ghost list, this buffer - * will end up on the mru list; so steal space from there. - */ - if (state == arc_mfu_ghost) - state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu; - else if (state == arc_mru_ghost) - state = arc_mru; - - if (state == arc_mru || state == arc_anon) { - uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] >= size && - arc_p > mru_used) ? arc_mfu : arc_mru; + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_META); } else { - /* MFU cases */ - uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] >= size && - mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); } - if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); - } - ARCSTAT_BUMP(arcstat_recycle_miss); - } - ASSERT(buf->b_data != NULL); -out: + /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; + arc_state_t *state = hdr->b_l1hdr.b_state; + + (void) refcount_add_many(&state->arcs_size, size, buf); - atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); - if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { + /* + * If this is reached via arc_read, the link is + * protected by the hash lock. If reached via + * arc_buf_alloc, the header should not be accessed by + * any other thread. And, if reached via arc_read_done, + * the hash lock will protect it if it's found in the + * hash table; otherwise no other thread should be + * trying to [add|remove]_reference it.
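The check / lock / recheck / wait shape of the throttle above, as a pthreads sketch. The lock, the two condition variables, and cache_is_overflowing() are invented stand-ins for the ARC's primitives:

#include <pthread.h>
#include <stdbool.h>

extern pthread_mutex_t reclaim_lock;
extern pthread_cond_t reclaim_thread_cv, waiters_cv;
extern bool cache_is_overflowing(void);

static void
throttle_allocation(void)
{
	if (!cache_is_overflowing())
		return;				/* fast path, no lock taken */

	pthread_mutex_lock(&reclaim_lock);
	/* recheck under the lock; deliberately an 'if', not a 'while' */
	if (cache_is_overflowing()) {
		pthread_cond_signal(&reclaim_thread_cv);	/* kick the evictor */
		pthread_cond_wait(&waiters_cv, &reclaim_lock);
	}
	pthread_mutex_unlock(&reclaim_lock);
}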
+ */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], size); @@ -3595,7 +3893,8 @@ out: * data, and we have outgrown arc_p, update arc_p */ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && - arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + (refcount_count(&arc_anon->arcs_size) + + refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); @@ -3638,7 +3937,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ if (HDR_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - ASSERT(list_link_active( + /* link protected by hash lock */ + ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; @@ -3698,7 +3998,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ if ((HDR_PREFETCH(hdr)) != 0) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* link protected by hash_lock */ + ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); @@ -3903,7 +4204,7 @@ arc_read_done(zio_t *zio) } /* - * "Read" the block block at the specified DVA (in bp) via the + * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was @@ -4070,7 +4371,7 @@ top: ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_FLAG_PREFETCH) @@ -4297,8 +4598,6 @@ arc_clear_callback(arc_buf_t *buf) kmutex_t *hash_lock; arc_evict_func_t *efunc = buf->b_efunc; void *private = buf->b_private; - list_t *list, *evicted_list; - kmutex_t *lock, *evicted_lock; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; @@ -4334,7 +4633,7 @@ arc_clear_callback(arc_buf_t *buf) if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; @@ -4364,6 +4663,9 @@ arc_release(arc_buf_t *buf, void *tag) */ mutex_enter(&buf->b_evict_lock); + + ASSERT(HDR_HAS_L1HDR(hdr)); + /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be @@ -4449,8 +4751,10 @@ arc_release(arc_buf_t *buf, void *tag) buf->b_next = NULL; ASSERT3P(state, !=, arc_l2c_only); - ASSERT3U(state->arcs_size, >=, hdr->b_size); - atomic_add_64(&state->arcs_size, -hdr->b_size); + + (void) refcount_remove_many( + &state->arcs_size, hdr->b_size, buf); + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); uint64_t *size = &state->arcs_lsize[type]; @@ -4487,17 +4791,18 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_l1hdr.b_datacnt = 1; nhdr->b_l1hdr.b_state = arc_anon; nhdr->b_l1hdr.b_arc_access = 0; + nhdr->b_l1hdr.b_tmp_cdata = NULL; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); - atomic_add_64(&arc_anon->arcs_size, blksz); + (void) refcount_add_many(&arc_anon->arcs_size, 
blksz, buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); - /* protected by hash lock */ - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* protected by hash lock, or hdr is on arc_anon */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; @@ -4759,7 +5064,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * network delays from blocking transactions that are ready to be * assigned to a txg. */ - anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - + arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations @@ -4796,7 +5102,7 @@ static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - size->value.ui64 = state->arcs_size; + size->value.ui64 = refcount_count(&state->arcs_size); evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; } @@ -4834,6 +5140,41 @@ arc_kstat_update(kstat_t *ksp, int rw) return (0); } +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +arc_state_multilist_index_func(multilist_t *ml, void *obj) +{ + arc_buf_hdr_t *hdr = obj; + + /* + * We rely on b_dva to generate evenly distributed index + * numbers using buf_hash below. So, as an added precaution, + * let's make sure we never add empty buffers to the arc lists. + */ + ASSERT(!BUF_EMPTY(hdr)); + + /* + * The assumption here is that the hash value for a given + * arc_buf_hdr_t will remain constant throughout its lifetime + * (i.e. its b_spa, b_dva, and b_birth fields don't change). + * Thus, we don't need to store the header's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublist's usage + * would not be evenly distributed. + */ + return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % + multilist_get_num_sublists(ml)); +} + #ifdef _KERNEL static eventhandler_tag arc_event_lowmem = NULL; @@ -4841,11 +5182,11 @@ static void arc_lowmem(void *arg __unused, int howto __unused) { - mutex_enter(&arc_reclaim_thr_lock); + mutex_enter(&arc_reclaim_lock); /* XXX: Memory deficit should be passed as argument. */ needfree = btoc(arc_c >> arc_shrink_shift); DTRACE_PROBE(arc__needfree); - cv_signal(&arc_reclaim_thr_cv); + cv_signal(&arc_reclaim_thread_cv); /* * It is unsafe to block here in arbitrary threads, because we can come
*/ if (curproc == pageproc) - msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); - mutex_exit(&arc_reclaim_thr_lock); + (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); + mutex_exit(&arc_reclaim_lock); } #endif @@ -4863,8 +5204,12 @@ arc_init(void) { int i, prefetch_tunable_set = 0; - mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; @@ -4936,6 +5281,9 @@ arc_init(void) if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; + if (zfs_arc_num_sublists_per_state < 1) + zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1); + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -4953,45 +5301,59 @@ arc_init(void) arc_l2c_only = &ARC_l2c_only; arc_size = 0; - for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { - mutex_init(&arc_anon->arcs_locks[i].arcs_lock, - NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_locks[i].arcs_lock, - NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, - NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, - NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, - NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, - NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_lists[i], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru_ghost->arcs_lists[i], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu->arcs_lists[i], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu_ghost->arcs_lists[i], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu_ghost->arcs_lists[i], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_l2c_only->arcs_lists[i], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - } + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + 
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); buf_init(); - arc_thread_exit = 0; + arc_reclaim_thread_exit = FALSE; + arc_user_evicts_thread_exit = FALSE; arc_eviction_list = NULL; - mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, @@ -5011,6 +5373,9 @@ arc_init(void) EVENTHANDLER_PRI_FIRST); #endif + (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + arc_dead = FALSE; arc_warm = B_FALSE; @@ -5069,16 +5434,32 @@ arc_init(void) void arc_fini(void) { - int i; + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); - mutex_enter(&arc_reclaim_thr_lock); - arc_thread_exit = 1; - cv_signal(&arc_reclaim_thr_cv); - while (arc_thread_exit != 0) - cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); - mutex_exit(&arc_reclaim_thr_lock); + mutex_enter(&arc_user_evicts_lock); + arc_user_evicts_thread_exit = TRUE; + /* + * The user evicts thread will set arc_user_evicts_thread_exit + * to FALSE when it is finished exiting; we're waiting for that. 
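The shutdown handshake arc_fini() uses for both threads (set the exit flag, signal, then wait for the thread to clear the flag on its way out) is a reusable pattern. A pthreads sketch with all names invented:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool exit_requested = false;

static void *
worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!exit_requested) {
		/* ... periodic work ... */
		pthread_cond_wait(&cv, &lock);
	}
	exit_requested = false;		/* acknowledge the request */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

static void
stop_worker(void)
{
	pthread_mutex_lock(&lock);
	exit_requested = true;
	while (exit_requested) {	/* wait for the acknowledgement */
		pthread_cond_signal(&cv);
		pthread_cond_wait(&cv, &lock);
	}
	pthread_mutex_unlock(&lock);
}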
+ */ + while (arc_user_evicts_thread_exit) { + cv_signal(&arc_user_evicts_cv); + cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); + } + mutex_exit(&arc_user_evicts_lock); - arc_flush(NULL); + /* Use TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, TRUE); arc_dead = TRUE; @@ -5087,24 +5468,28 @@ arc_fini(void) arc_ksp = NULL; } - mutex_destroy(&arc_eviction_mtx); - mutex_destroy(&arc_reclaim_thr_lock); - cv_destroy(&arc_reclaim_thr_cv); + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); - for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { - list_destroy(&arc_mru->arcs_lists[i]); - list_destroy(&arc_mru_ghost->arcs_lists[i]); - list_destroy(&arc_mfu->arcs_lists[i]); - list_destroy(&arc_mfu_ghost->arcs_lists[i]); - list_destroy(&arc_l2c_only->arcs_lists[i]); + mutex_destroy(&arc_user_evicts_lock); + cv_destroy(&arc_user_evicts_cv); - mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); - mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); - mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); - mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); - mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); - mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); - } + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); buf_fini(); @@ -5450,39 +5835,68 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&dev->l2ad_mtx); - /* * All writes completed, or an error was hit. */ +top: + mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); + + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* - * This buffer misses out. It may be in a stage - * of eviction. Its ARC_FLAG_L2_WRITING flag will be - * left set, denying reads to this buffer. + * Missed the hash lock. We must retry so we + * don't leave the ARC_FLAG_L2_WRITING bit set. */ - ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); - continue; + ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); + + /* + * We don't want to rescan the headers we've + * already marked as having been written out, so + * we reinsert the head node so we can pick up + * where we left off. + */ + list_remove(buflist, head); + list_insert_after(buflist, hdr, head); + + mutex_exit(&dev->l2ad_mtx); + + /* + * We wait for the hash lock to become available + * to try and prevent busy waiting, and increase + * the chance we'll be able to acquire the lock + * the next time around. 
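+ * (The lock is not held across the retry: the enter/exit pair
+ * below simply blocks until the current holder drops the lock,
+ * after which the scan restarts from the top.)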
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
 }
 
 /*
- * It's possible that this buffer got evicted from the L1 cache
- * before we grabbed the vdev + hash locks, in which case
- * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
- * Only free the buffer if we still have an L1 hdr.
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
 */
- if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We may have allocated a buffer for L2ARC compression;
+ * we must release it to avoid leaking this data.
+ */
+ l2arc_release_cdata_buf(hdr);
 
 if (zio->io_error != 0) {
 /*
 * Error - drop L2ARC entry.
 */
+ list_remove(buflist, hdr);
 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
@@ -5496,7 +5910,8 @@ l2arc_write_done(zio_t *zio)
 }
 
 /*
- * Allow ARC to begin reads to this L2ARC entry.
+ * Allow ARC to begin reads and ghost list evictions to
+ * this L2ARC entry.
 */
 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
@@ -5604,36 +6019,37 @@ l2arc_read_done(zio_t *zio)
 * the data lists. This function returns a locked list, and also returns
 * the lock pointer.
 */
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
 {
- list_t *list = NULL;
- int idx;
-
- ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
-
- if (list_num < ARC_BUFC_NUMMETADATALISTS) {
- idx = list_num;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
- } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
- ARC_BUFC_NUMDATALISTS)) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else {
- idx = list_num - ARC_BUFC_NUMLISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
 }
- ASSERT(!(MUTEX_HELD(*lock)));
- mutex_enter(*lock);
- return (list);
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
 }
 
 /*
@@ -5678,6 +6094,12 @@ top:
 hdr_prev = list_prev(buflist, hdr);
 hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
 if (!mutex_tryenter(hash_lock)) {
 /*
 * Missed the hash lock. Retry.
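Both buflist walkers above follow the same deadlock-avoidance idiom: l2arc_write_buffers takes a header's hash lock before l2ad_mtx, so a walker that already holds l2ad_mtx may only try-lock the hash lock and must back out and rescan when the try fails. A minimal sketch of the idiom, using hypothetical locks a and b rather than the ARC's own:

	/*
	 * Path A takes a then b. Path B, which already holds b,
	 * may only try-lock a; on failure it backs out, waits for
	 * the holder of a to finish, and retries from scratch.
	 */
retry:
	mutex_enter(&b);
	if (!mutex_tryenter(&a)) {
		mutex_exit(&b);
		mutex_enter(&a);	/* wait out the current holder */
		mutex_exit(&a);
		goto retry;
	}
	/* ... both locks held, with no lock-order cycle ... */
	mutex_exit(&a);
	mutex_exit(&b);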
@@ -5733,6 +6155,10 @@ top: hdr->b_flags |= ARC_FLAG_L2_EVICTED; } + /* Ensure this header has finished being written */ + ASSERT(!HDR_L2_WRITING(hdr)); + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); @@ -5756,11 +6182,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; void *buf_data; - kmutex_t *list_lock; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; @@ -5790,11 +6214,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&dev->l2ad_mtx); - for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { + for (try = 0; try <= 3; try++) { + multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; - list = l2arc_list_locked(try, &list_lock); ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* @@ -5804,13 +6227,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) - hdr = list_head(list); + hdr = multilist_sublist_head(mls); else - hdr = list_tail(list); + hdr = multilist_sublist_tail(mls); if (hdr == NULL) ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); - headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS; + headroom = target_sz * l2arc_headroom; if (do_headroom_boost) headroom = (headroom * l2arc_headroom_boost) / 100; @@ -5819,9 +6242,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, uint64_t buf_sz; if (arc_warm == B_FALSE) - hdr_prev = list_next(list, hdr); + hdr_prev = multilist_sublist_next(mls, hdr); else - hdr_prev = list_prev(list, hdr); + hdr_prev = multilist_sublist_prev(mls, hdr); ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); hash_lock = HDR_LOCK(hdr); @@ -5861,7 +6284,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. */ + mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5915,7 +6340,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); /* * Compute and store the buffer cksum before @@ -5929,7 +6356,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, write_sz += buf_sz; } - mutex_exit(list_lock); + multilist_sublist_unlock(mls); if (full == B_TRUE) break; @@ -5938,12 +6365,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); - mutex_exit(&dev->l2ad_mtx); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } + mutex_enter(&dev->l2ad_mtx); + /* * Now start writing the buffers. 
We're starting at the write head * and work backwards, retracing the course of the buffer selector @@ -5954,6 +6382,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, uint64_t buf_sz; /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must * take care to only access its L2 cache parameters. In @@ -5981,14 +6417,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_sz = hdr->b_l2hdr.b_asize; /* - * If the data has not been compressed, then clear b_tmp_cdata - * to make sure that it points only to a temporary compression - * buffer. - */ - if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))) - hdr->b_l1hdr.b_tmp_cdata = NULL; - - /* * We need to do this regardless if buf_sz is zero or * not, otherwise, when this l2hdr is evicted we'll * remove a reference that was never added. @@ -6081,6 +6509,12 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr) csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, cdata, l2hdr->b_asize); + rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); + if (rounded > csize) { + bzero((char *)cdata + csize, rounded - csize); + csize = rounded; + } + if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); @@ -6089,19 +6523,11 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr) hdr->b_l1hdr.b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); - } - - rounded = P2ROUNDUP(csize, - (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); - if (rounded < len) { + } else if (csize > 0 && csize < len) { /* * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. */ - if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); - csize = rounded; - } HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); l2hdr->b_asize = csize; hdr->b_l1hdr.b_tmp_cdata = cdata; @@ -6189,8 +6615,26 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { + enum zio_compress comp = HDR_GET_COMPRESS(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { + ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); + + if (comp == ZIO_COMPRESS_OFF) { + /* + * In this case, b_tmp_cdata points to the same buffer + * as the arc_buf_t's b_data field. We don't want to + * free it, since the arc_buf_t will handle that. + */ + hdr->b_l1hdr.b_tmp_cdata = NULL; + } else if (comp == ZIO_COMPRESS_EMPTY) { + /* + * In this case, b_tmp_cdata was compressed to an empty + * buffer, thus there's nothing to free and b_tmp_cdata + * should have been set to NULL in l2arc_write_buffers(). + */ + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + } else { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. 
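The change to l2arc_compress_buf above rounds the compressed size up to SPA_MINBLOCKSIZE and zero-fills the tail, so the pad bytes written to the device are deterministic. A worked example of the arithmetic, using a hypothetical 3000-byte compression result:

	/*
	 * With SPA_MINBLOCKSIZE == 512, P2ROUNDUP(3000, 512) == 3072,
	 * so the final 72 bytes of the buffer are zeroed and csize
	 * grows to the padded size.
	 */
	size_t csize = 3000;
	size_t rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
	if (rounded > csize) {
		bzero((char *)cdata + csize, rounded - csize);
		csize = rounded;	/* now 3072 */
	}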
@@ -6199,9 +6643,8 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
 hdr->b_size);
 hdr->b_l1hdr.b_tmp_cdata = NULL;
- } else {
- ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
 }
+ }
 
 /*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
index 5f7d76f..b2b9887 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
@@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 int err;
 struct bptree_args *ba = arg;
 
- if (BP_IS_HOLE(bp))
+ if (bp == NULL || BP_IS_HOLE(bp))
 return (0);
 
 err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
new file mode 100644
index 0000000..1ddc697
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue. The maximum capacity of the queue is set to
+ * size. Types that want to be stored in a bqueue must contain a bqueue_node_t,
+ * and node_offset should give its offset from the start of the struct. Return
+ * 0 on success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ return (0);
+}
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ mutex_destroy(&q->bq_lock);
+ list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming item_size units of capacity. If there is
+ * insufficient capacity to consume item_size units, block until capacity
+ * exists. Asserts item_size is > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ cv_signal(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
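+ *
+ * A typical producer/consumer pairing is (sketch; the record type
+ * and its size are illustrative):
+ *
+ *	producer: bqueue_enqueue(&q, rec, rec_size);
+ *	consumer: rec = bqueue_dequeue(&q);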
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ mutex_exit(&q->bq_lock);
+ cv_signal(&q->bq_add_cv);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 79b6aed..16d8a2e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 return (abuf);
 }
 
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
 uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
- if (dn->dn_datablkshift) {
- return (offset >> dn->dn_datablkshift);
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n block is the number of
+ * block pointers in an indirect block, raised to the power of
+ * level. This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+ return (offset >> (dn->dn_datablkshift + level *
+ (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 } else {
 ASSERT3U(offset, <, dn->dn_datablksz);
 return (0);
@@ -1549,6 +1573,11 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
 struct dirty_leaf *dl;
 dmu_object_type_t type;
 
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
 DB_DNODE_ENTER(db);
 type = DB_DNODE(db)->dn_type;
 DB_DNODE_EXIT(db);
@@ -1715,6 +1744,12 @@ dbuf_clear(dmu_buf_impl_t *db)
 dbuf_rele(parent, db);
 }
 
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1755,7 +1790,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 } else if (level < nlevels-1) {
 /* this block is referenced from an indirect block */
 int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 if (err)
 return (err);
 err = dbuf_read(*parentp, NULL,
@@ -1930,11 +1965,96 @@ dbuf_destroy(dmu_buf_impl_t *db)
 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift.
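+ * (Computed as dn_indblkshift - SPA_BLKPTRSHIFT; see
+ * dbuf_prefetch() below.)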
*/ + int dpa_curlevel; /* The current level that we're reading */ + zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ + zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ + arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ +} dbuf_prefetch_arg_t; + +/* + * Actually issue the prefetch read for the block given. + */ +static void +dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return; + + arc_flags_t aflags = + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); + ASSERT(dpa->dpa_zio != NULL); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &dpa->dpa_zb); +} + +/* + * Called when an indirect block above our prefetch target is read in. This + * will either read in the next indirect block down the tree or issue the actual + * prefetch if the next block down is our target. + */ +static void +dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); + ASSERT3S(dpa->dpa_curlevel, >, 0); + if (zio != NULL) { + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + } + + dpa->dpa_curlevel--; + + uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); + blkptr_t *bp = ((blkptr_t *)abuf->b_data) + + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { + kmem_free(dpa, sizeof (*dpa)); + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); + dbuf_issue_final_prefetch(dpa, bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); + + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + (void) arc_buf_remove_ref(abuf, private); +} + +/* + * Issue prefetch reads for the given block on the given level. If the indirect + * blocks above that block are not in memory, we will read them in + * asynchronously. As a result, this call never blocks waiting for a read to + * complete. + */ void -dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) { - dmu_buf_impl_t *db = NULL; - blkptr_t *bp = NULL; + blkptr_t bp; + int epbs, nlevels, curlevel; + uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -1942,35 +2062,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (dnode_block_freed(dn, blkid)) return; - /* dbuf_find() returns with db_mtx held */ - if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) { + /* + * This dnode hasn't been written to disk yet, so there's nothing to + * prefetch. 
+ */ + nlevels = dn->dn_phys->dn_nlevels; + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) + return; + + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) + return; + + dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, + level, blkid); + if (db != NULL) { + mutex_exit(&db->db_mtx); /* - * This dbuf is already in the cache. We assume that - * it is already CACHED, or else about to be either - * read or filled. + * This dbuf already exists. It is either CACHED, or + * (we assume) about to be read or filled. */ - mutex_exit(&db->db_mtx); return; } - if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { - if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - arc_flags_t aflags = - ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - zbookmark_phys_t zb; + /* + * Find the closest ancestor (indirect block) of the target block + * that is present in the cache. In this indirect block, we will + * find the bp that is at curlevel, curblkid. + */ + curlevel = level; + curblkid = blkid; + while (curlevel < nlevels - 1) { + int parent_level = curlevel + 1; + uint64_t parent_blkid = curblkid >> epbs; + dmu_buf_impl_t *db; + + if (dbuf_hold_impl(dn, parent_level, parent_blkid, + FALSE, TRUE, FTAG, &db) == 0) { + blkptr_t *bpp = db->db_buf->b_data; + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; + dbuf_rele(db, FTAG); + break; + } - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, 0, blkid); + curlevel = parent_level; + curblkid = parent_blkid; + } - (void) arc_read(NULL, dn->dn_objset->os_spa, - bp, NULL, NULL, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - if (db) - dbuf_rele(db, NULL); + if (curlevel == nlevels - 1) { + /* No cached indirect blocks found. */ + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); + bp = dn->dn_phys->dn_blkptr[curblkid]; + } + if (BP_IS_HOLE(&bp)) + return; + + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); + + zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, + ZIO_FLAG_CANFAIL); + + dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, level, blkid); + dpa->dpa_curlevel = curlevel; + dpa->dpa_prio = prio; + dpa->dpa_aflags = aflags; + dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_epbs = epbs; + dpa->dpa_zio = pio; + + /* + * If we have the indirect just above us, no need to do the asynchronous + * prefetch chain; we'll just run the last step ourselves. If we're at + * a higher level, though, we want to issue the prefetches for all the + * indirect blocks asynchronously, so we can go on with whatever we were + * doing. + */ + if (curlevel == level) { + ASSERT3U(curblkid, ==, blkid); + dbuf_issue_final_prefetch(dpa, &bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, curlevel, curblkid); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + &bp, dbuf_prefetch_indirect_done, dpa, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); } + /* + * We use pio here instead of dpa_zio since it's possible that + * dpa may have already been freed. + */ + zio_nowait(pio); } /* @@ -1978,7 +2167,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) * Note: dn_struct_rwlock must be held. 
*/
 int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
 void *tag, dmu_buf_impl_t **dbp)
 {
 dmu_buf_impl_t *db, *parent = NULL;
@@ -1996,6 +2186,9 @@ top:
 blkptr_t *bp = NULL;
 int err;
 
+ if (fail_uncached)
+ return (SET_ERROR(ENOENT));
+
 ASSERT3P(parent, ==, NULL);
 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 if (fail_sparse) {
@@ -2012,6 +2205,11 @@ top:
 db = dbuf_create(dn, level, blkid, parent, bp);
 }
 
+ if (fail_uncached && db->db_state != DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
 arc_buf_add_ref(db->db_buf, db);
 if (db->db_buf->b_data == NULL) {
@@ -2067,16 +2265,14 @@ top:
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
+ return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 return (err ? NULL : db);
 }
 
@@ -2429,8 +2625,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 if (parent == NULL) {
 mutex_exit(&db->db_mtx);
 rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, db, &parent);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
 rw_exit(&dn->dn_struct_rwlock);
 mutex_enter(&db->db_mtx);
 db->db_parent = parent;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index f45071b..91cd511 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -141,7 +141,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
 err = dnode_hold(os, object, FTAG, &dn);
 if (err)
 return (err);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
 rw_enter(&dn->dn_struct_rwlock, RW_READER);
 db = dbuf_hold(dn, blkid, tag);
 rw_exit(&dn->dn_struct_rwlock);
@@ -424,7 +424,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
 for (i = 0; i < nblks; i++) {
 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 if (db == NULL) {
@@ -528,17 +528,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 }
 
 /*
- * Issue prefetch i/os for the given blocks.
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
 *
- * Note: The assumption is that we *know* these blocks will be needed
- * almost immediately. Therefore, the prefetch i/os will be issued at
- * ZIO_PRIORITY_SYNC_READ
- *
- * Note: indirect blocks and other metadata will be read synchronously,
- * causing this function to block if they are not already cached.
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
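+ *
+ * For example, a caller that wants the level-1 indirect blocks
+ * covering the first 16MB of an object could issue (values here
+ * are hypothetical):
+ *
+ *	dmu_prefetch(os, object, 1, 0, 16ULL << 20, ZIO_PRIORITY_SYNC_READ);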
*/
 void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
 {
 dnode_t *dn;
 uint64_t blkid;
@@ -554,8 +553,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 return;
 
 rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
 rw_exit(&dn->dn_struct_rwlock);
 return;
 }
@@ -570,18 +570,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 return;
 
 rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
- P2ALIGN(offset, 1 << blkshift)) >> blkshift;
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+ * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is
+ * the last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is the
+ * last - first + 1.
+ */
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
 } else {
 nblks = (offset < dn->dn_datablksz);
 }
 
 if (nblks != 0) {
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, level, offset);
 for (int i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
 }
 
 rw_exit(&dn->dn_struct_rwlock);
@@ -1393,7 +1399,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
 DB_DNODE_ENTER(dbuf);
 dn = DB_DNODE(dbuf);
 rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
 VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 rw_exit(&dn->dn_struct_rwlock);
 DB_DNODE_EXIT(dbuf);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
index bd9e894..e88968b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 if (issig(JUSTLOOKING) && issig(FORREAL))
 return (SET_ERROR(EINTR));
 
- if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
 return (0);
 
 if (BP_IS_HOLE(bp)) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 808864a..6ca021e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 return (0);
 }
 
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
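+ *
+ * A typical walk over all allocated objects is (sketch; visit() is
+ * a hypothetical per-object callback, and errors are not handled):
+ *
+ *	uint64_t obj = 0;
+ *	while (dmu_object_next(os, &obj, FALSE, 0) == 0)
+ *		visit(obj);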
+ */
 int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index be1f46d..267aa35 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -21,7 +21,7 @@
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
@@ -53,6 +53,7 @@
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
+#include <sys/bqueue.h>
 
 #ifdef __FreeBSD__
 #undef dump_write
@@ -61,10 +62,34 @@
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 static const char *recv_clone_name = "%recv";
 
+#define BP_SPAN(datablkszsec, indblkshift, level) \
+ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (indblkshift - SPA_BLKPTRSHIFT)))
+
+struct send_thread_arg {
+ bqueue_t q;
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+};
+
+struct send_block_record {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ blkptr_t bp;
+ zbookmark_phys_t zb;
+ uint8_t indblkshift;
+ uint16_t datablkszsec;
+ bqueue_node_t ln;
+};
+
 static int
 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
 {
@@ -455,58 +480,116 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
 return (B_FALSE);
 }
 
-#define BP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/*
+ * This is the callback function to traverse_dataset that acts as the worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct send_thread_arg *sta = arg;
+ struct send_block_record *record;
+ uint64_t record_size;
+ int err = 0;
 
-/* ARGSUSED */
+ if (sta->cancel)
+ return (SET_ERROR(EINTR));
+
+ if (bp == NULL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ }
+
+ record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+ record->bp = *bp;
+ record->zb = *zb;
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablkszsec = dnp->dn_datablkszsec;
+ record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ bqueue_enqueue(&sta->q, record, record_size);
+
+ return (err);
+}
+
+/*
+ * This function kicks off the traverse_dataset call. It also handles setting
+ * the error code of the thread in case something goes wrong, and pushes the
+ * End of Stream record when the traverse_dataset call has finished. If there
+ * is no dataset to traverse, the thread immediately pushes an End of Stream
+ * marker.
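+ *
+ * Either way the consumer always sees a terminated stream:
+ * dmu_send_impl keeps dequeuing records until it finds one with
+ * eos_marker set, even when the traversal itself failed.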
+ */ +static void +send_traverse_thread(void *arg) +{ + struct send_thread_arg *st_arg = arg; + int err; + struct send_block_record *data; + + if (st_arg->ds != NULL) { + err = traverse_dataset(st_arg->ds, st_arg->fromtxg, + st_arg->flags, send_cb, arg); + if (err != EINTR) + st_arg->error_code = err; + } + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + bqueue_enqueue(&st_arg->q, data, 1); + thread_exit(); +} + +/* + * This function actually handles figuring out what kind of record needs to be + * dumped, reading the data (which has hopefully been prefetched), and calling + * the appropriate helper function. + */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { - dmu_sendarg_t *dsp = arg; + dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); + const blkptr_t *bp = &data->bp; + const zbookmark_phys_t *zb = &data->zb; + uint8_t indblkshift = data->indblkshift; + uint16_t dblkszsec = data->datablkszsec; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (SET_ERROR(EINTR)); + ASSERT3U(zb->zb_level, >=, 0); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (zb->zb_level == ZB_ZIL_LEVEL) { - /* - * If we are sending a non-snapshot (which is allowed on - * read-only pools), it may have a ZIL, which must be ignored. - */ - return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); + err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); - err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); + uint64_t offset = zb->zb_blkid * span; + err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { - dnode_phys_t *blk; - int i; int blksz = BP_GET_LSIZE(bp); arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; + ASSERT0(zb->zb_level); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = (zb->zb_blkid << - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(dsp, dnobj, blk+i); + dnode_phys_t *blk = abuf->b_data; + uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); + for (int i = 0; i < blksz >> DNODE_SHIFT; i++) { + err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; } @@ -521,20 +604,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); - } else if (backup_do_embed(dsp, bp)) { + } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ - int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - err = dump_write_embedded(dsp, 
zb->zb_object, + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + ASSERT0(zb->zb_level); + err = dump_write_embedded(dsa, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); - } else { /* it's a level-0 block of a regular object */ + } else { + /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; - int blksz = BP_GET_LSIZE(bp); + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; - ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, @@ -555,20 +639,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, offset = zb->zb_blkid * blksz; - if (!(dsp->dsa_featureflags & + if (!(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && blksz > SPA_OLD_MAXBLOCKSIZE) { char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); - err = dump_write(dsp, type, zb->zb_object, + err = dump_write(dsa, type, zb->zb_object, offset, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { - err = dump_write(dsp, type, zb->zb_object, + err = dump_write(dsa, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } (void) arc_buf_remove_ref(abuf, &abuf); @@ -579,11 +663,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } /* - * Releases dp using the specified tag. + * Pop the new data off the queue, and free the old data. + */ +static struct send_block_record * +get_next_record(bqueue_t *bq, struct send_block_record *data) +{ + struct send_block_record *tmp = bqueue_dequeue(bq); + kmem_free(data, sizeof (*data)); + return (tmp); +} + +/* + * Actually do the bulk of the work in a zfs send. + * + * Note: Releases dp using the specified tag. 
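+ *
+ * The work is split across two threads: send_traverse_thread walks
+ * the dataset and enqueues a send_block_record for each block
+ * pointer on to_arg.q, while this thread dequeues the records and
+ * hands each one to do_dump() until the End of Stream record is
+ * reached.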
*/ static int -dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, - zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, + zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, #ifdef illumos boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) #else @@ -596,8 +693,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; + struct send_thread_arg to_arg; - err = dmu_objset_from_ds(ds, &os); + err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); @@ -623,35 +721,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, } #endif - if (large_block_ok && ds->ds_large_blocks) + if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; - } else { - embedok = B_FALSE; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = - dsl_dataset_phys(ds)->ds_creation_time; + dsl_dataset_phys(to_ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; - if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (fromzb != NULL) { - drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; - fromtxg = fromzb->zbm_creation_txg; + if (ancestor_zb != NULL) { + drr->drr_u.drr_begin.drr_fromguid = + ancestor_zb->zbm_guid; + fromtxg = ancestor_zb->zbm_creation_txg; } - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!ds->ds_is_snapshot) { + dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); + if (!to_ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -665,16 +762,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_fp = fp; dsp->dsa_os = os; dsp->dsa_off = off; - dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; + dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_incremental = (fromzb != NULL); + dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; - mutex_enter(&ds->ds_sendstream_lock); - list_insert_head(&ds->ds_sendstreams, dsp); - mutex_exit(&ds->ds_sendstream_lock); + mutex_enter(&to_ds->ds_sendstream_lock); + list_insert_head(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); - dsl_dataset_long_hold(ds, FTAG); + dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); if (dump_record(dsp, NULL, 0) != 0) { @@ -682,8 +779,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, goto out; } - err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, - backup_cb, dsp); + err = bqueue_init(&to_arg.q, zfs_send_queue_length, + offsetof(struct send_block_record, ln)); + to_arg.error_code = 0; + to_arg.cancel = B_FALSE; + to_arg.ds = to_ds; + to_arg.fromtxg = 
fromtxg; + to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; + (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, + TS_RUN, minclsyspri); + + struct send_block_record *to_data; + to_data = bqueue_dequeue(&to_arg.q); + + while (!to_data->eos_marker && err == 0) { + err = do_dump(dsp, to_data); + to_data = get_next_record(&to_arg.q, to_data); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = EINTR; + } + + if (err != 0) { + to_arg.cancel = B_TRUE; + while (!to_data->eos_marker) { + to_data = get_next_record(&to_arg.q, to_data); + } + } + kmem_free(to_data, sizeof (*to_data)); + + bqueue_destroy(&to_arg.q); + + if (err == 0 && to_arg.error_code != 0) + err = to_arg.error_code; + + if (err != 0) + goto out; if (dsp->dsa_pending_op != PENDING_NONE) if (dump_record(dsp, NULL, 0) != 0) @@ -700,20 +830,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - if (dump_record(dsp, NULL, 0) != 0) { + if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; - goto out; - } out: - mutex_enter(&ds->ds_sendstream_lock); - list_remove(&ds->ds_sendstreams, dsp); - mutex_exit(&ds->ds_sendstream_lock); + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); - dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_long_rele(to_ds, FTAG); return (err); } @@ -1144,7 +1272,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ - if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || + drba->drba_origin)) return (SET_ERROR(ENOENT)); /* Open the parent of tofs */ @@ -1250,13 +1379,6 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); - if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_LARGE_BLOCKS) && - !newds->ds_large_blocks) { - dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); - newds->ds_large_blocks = B_TRUE; - } - dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -1326,22 +1448,58 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } -struct restorearg { +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *write_buf; + int payload_size; + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { objset_t *os; - int err; boolean_t byteswap; + bqueue_t q; + /* + * These three args are used to signal to the main thread that we're + * done. + */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + int err; + /* A map from guid to dataset to help handle dedup'd streams. */ + avl_tree_t *guid_to_ds_map; +}; + +struct receive_arg { + objset_t *os; kthread_t *td; struct file *fp; - uint64_t voff; - int bufsize; /* amount of memory allocated for buf */ - - dmu_replay_record_t *drr; - dmu_replay_record_t *next_drr; - char *buf; + uint64_t voff; /* The current offset in the stream */ + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. 
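+ * (See receive_read_payload_and_next_header(), which reads this
+ * record's payload and the header of the one that follows it.)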
+ */ + struct receive_record_arg *rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *next_rrd; zio_cksum_t cksum; zio_cksum_t prev_cksum; + int err; + boolean_t byteswap; + /* Sorted list of objects not to issue prefetches for. */ + list_t ignore_obj_list; +}; - avl_tree_t *guid_to_ds_map; +struct receive_ign_obj_node { + list_node_t node; + uint64_t object; }; typedef struct guid_map_entry { @@ -1380,7 +1538,7 @@ free_guid_map_onexit(void *arg) } static int -restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) +restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) { struct uio auio; struct iovec aiov; @@ -1406,13 +1564,12 @@ restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *res } static int -restore_read(struct restorearg *ra, int len, void *buf) +receive_read(struct receive_arg *ra, int len, void *buf) { int done = 0; /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); - ASSERT3U(len, <=, ra->bufsize); while (done < len) { ssize_t resid; @@ -1529,7 +1686,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } static int -restore_object(struct restorearg *ra, struct drr_object *drro, void *data) +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) { dmu_object_info_t doi; dmu_tx_t *tx; @@ -1543,12 +1701,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } - err = dmu_object_info(ra->os, drro->drr_object, &doi); + err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); @@ -1567,14 +1725,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { - err = dmu_free_long_range(ra->os, drro->drr_object, + err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { @@ -1584,7 +1742,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ - err = dmu_object_claim(ra->os, drro->drr_object, + err = dmu_object_claim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } else if (drro->drr_type != doi.doi_type || @@ -1592,7 +1750,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ - err = dmu_object_reclaim(ra->os, drro->drr_object, + err = dmu_object_reclaim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } @@ -1601,20 +1759,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) return (SET_ERROR(EINVAL)); } - dmu_object_set_checksum(ra->os, drro->drr_object, + 
dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); - dmu_object_set_compress(ra->os, drro->drr_object, + dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; - VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); - if (ra->byteswap) { + if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, @@ -1628,7 +1786,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data) /* ARGSUSED */ static int -restore_freeobjects(struct restorearg *ra, +receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; @@ -1638,13 +1796,13 @@ restore_freeobjects(struct restorearg *ra, for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { + (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { int err; - if (dmu_object_info(ra->os, obj, NULL) != 0) + if (dmu_object_info(rwa->os, obj, NULL) != 0) continue; - err = dmu_free_long_object(ra->os, obj); + err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); } @@ -1652,7 +1810,8 @@ restore_freeobjects(struct restorearg *ra, } static int -restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, + arc_buf_t *abuf) { dmu_tx_t *tx; int err; @@ -1661,10 +1820,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); @@ -1673,7 +1832,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) dmu_tx_abort(tx); return (err); } - if (ra->byteswap) { + if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, @@ -1681,7 +1840,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) } dmu_buf_t *bonus; - if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) + if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); @@ -1697,7 +1856,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) * data from the stream to fulfill this write. 
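+ * The reference is resolved through rwa->guid_to_ds_map: drr_refguid
+ * identifies the dataset holding the previously-written copy, and the
+ * block is read from that objset instead of from the stream.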
*/ static int -restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) +receive_write_byref(struct receive_writer_arg *rwa, + struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; @@ -1716,14 +1876,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, + if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { - ref_os = ra->os; + ref_os = rwa->os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, @@ -1731,7 +1891,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) if (err != 0) return (err); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); @@ -1740,7 +1900,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) dmu_tx_abort(tx); return (err); } - dmu_write(ra->os, drrwbr->drr_object, + dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); @@ -1748,7 +1908,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) } static int -restore_write_embedded(struct restorearg *ra, +receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwnp, void *data) { dmu_tx_t *tx; @@ -1765,7 +1925,7 @@ restore_write_embedded(struct restorearg *ra, if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwnp->drr_object, drrwnp->drr_offset, drrwnp->drr_length); @@ -1775,36 +1935,37 @@ restore_write_embedded(struct restorearg *ra, return (err); } - dmu_write_embedded(ra->os, drrwnp->drr_object, + dmu_write_embedded(rwa->os, drrwnp->drr_object, drrwnp->drr_offset, data, drrwnp->drr_etype, drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, - ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); dmu_tx_commit(tx); return (0); } static int -restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + void *data) { dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || - drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } - tx = dmu_tx_create(ra->os); + tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -1831,7 +1992,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data) /* ARGSUSED */ static int -restore_free(struct restorearg *ra, struct drr_free *drrf) +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; @@ -1839,11 +2000,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf) drrf->drr_offset + 
drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); - if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); - err = dmu_free_long_range(ra->os, drrf->drr_object, + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); + return (err); } @@ -1858,7 +2020,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) } static void -restore_cksum(struct restorearg *ra, int len, void *buf) +receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { fletcher_4_incremental_byteswap(buf, len, &ra->cksum); @@ -1868,30 +2030,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf) } /* - * If len != 0, read payload into buf. - * Read next record's header into ra->next_drr. + * Read the payload into a buffer of size len, and update the current record's + * payload field. + * Allocate ra->next_rrd and read the next record's header into + * ra->next_rrd->header. * Verify checksum of payload and next record. */ static int -restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) +receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) { int err; if (len != 0) { - ASSERT3U(len, <=, ra->bufsize); - err = restore_read(ra, len, buf); + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + err = receive_read(ra, len, ra->rrd->payload); if (err != 0) return (err); - restore_cksum(ra, len, buf); + receive_cksum(ra, len, ra->rrd->payload); } ra->prev_cksum = ra->cksum; - err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); - if (err != 0) + ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); + err = receive_read(ra, sizeof (ra->next_rrd->header), + &ra->next_rrd->header); + if (err != 0) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (err); - if (ra->next_drr->drr_type == DRR_BEGIN) + } + if (ra->next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (SET_ERROR(EINVAL)); + } /* * Note: checksum is of everything up to but not including the @@ -1899,107 +2073,248 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - restore_cksum(ra, + receive_cksum(ra, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), - ra->next_drr); + &ra->next_rrd->header); - zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; - zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; + zio_cksum_t cksum_orig = + ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t *cksump = + &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; if (ra->byteswap) - byteswap_record(ra->next_drr); + byteswap_record(&ra->next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && - !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; return (SET_ERROR(ECKSUM)); + } - restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); + receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); return (0); } +/* + * Issue the prefetch reads for any necessary indirect blocks. + * + * We use the object ignore list to tell us whether or not to issue prefetches + * for a given object. 
We do this for both correctness (in case the blocksize + * of an object has changed) and performance (if the object doesn't exist, don't + * needlessly try to issue prefetches). We also trim the list as we go through + * the stream to prevent it from growing to an unbounded size. + * + * The object numbers within will always be in sorted order, and any write + * records we see will also be in sorted order, but they're not sorted with + * respect to each other (i.e. we can get several object records before + * receiving each object's write records). As a result, once we've reached a + * given object number, we can safely remove any reference to lower object + * numbers in the ignore list. In practice, we receive up to 32 object records + * before receiving write records, so the list can have up to 32 nodes in it. + */ +/* ARGSUSED */ +static void +receive_read_prefetch(struct receive_arg *ra, + uint64_t object, uint64_t offset, uint64_t length) +{ + struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); + while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list)); + kmem_free(node, sizeof (*node)); + node = list_head(&ra->ignore_obj_list); + } + if (node == NULL || node->object > object) { + dmu_prefetch(ra->os, object, 1, offset, length, + ZIO_PRIORITY_SYNC_READ); + } +} + +/* + * Read records off the stream, issuing any necessary prefetches. + */ static int -restore_process_record(struct restorearg *ra) +receive_read_record(struct receive_arg *ra) { int err; - switch (ra->drr->drr_type) { + switch (ra->rrd->header.drr_type) { case DRR_OBJECT: { - struct drr_object *drro = &ra->drr->drr_u.drr_object; - err = restore_read_payload_and_next_header(ra, - P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); - if (err != 0) + struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; + uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + dmu_object_info_t doi; + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); return (err); - return (restore_object(ra, drro, ra->buf)); + } + err = dmu_object_info(ra->os, drro->drr_object, &doi); + /* + * See receive_read_prefetch for an explanation why we're + * storing this object in the ignore_obj_list. + */ + if (err == ENOENT || + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { + struct receive_ign_obj_node *node = + kmem_zalloc(sizeof (*node), + KM_SLEEP); + node->object = drro->drr_object; +#ifdef ZFS_DEBUG + struct receive_ign_obj_node *last_object = + list_tail(&ra->ignore_obj_list); + uint64_t last_objnum = (last_object != NULL ? 
+ last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); +#endif + list_insert_tail(&ra->ignore_obj_list, node); + err = 0; + } + return (err); } case DRR_FREEOBJECTS: { - struct drr_freeobjects *drrfo = - &ra->drr->drr_u.drr_freeobjects; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_freeobjects(ra, drrfo)); + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); } case DRR_WRITE: { - struct drr_write *drrw = &ra->drr->drr_u.drr_write; + struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), drrw->drr_length); - err = restore_read_payload_and_next_header(ra, + err = receive_read_payload_and_next_header(ra, drrw->drr_length, abuf->b_data); - if (err != 0) - return (err); - err = restore_write(ra, drrw, abuf); - /* if restore_write() is successful, it consumes the arc_buf */ - if (err != 0) + if (err != 0) { dmu_return_arcbuf(abuf); + return (err); + } + ra->rrd->write_buf = abuf; + receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, + drrw->drr_length); return (err); } case DRR_WRITE_BYREF: { - struct drr_write_byref *drrwbr = - &ra->drr->drr_u.drr_write_byref; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_write_byref(ra, drrwbr)); + struct drr_write_byref *drrwb = + &ra->rrd->header.drr_u.drr_write_byref; + err = receive_read_payload_and_next_header(ra, 0, NULL); + receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, + drrwb->drr_length); + return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = - &ra->drr->drr_u.drr_write_embedded; - err = restore_read_payload_and_next_header(ra, - P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); - if (err != 0) + &ra->rrd->header.drr_u.drr_write_embedded; + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); return (err); - return (restore_write_embedded(ra, drrwe, ra->buf)); + } + + receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, + drrwe->drr_length); + return (err); } case DRR_FREE: { - struct drr_free *drrf = &ra->drr->drr_u.drr_free; - err = restore_read_payload_and_next_header(ra, 0, NULL); - if (err != 0) - return (err); - return (restore_free(ra, drrf)); + /* + * It might be beneficial to prefetch indirect blocks here, but + * we don't really have the data to decide for sure. + */ + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); } case DRR_END: { - struct drr_end *drre = &ra->drr->drr_u.drr_end; + struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) return (SET_ERROR(EINVAL)); return (0); } case DRR_SPILL: { - struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; - err = restore_read_payload_and_next_header(ra, - drrs->drr_length, ra->buf); + struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; + void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, drrs->drr_length, + buf); if (err != 0) - return (err); - return (restore_spill(ra, drrs, ra->buf)); + kmem_free(buf, drrs->drr_length); + return (err); + } + default: + return (SET_ERROR(EINVAL)); + } +} + +/* + * Commit the records to the pool. 
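+ * Each record type dispatches to its receive_* handler. Payload
+ * buffers allocated by the reader thread are freed here once the
+ * handler returns; DRR_WRITE records instead hand their loaned
+ * arc_buf to receive_write(), which consumes it on success.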
+ */ +static int +receive_process_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err; + + switch (rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &rrd->header.drr_u.drr_object; + err = receive_object(rwa, drro, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects *drrfo = + &rrd->header.drr_u.drr_freeobjects; + return (receive_freeobjects(rwa, drrfo)); + } + case DRR_WRITE: + { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + err = receive_write(rwa, drrw, rrd->write_buf); + /* if receive_write() is successful, it consumes the arc_buf */ + if (err != 0) + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwbr = + &rrd->header.drr_u.drr_write_byref; + return (receive_write_byref(rwa, drrwbr)); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &rrd->header.drr_u.drr_write_embedded; + err = receive_write_embedded(rwa, drrwe, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + case DRR_FREE: + { + struct drr_free *drrf = &rrd->header.drr_u.drr_free; + return (receive_free(rwa, drrf)); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; + err = receive_spill(rwa, drrs, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + default: + return (SET_ERROR(EINVAL)); @@ -2007,6 +2322,51 @@ restore_process_record(struct restorearg *ra) } /* + * dmu_recv_stream's worker thread; pull records off the queue, and then call + * receive_process_record. When we're done, signal the main thread and exit. + */ +static void +receive_writer_thread(void *arg) +{ + struct receive_writer_arg *rwa = arg; + struct receive_record_arg *rrd; + for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; + rrd = bqueue_dequeue(&rwa->q)) { + /* + * If there's an error, the main thread will stop putting things + * on the queue, but we need to clear everything in it before we + * can exit. + */ + if (rwa->err == 0) { + rwa->err = receive_process_record(rwa, rrd); + } else if (rrd->write_buf != NULL) { + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + kmem_free(rrd, sizeof (*rrd)); + } + kmem_free(rrd, sizeof (*rrd)); + mutex_enter(&rwa->mutex); + rwa->done = B_TRUE; + cv_signal(&rwa->cv); + mutex_exit(&rwa->mutex); + thread_exit(); +} + +/* + * Read in the stream's records, one by one, and apply them to the pool. There + * are two threads involved; the thread that calls this function will spin up a + * worker thread, read the records off the stream one by one, and issue + * prefetches for any necessary indirect blocks. It will then push the records + * onto an internal blocking queue. The worker thread will pull the records off + * the queue, and actually write the data into the DMU. This way, the worker + * thread doesn't have to wait for reads to complete, since everything it needs + * (the indirect blocks) will be prefetched. + * * NB: callers *must* call dmu_recv_end() if this succeeds.
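+ *
+ * A minimal sketch of the handoff (illustrative only; the real loops --
+ * receive_writer_thread() above and the read loop below -- also handle
+ * errors, signals, and the end-of-stream marker):
+ *
+ *	reader:	receive_read_record(&ra);	(also issues prefetches)
+ *		bqueue_enqueue(&rwa.q, ra.rrd, record + payload size);
+ *	writer:	rrd = bqueue_dequeue(&rwa->q);
+ *		rwa->err = receive_process_record(rwa, rrd);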
*/ int @@ -2014,7 +2374,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { int err = 0; - struct restorearg ra = { 0 }; + struct receive_arg ra = { 0 }; + struct receive_writer_arg rwa = { 0 }; int featureflags; ra.byteswap = drc->drc_byteswap; @@ -2022,10 +2383,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, ra.td = curthread; ra.fp = fp; ra.voff = *voffp; - ra.bufsize = SPA_MAXBLOCKSIZE; - ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP); - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); - ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP); + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), + offsetof(struct receive_ign_obj_node, node)); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, @@ -2056,48 +2415,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, } if (*action_handlep == 0) { - ra.guid_to_ds_map = + rwa.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(ra.guid_to_ds_map, guid_compare, + avl_create(rwa.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); err = zfs_onexit_add_cb(minor, - free_guid_map_onexit, ra.guid_to_ds_map, + free_guid_map_onexit, rwa.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { err = zfs_onexit_cb_data(minor, *action_handlep, - (void **)&ra.guid_to_ds_map); + (void **)&rwa.guid_to_ds_map); if (ra.err != 0) goto out; } - drc->drc_guid_to_ds_map = ra.guid_to_ds_map; + drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } - err = restore_read_payload_and_next_header(&ra, 0, NULL); - if (err != 0) + err = receive_read_payload_and_next_header(&ra, 0, NULL); + if (err) goto out; - for (;;) { - void *tmp; + (void) bqueue_init(&rwa.q, zfs_recv_queue_length, + offsetof(struct receive_record_arg, node)); + cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); + mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); + rwa.os = ra.os; + rwa.byteswap = drc->drc_byteswap; + + (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, + TS_RUN, minclsyspri); + /* + * We're reading rwa.err without locks, which is safe since we are the + * only reader, and the worker thread is the only writer. It's ok if we + * miss a write for an iteration or two of the loop, since the writer + * thread will keep freeing records we send it until we send it an eos + * marker. + * + * We can leave this loop in 3 ways: First, if rwa.err is + * non-zero. In that case, the writer thread will free the rrd we just + * pushed. Second, if we're interrupted; in that case, either it's the + * first loop and ra.rrd was never allocated, or it's later, and ra.rrd + * has been handed off to the writer thread who will free it. Finally, + * if receive_read_record fails or we're at the end of the stream, then + * we free ra.rrd and exit. 
+ */ + while (rwa.err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } - tmp = ra.next_drr; - ra.next_drr = ra.drr; - ra.drr = tmp; + ASSERT3P(ra.rrd, ==, NULL); + ra.rrd = ra.next_rrd; + ra.next_rrd = NULL; + /* Allocates and loads header into ra.next_rrd */ + err = receive_read_record(&ra); - /* process ra.drr, read in ra.next_drr */ - err = restore_process_record(&ra); - if (err != 0) - break; - if (ra.drr->drr_type == DRR_END) + if (ra.rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(ra.rrd, sizeof (*ra.rrd)); + ra.rrd = NULL; break; + } + + bqueue_enqueue(&rwa.q, ra.rrd, + sizeof (struct receive_record_arg) + ra.rrd->payload_size); + ra.rrd = NULL; + } + if (ra.next_rrd == NULL) + ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); + ra.next_rrd->eos_marker = B_TRUE; + bqueue_enqueue(&rwa.q, ra.next_rrd, 1); + + mutex_enter(&rwa.mutex); + while (!rwa.done) { + cv_wait(&rwa.cv, &rwa.mutex); } + mutex_exit(&rwa.mutex); + + cv_destroy(&rwa.cv); + mutex_destroy(&rwa.mutex); + bqueue_destroy(&rwa.q); + if (err == 0) + err = rwa.err; out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) @@ -2111,10 +2514,13 @@ out: dmu_recv_cleanup_ds(drc); } - kmem_free(ra.drr, sizeof (*ra.drr)); - kmem_free(ra.buf, ra.bufsize); - kmem_free(ra.next_drr, sizeof (*ra.next_drr)); *voffp = ra.voff; + for (struct receive_ign_obj_node *n = + list_remove_head(&ra.ignore_obj_list); n != NULL; + n = list_remove_head(&ra.ignore_obj_list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&ra.ignore_obj_list); return (err); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index e246c49..151d04c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -158,7 +158,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * If we already visited this bp & everything below, * don't bother doing it again. 
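 * (zbookmark_subtree_completed() below is the renamed
 * zbookmark_is_before(); the new name describes what is actually
 * checked: whether the subtree under this bp is already covered by
 * the resume point.)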
*/ - if (zbookmark_is_before(dnp, zb, td->td_resume)) + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); /* @@ -425,6 +425,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (td->td_flags & TRAVERSE_PRE) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); @@ -432,10 +443,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, break; } - if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); } + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } return (err); } @@ -448,6 +470,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); + if (bp == NULL) + return (0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index dff9fab..65a017f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, start, + FALSE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err) { @@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, + FALSE, FALSE, FTAG, &dbuf); if (err) { txh->txh_tx->tx_err = err; break; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index 77100ef..65ce914 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -305,7 +305,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); for (i = 0; i < fetchsz; i++) { - dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); + dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ, + ARC_FLAG_PREFETCH); } return (fetchsz); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 5b953fc..0fdcde4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -1116,7 +1116,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, drop_struct_lock = TRUE; } - blk = dbuf_whichblock(mdn, object 
* sizeof (dnode_phys_t)); + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) @@ -1413,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) goto fail; /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) @@ -1586,8 +1586,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ @@ -1624,8 +1624,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (tail) { if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { @@ -1854,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) + int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; @@ -1876,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { - uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 0633604..0787885 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; @@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, - i, B_TRUE, FTAG, &subdb)); + i, TRUE, FALSE, FTAG, &subdb)); rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); @@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, - TRUE, FTAG, &db)); + TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); free_children(db, blkid, nblks, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 551e35b..f4fdaf9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org> - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -130,8 +130,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; dsl_dataset_phys(ds)->ds_unique_bytes += used; - if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) - ds->ds_need_large_blocks = B_TRUE; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { + ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] = + B_TRUE; + } mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); @@ -433,19 +435,23 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, offsetof(dmu_sendarg_t, dsa_link)); if (doi.doi_type == DMU_OTN_ZAP_METADATA) { - int zaperr = zap_contains(mos, dsobj, - DS_FIELD_LARGE_BLOCKS); - if (zaperr != ENOENT) { - VERIFY0(zaperr); - ds->ds_large_blocks = B_TRUE; + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) + continue; + err = zap_contains(mos, dsobj, + spa_feature_table[f].fi_guid); + if (err == 0) { + ds->ds_feature_inuse[f] = B_TRUE; + } else { + ASSERT3U(err, ==, ENOENT); + err = 0; + } } } - if (err == 0) { - err = dsl_dir_hold_obj(dp, - dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, - &ds->ds_dir); - } + err = dsl_dir_hold_obj(dp, + dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); if (err != 0) { mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); @@ -540,6 +546,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, const char *snapname; uint64_t obj; int err = 0; + dsl_dataset_t *ds; err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); if (err != 0) @@ -548,36 +555,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, ASSERT(dsl_pool_config_held(dp)); obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) - err = dsl_dataset_hold_obj(dp, obj, tag, dsp); + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); else err = SET_ERROR(ENOENT); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { - dsl_dataset_t *ds; + dsl_dataset_t *snap_ds; if (*snapname++ != '@') { - dsl_dataset_rele(*dsp, tag); + dsl_dataset_rele(ds, tag); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); - err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + err = dsl_dataset_snap_lookup(ds, snapname, &obj); if (err == 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - dsl_dataset_rele(*dsp, tag); + err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); + dsl_dataset_rele(ds, tag); if (err == 0) { - mutex_enter(&ds->ds_lock); - if (ds->ds_snapname[0] == 0) - (void) strlcpy(ds->ds_snapname, snapname, - sizeof (ds->ds_snapname)); - mutex_exit(&ds->ds_lock); - *dsp = ds; + mutex_enter(&snap_ds->ds_lock); + if (snap_ds->ds_snapname[0] == 0) + (void) strlcpy(snap_ds->ds_snapname, snapname, + sizeof (snap_ds->ds_snapname)); + mutex_exit(&snap_ds->ds_lock); + ds = snap_ds; } } - + if (err == 0) + *dsp = ds; dsl_dir_rele(dd, FTAG); 
return (err); } @@ -699,6 +707,34 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) return (gotit); } +static void +dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t zero = 0; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + spa_feature_incr(spa, f, tx); + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (zero), 1, &zero, tx)); +} + +void +dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); + spa_feature_decr(spa, f, tx); +} + uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx) @@ -759,8 +795,10 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); - if (origin->ds_large_blocks) - dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (origin->ds_feature_inuse[f]) + dsl_dataset_activate_feature(dsobj, f, tx); + } dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_dataset_phys(origin)->ds_num_children++; @@ -1322,8 +1360,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; dmu_buf_rele(dbuf, FTAG); - if (ds->ds_large_blocks) - dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) + dsl_dataset_activate_feature(dsobj, f, tx); + } ASSERT3U(ds->ds_prev != 0, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); @@ -1615,9 +1655,13 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_objset_sync(ds->ds_objset, zio, tx); - if (ds->ds_need_large_blocks && !ds->ds_large_blocks) { - dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx); - ds->ds_large_blocks = B_TRUE; + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_activation_needed[f]) { + if (ds->ds_feature_inuse[f]) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, tx); + ds->ds_feature_inuse[f] = B_TRUE; + } } } @@ -2781,6 +2825,40 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); + /* + * Swap per-dataset feature flags. 
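+ *
+ * Each feature is deactivated on both datasets before being
+ * re-activated on the other, so a feature's ZAP entry is always
+ * removed before its replacement is added and the pool-wide feature
+ * refcounts stay balanced.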
+ */ + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + ASSERT(!clone->ds_feature_inuse[f]); + ASSERT(!origin_head->ds_feature_inuse[f]); + continue; + } + + boolean_t clone_inuse = clone->ds_feature_inuse[f]; + boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f]; + + if (clone_inuse) { + dsl_dataset_deactivate_feature(clone->ds_object, f, tx); + clone->ds_feature_inuse[f] = B_FALSE; + } + if (origin_head_inuse) { + dsl_dataset_deactivate_feature(origin_head->ds_object, + f, tx); + origin_head->ds_feature_inuse[f] = B_FALSE; + } + if (clone_inuse) { + dsl_dataset_activate_feature(origin_head->ds_object, + f, tx); + origin_head->ds_feature_inuse[f] = B_TRUE; + } + if (origin_head_inuse) { + dsl_dataset_activate_feature(clone->ds_object, f, tx); + clone->ds_feature_inuse[f] = B_TRUE; + } + } + dmu_buf_will_dirty(clone->ds_dbuf, tx); dmu_buf_will_dirty(origin_head->ds_dbuf, tx); @@ -3335,77 +3413,6 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, return (err); } -static int -dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx) -{ - const char *dsname = arg; - dsl_dataset_t *ds; - dsl_pool_t *dp = dmu_tx_pool(tx); - int error = 0; - - if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) - return (SET_ERROR(ENOTSUP)); - - ASSERT(spa_feature_is_enabled(dp->dp_spa, - SPA_FEATURE_EXTENSIBLE_DATASET)); - - error = dsl_dataset_hold(dp, dsname, FTAG, &ds); - if (error != 0) - return (error); - - if (ds->ds_large_blocks) - error = EALREADY; - dsl_dataset_rele(ds, FTAG); - - return (error); -} - -void -dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx) -{ - spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; - uint64_t zero = 0; - - spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx); - dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); - - VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS, - sizeof (zero), 1, &zero, tx)); -} - -static void -dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx) -{ - const char *dsname = arg; - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds)); - - dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx); - ASSERT(!ds->ds_large_blocks); - ds->ds_large_blocks = B_TRUE; - dsl_dataset_rele(ds, FTAG); -} - -int -dsl_dataset_activate_large_blocks(const char *dsname) -{ - int error; - - error = dsl_sync_task(dsname, - dsl_dataset_activate_large_blocks_check, - dsl_dataset_activate_large_blocks_sync, (void *)dsname, - 1, ZFS_SPACE_CHECK_RESERVED); - - /* - * EALREADY indicates that this dataset already supports large blocks. - */ - if (error == EALREADY) - error = 0; - return (error); -} - /* * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. * For example, they could both be snapshots of the same filesystem, and @@ -3450,7 +3457,6 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, return (ret); } - void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index 7f90469..c7a623c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. 
All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 by Joyent, Inc. All rights reserved. */ @@ -267,9 +267,11 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) obj = ds->ds_object; - if (ds->ds_large_blocks) { - ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS)); - spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) { + dsl_dataset_deactivate_feature(obj, f, tx); + ds->ds_feature_inuse[f] = B_FALSE; + } } if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { ASSERT3P(ds->ds_prev, ==, NULL); @@ -552,7 +554,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { @@ -736,12 +738,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT0(ds->ds_reserved); } - if (ds->ds_large_blocks) - spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx); + obj = ds->ds_object; - dsl_scan_ds_destroyed(ds, tx); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) { + dsl_dataset_deactivate_feature(obj, f, tx); + ds->ds_feature_inuse[f] = B_FALSE; + } + } - obj = ds->ds_object; + dsl_scan_ds_destroyed(ds, tx); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { /* This is a clone */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index d58886b..189ca19 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -415,7 +415,14 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); - arc_flush(dp->dp_spa); + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + txg_fini(dp); dsl_scan_fini(dp); dmu_buf_user_evict_wait(); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index d08b5fb..406af3b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -600,7 +600,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ - if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) + if (zbookmark_subtree_completed(dnp, zb, + &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c new file mode 100644 index 0000000..8296057 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c @@ -0,0 +1,366 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/multilist.h> + +/* needed for spa_get_random() */ +#include <sys/spa.h> + +/* + * Given the object contained on the list, return a pointer to the + * multilist_node_t structure it contains. + */ +static multilist_node_t * +multilist_d2l(multilist_t *ml, void *obj) +{ + return ((multilist_node_t *)((char *)obj + ml->ml_offset)); +} + +/* + * Initialize a new multilist using the parameters specified. + * + * - 'size' denotes the size of the structure containing the + * multilist_node_t. + * - 'offset' denotes the byte offset of the multilist_node_t within + * the structure that contains it. + * - 'num' specifies the number of internal sublists to create. + * - 'index_func' is used to determine which sublist to insert into + * when the multilist_insert() function is called, as well as which + * sublist to remove from when multilist_remove() is called. The + * requirements this function must meet are the following: + * + * - It must always return the same value when called on the same + * object (to ensure the object is removed from the list it was + * inserted into). + * + * - It must return a value in the range [0, number of sublists). + * The multilist_get_num_sublists() function may be used to + * determine the number of sublists in the multilist. + * + * Also, in order to reduce internal contention between the sublists + * during insertion and removal, this function should choose evenly + * between all available sublists when inserting. This isn't a hard + * requirement, but a general rule of thumb in order to garner the + * best multi-threaded performance out of the data structure. + */ +void +multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num, + multilist_sublist_index_func_t *index_func) +{ + ASSERT3P(ml, !=, NULL); + ASSERT3U(size, >, 0); + ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); + ASSERT3U(num, >, 0); + ASSERT3P(index_func, !=, NULL); + + ml->ml_offset = offset; + ml->ml_num_sublists = num; + ml->ml_index_func = index_func; + + ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_num_sublists, KM_SLEEP); + + ASSERT3P(ml->ml_sublists, !=, NULL); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&mls->mls_list, size, offset); + } +} + +/* + * Destroy the given multilist object, and free up any memory it holds. + */ +void +multilist_destroy(multilist_t *ml) +{ + ASSERT(multilist_is_empty(ml)); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + + ASSERT(list_is_empty(&mls->mls_list)); + + list_destroy(&mls->mls_list); + mutex_destroy(&mls->mls_lock); + } + + ASSERT3P(ml->ml_sublists, !=, NULL); + kmem_free(ml->ml_sublists, + sizeof (multilist_sublist_t) * ml->ml_num_sublists); + + ml->ml_num_sublists = 0; + ml->ml_offset = 0; +} + +/* + * Insert the given object into the multilist. + * + * This function will insert the object specified into the sublist + * determined using the function given at multilist creation time.
+ * + * The sublist locks are automatically acquired if not already held, to + * ensure consistency when inserting and removing from multiple threads. + */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. + * + * The necessary sublist locks are automatically acquired, to ensure + * consistency when inserting and removing from multiple threads. + */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. This would be due to another thread + * inserting into a given sublist after that specific sublist was checked + * and deemed empty, but before all sublists have been checked. + */ +int +multilist_is_empty(multilist_t *ml) +{ + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + /* See comment in multilist_insert().
*/ + boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + if (!list_is_empty(&mls->mls_list)) { + if (need_lock) + mutex_exit(&mls->mls_lock); + + return (FALSE); + } + + if (need_lock) + mutex_exit(&mls->mls_lock); + } + + return (TRUE); +} + +/* Return the number of sublists composing this multilist */ +unsigned int +multilist_get_num_sublists(multilist_t *ml) +{ + return (ml->ml_num_sublists); +} + +/* Return a randomly selected, valid sublist index for this multilist */ +unsigned int +multilist_get_random_index(multilist_t *ml) +{ + return (spa_get_random(ml->ml_num_sublists)); +} + +/* Lock and return the sublist specified at the given index */ +multilist_sublist_t * +multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + mutex_enter(&mls->mls_lock); + + return (mls); +} + +void +multilist_sublist_unlock(multilist_sublist_t *mls) +{ + mutex_exit(&mls->mls_lock); +} + +/* + * We're allowing any object to be inserted into this specific sublist, + * but this can lead to trouble if multilist_remove() is called to + * remove this object. Specifically, if calling ml_index_func on this + * object returns an index for sublist different than what is passed as + * a parameter here, any call to multilist_remove() with this newly + * inserted object is undefined! (the call to multilist_remove() will + * remove the object from a list that it isn't contained in) + */ +void +multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_head(&mls->mls_list, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_tail(&mls->mls_list, obj); +} + +/* + * Move the object one element forward in the list. + * + * This function will move the given object forward in the list (towards + * the head) by one object. So, in essence, it will swap its position in + * the list with its "prev" pointer. If the given object is already at the + * head of the list, it cannot be moved forward any more than it already + * is, so no action is taken. + * + * NOTE: This function **must not** remove any object from the list other + * than the object given as the parameter. This is relied upon in + * arc_evict_state_impl(). 
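+ *
+ * For example, in the sublist (head) A <-> B <-> C (tail), moving C
+ * forward yields (head) A <-> C <-> B (tail).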
+ */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 40efaba..a5389c3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -1943,7 +1943,7 @@ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index aeac124..1ea829f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -80,8 +80,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) mutex_exit(sm->sm_lock); if (end > bufsize) { - dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize, - end - bufsize); + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, + end - bufsize, ZIO_PRIORITY_SYNC_READ); } mutex_enter(sm->sm_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index 4d13cb1..a26d8f8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -37,6 +37,12 @@ extern "C" { #include <sys/dmu.h> #include <sys/spa.h> +/* + * Used by arc_flush() to inform arc_evict_state() that it should evict + * all available buffers from the arc state being passed in. 
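+ * (Presumably this stands in for an ordinary byte count; -1ULL exceeds
+ * any real request, so the whole state is drained.)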
+ */ +#define ARC_EVICT_ALL -1ULL + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); @@ -154,7 +160,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv); boolean_t arc_clear_callback(arc_buf_t *buf); -void arc_flush(spa_t *spa); +void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h new file mode 100644 index 0000000..63722df --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _BQUEUE_H +#define _BQUEUE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> + +typedef struct bqueue { + list_t bq_list; + kmutex_t bq_lock; + kcondvar_t bq_add_cv; + kcondvar_t bq_pop_cv; + uint64_t bq_size; + uint64_t bq_maxsize; + size_t bq_node_offset; +} bqueue_t; + +typedef struct bqueue_node { + list_node_t bqn_node; + uint64_t bqn_size; +} bqueue_node_t; + + +int bqueue_init(bqueue_t *, uint64_t, size_t); +void bqueue_destroy(bqueue_t *); +void bqueue_enqueue(bqueue_t *, void *, uint64_t); +void *bqueue_dequeue(bqueue_t *); +boolean_t bqueue_empty(bqueue_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _BQUEUE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 2e07185..482ccb0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -245,8 +245,7 @@ typedef struct dbuf_hash_table { kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; - -uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); +uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); void dbuf_create_bonus(struct dnode *dn); @@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); -int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp); -void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); +void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 3c5cfbe..f6c72b0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -45,6 +45,7 @@ #include <sys/zfs_context.h> #include <sys/cred.h> #include <sys/fs/zfs.h> +#include <sys/zio_priority.h> #ifdef __cplusplus extern "C" { @@ -748,8 +749,8 @@ extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ -void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, - uint64_t len); +void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 7d490ec..001bff5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -38,6 +38,7 @@ #include <sys/zfs_context.h> #include <sys/dsl_deadlist.h> #include <sys/refcount.h> +#include <zfeature_common.h> #ifdef __cplusplus extern "C" { @@ -145,8 +146,6 @@ typedef struct dsl_dataset { /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ - boolean_t ds_large_blocks; - boolean_t ds_need_large_blocks; /* has internal locking: */ dsl_deadlist_t ds_deadlist; @@ -185,6 +184,18 @@ typedef struct dsl_dataset { kmutex_t ds_sendstream_lock; list_t ds_sendstreams; + /* + * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset + * uses this feature. + */ + uint8_t ds_feature_inuse[SPA_FEATURES]; + + /* + * Set if we need to activate the feature on this dataset this txg + * (used only in syncing context). + */ + uint8_t ds_feature_activation_needed[SPA_FEATURES]; + /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; @@ -264,8 +275,6 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds); -int dsl_dataset_activate_large_blocks(const char *dsname); -void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx); int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); @@ -305,6 +314,9 @@ void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); +void dsl_dataset_deactivate_feature(uint64_t dsobj, + spa_feature_t f, dmu_tx_t *tx); + #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h new file mode 100644 index 0000000..5ebb7fe --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_MULTILIST_H +#define _SYS_MULTILIST_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef list_node_t multilist_node_t; +typedef struct multilist multilist_t; +typedef struct multilist_sublist multilist_sublist_t; +typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); + +struct multilist_sublist { + /* + * The mutex used internally to implement thread safe insertions + * and removals to this individual sublist. It can also be locked + * by a consumer using multilist_sublist_{lock,unlock}, which is + * useful if a consumer needs to traverse the list in a thread + * safe manner. + */ + kmutex_t mls_lock; + /* + * The actual list object containing all objects in this sublist. + */ + list_t mls_list; + /* + * Pad to cache line (64 bytes), in an effort to try and prevent + * cache line contention. + */ + uint8_t mls_pad[24]; +}; + +struct multilist { + /* + * This is used to get to the multilist_node_t structure given + * the void *object contained on the list. + */ + size_t ml_offset; + /* + * The number of sublists used internally by this multilist. + */ + uint64_t ml_num_sublists; + /* + * The array of pointers to the actual sublists. + */ + multilist_sublist_t *ml_sublists; + /* + * Pointer to function which determines the sublist to use + * when inserting and removing objects from this multilist. + * Please see the comment above multilist_create for details. 
+ */ + multilist_sublist_index_func_t *ml_index_func; +}; + +void multilist_destroy(multilist_t *); +void multilist_create(multilist_t *, size_t, size_t, unsigned int, + multilist_sublist_index_func_t *); + +void multilist_insert(multilist_t *, void *); +void multilist_remove(multilist_t *, void *); +int multilist_is_empty(multilist_t *); + +unsigned int multilist_get_num_sublists(multilist_t *); +unsigned int multilist_get_random_index(multilist_t *); + +multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_unlock(multilist_sublist_t *); + +void multilist_sublist_insert_head(multilist_sublist_t *, void *); +void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); +void multilist_sublist_remove(multilist_sublist_t *, void *); + +void *multilist_sublist_head(multilist_sublist_t *); +void *multilist_sublist_tail(multilist_sublist_t *); +void *multilist_sublist_next(multilist_sublist_t *, void *); +void *multilist_sublist_prev(multilist_sublist_t *, void *); + +void multilist_link_init(multilist_node_t *); +int multilist_link_active(multilist_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTILIST_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 36739cd..342c9cd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -29,6 +29,7 @@ #ifndef _ZIO_H #define _ZIO_H +#include <sys/zio_priority.h> #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/txg.h> @@ -144,18 +145,6 @@ enum zio_compress { #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 -typedef enum zio_priority { - ZIO_PRIORITY_SYNC_READ, - ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ - ZIO_PRIORITY_ASYNC_READ, /* prefetch */ - ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ - ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ - ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ - ZIO_PRIORITY_NUM_QUEUEABLE, - - ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */ -} zio_priority_t; - enum zio_flag { /* * Flags inherited by gang, ddt, and vdev children, @@ -260,6 +249,7 @@ extern const char *zio_type_name[ZIO_TYPES]; * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. + * dnode visit bookmarks are <objset, object id of dnode, -3, 0>. * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. 
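
A minimal usage sketch for the multilist API introduced above in sys/multilist.h, assuming the multilist_create() parameters are (object size, node offset, sublist count, index function) as the prototype suggests; the type and function names below are illustrative, not from the patch. The index function must map a given object to the same sublist every time, since multilist_insert() and multilist_remove() rely on it to choose the sublist lock.

#include <sys/multilist.h>

typedef struct my_obj {
	uint64_t	mo_id;
	multilist_node_t mo_node;	/* linkage owned by the multilist */
} my_obj_t;

/* Deterministic: a given object always maps to the same sublist. */
static unsigned int
my_obj_index(multilist_t *ml, void *obj)
{

	return ((unsigned int)(((my_obj_t *)obj)->mo_id %
	    multilist_get_num_sublists(ml)));
}

static void
my_obj_walk(multilist_t *ml)
{
	multilist_sublist_t *mls;
	my_obj_t *o;
	unsigned int i;

	for (i = 0; i < multilist_get_num_sublists(ml); i++) {
		mls = multilist_sublist_lock(ml, i);
		for (o = multilist_sublist_head(mls); o != NULL;
		    o = multilist_sublist_next(mls, o)) {
			/* visit o while holding the per-sublist lock */
		}
		multilist_sublist_unlock(mls);
	}
}

Creation would then look like multilist_create(&ml, sizeof (my_obj_t), offsetof(my_obj_t, mo_node), 4, my_obj_index), after which multilist_insert(&ml, obj) spreads concurrent insertions across the four sublist locks instead of funneling them through one list mutex.
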
@@ -292,6 +282,9 @@ typedef struct zbookmark_phys { #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) +#define ZB_DNODE_LEVEL (-3LL) +#define ZB_DNODE_BLKID (0ULL) + #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) @@ -633,8 +626,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ -boolean_t zbookmark_is_before(const struct dnode_phys *dnp, - const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); +boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); +int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, + uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index a921a2f..0c293ab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -44,7 +44,7 @@ typedef struct zio_checksum_info { zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ int ci_correctable; /* number of correctable bits */ int ci_eck; /* uses zio embedded checksum? */ - int ci_dedup; /* strong enough for dedup? */ + boolean_t ci_dedup; /* strong enough for dedup? */ char *ci_name; /* descriptive name */ } zio_checksum_info_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h new file mode 100644 index 0000000..32e90e2 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ +#ifndef _ZIO_PRIORITY_H +#define _ZIO_PRIORITY_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum zio_priority { + ZIO_PRIORITY_SYNC_READ, + ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ + ZIO_PRIORITY_ASYNC_READ, /* prefetch */ + ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ + ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ + ZIO_PRIORITY_NUM_QUEUEABLE, + + ZIO_PRIORITY_NOW /* non-queued i/os (e.g. 
free) */ +} zio_priority_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_PRIORITY_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 36969e8..44919d2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -1310,9 +1312,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } else { int b; - dmu_prefetch(zap->zap_objset, zap->zap_object, + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, - zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c index 7540320..80a3f0b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -245,7 +245,7 @@ feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, { int err; uint64_t refcount; - uint64_t zapobj = feature->fi_can_readonly ? + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; /* @@ -296,7 +296,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, dmu_tx_t *tx) { ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); - uint64_t zapobj = feature->fi_can_readonly ? + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, @@ -322,7 +322,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, if (refcount == 0) spa_deactivate_mos_feature(spa, feature->fi_guid); - else if (feature->fi_mos) + else if (feature->fi_flags & ZFEATURE_FLAG_MOS) spa_activate_mos_feature(spa, feature->fi_guid, tx); } @@ -333,8 +333,9 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, void feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) { - uint64_t initial_refcount = feature->fi_activate_on_enable ? 1 : 0; - uint64_t zapobj = feature->fi_can_readonly ? + uint64_t initial_refcount = + (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; ASSERT(0 != zapobj); @@ -379,7 +380,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, { uint64_t refcount; zfeature_info_t *feature = &spa_feature_table[fid]; - uint64_t zapobj = feature->fi_can_readonly ? + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; ASSERT(VALID_FEATURE_FID(fid)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index c2dd020..693ba41 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. */ /* @@ -1149,10 +1150,11 @@ zfsctl_shares_lookup(ap) ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp); + VN_RELE(ZTOV(dzp)); + } - VN_RELE(ZTOV(dzp)); ZFS_EXIT(zfsvfs); return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 2a583d4..2e51916 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -5190,6 +5190,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 8a08c8d..ed56d17 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. * All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -950,7 +950,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error) - return (error); + goto out; } else { /* * Pre SA versions file systems should never touch diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 1038a87..45a2bd7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -2675,7 +2675,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0); + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); skip_entry: /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 48de571..867b798 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -94,6 +94,9 @@ extern vmem_t *zio_alloc_arena; #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +#define BP_SPANB(indblkshift, level) \ + (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) +#define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. @@ -3461,37 +3464,127 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_done }; -/* dnp is the dnode for zb1->zb_object */ -boolean_t -zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1, - const zbookmark_phys_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); +/* + * Compare two zbookmark_phys_t's to see which we would reach first in a + * pre-order traversal of the object tree. + * + * This is simple in every case aside from the meta-dnode object. For all other + * objects, we traverse them in order (object 1 before object 2, and so on). + * However, all of these objects are traversed while traversing object 0, since + * the data it points to is the list of objects. Thus, we need to convert to a + * canonical representation so we can compare meta-dnode bookmarks to + * non-meta-dnode bookmarks. + * + * We do this by calculating "equivalents" for each field of the zbookmark. + * zbookmarks outside of the meta-dnode use their own object and level, and + * calculate the level 0 equivalent (the first L0 blkid that is contained in the + * blocks this bookmark refers to) by multiplying their blkid by their span + * (the number of L0 blocks contained within one block at their level). + * zbookmarks inside the meta-dnode calculate their object equivalent + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use + * level + 1<<31 (any value larger than a level could ever be) for their level. + * This causes them to always compare before a bookmark in their object + * equivalent, compare appropriately to bookmarks in other objects, and to + * compare appropriately to other bookmarks in the meta-dnode. 
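+ *
+ * A worked example with illustrative numbers (not taken from the code):
+ * assume 512-byte dnodes and a 16K meta-dnode data block, so dbss == 32
+ * sectors and one meta-dnode L0 block holds 32 dnodes. The meta-dnode
+ * bookmark <objset, 0, level 0, blkid 2> canonicalizes to
+ * zb1obj = 2 * 32 = 64, zb1L0 = 0, and zb1level = 0 + (1 << 31); it
+ * therefore compares before <objset, 70, 0, 0> (64 < 70) and after
+ * <objset, 60, 0, 0> (64 > 60), matching a pre-order traversal that
+ * visits dnodes 64 through 95 while reading that meta-dnode block.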
+ */ +int +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) +{ + /* + * These variables represent the "equivalent" values for the zbookmark, + * after converting zbookmarks inside the meta dnode to their + * normal-object equivalents. + */ + uint64_t zb1obj, zb2obj; + uint64_t zb1L0, zb2L0; + uint64_t zb1level, zb2level; - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); + if (zb1->zb_object == zb2->zb_object && + zb1->zb_level == zb2->zb_level && + zb1->zb_blkid == zb2->zb_blkid) + return (0); + + /* + * BP_SPANB calculates the span in blocks. + */ + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb1L0 = 0; + zb1level = zb1->zb_level + COMPARE_META_LEVEL; + } else { + zb1obj = zb1->zb_object; + zb1level = zb1->zb_level; } - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb2L0 = 0; + zb2level = zb2->zb_level + COMPARE_META_LEVEL; + } else { + zb2obj = zb2->zb_object; + zb2level = zb2->zb_level; + } + + /* Now that we have a canonical representation, do the comparison. */ + if (zb1obj != zb2obj) + return (zb1obj < zb2obj ? -1 : 1); + else if (zb1L0 != zb2L0) + return (zb1L0 < zb2L0 ? -1 : 1); + else if (zb1level != zb2level) + return (zb1level > zb2level ? -1 : 1); + /* + * This can (theoretically) happen if the bookmarks have the same object + * and level, but different blkids, if the block sizes are not the same. + * There is presently no way to change the indirect block sizes + */ + return (0); +} + +/* + * This function checks the following: given that last_block is the place that + * our traversal stopped last time, does that guarantee that we've visited + * every node under subtree_root? Therefore, we can't just use the raw output + * of zbookmark_compare. We have to pass in a modified version of + * subtree_root; by incrementing the block id, and then checking whether + * last_block is before or equal to that, we can tell whether or not having + * visited last_block implies that all of subtree_root's children have been + * visited. + */ +boolean_t +zbookmark_subtree_completed(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + zbookmark_phys_t mod_zb = *subtree_root; + mod_zb.zb_blkid++; + ASSERT(last_block->zb_level == 0); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); + + /* + * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the + * data block size in sectors, because that variable is only used if + * the bookmark refers to a block in the meta-dnode. Since we don't + * know without examining it what object it refers to, and there's no + * harm in passing in this value in other cases, we always pass it in. + * + * We pass in 0 for the indirect block size shift because zb2 must be + * level 0. 
The indirect block size is only used to calculate the span + * of the bookmark, but since the bookmark must be level 0, the span is + * always 1, so the math works out. + * + * If you make changes to how the zbookmark_compare code works, be sure + * to make sure that this code still works afterwards. + */ + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, + last_block) <= 0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c index 991a0a3..0a7f4e4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c @@ -438,7 +438,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) - arc_flush(NULL); + /* + * We must use FALSE to ensure arc_flush returns, since + * we're not preventing concurrent ARC insertions. + */ + arc_flush(NULL, FALSE); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 55de1b4..2c90810 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -358,7 +358,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zvol_extent_t *ze; int bs = ma->ma_zv->zv_volblocksize; - if (BP_IS_HOLE(bp) || + if (bp == NULL || BP_IS_HOLE(bp) || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0) return (0); diff --git a/sys/compat/cloudabi/cloudabi_proc.c b/sys/compat/cloudabi/cloudabi_proc.c index 9c735fa..d917337 100644 --- a/sys/compat/cloudabi/cloudabi_proc.c +++ b/sys/compat/cloudabi/cloudabi_proc.c @@ -46,14 +46,19 @@ cloudabi_sys_proc_exec(struct thread *td, struct cloudabi_sys_proc_exec_args *uap) { struct image_args args; + struct vmspace *oldvmspace; int error; + error = pre_execve(td, &oldvmspace); + if (error != 0) + return (error); error = exec_copyin_data_fds(td, &args, uap->data, uap->datalen, uap->fds, uap->fdslen); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } + post_execve(td, error, oldvmspace); return (error); } diff --git a/sys/conf/Makefile.arm b/sys/conf/Makefile.arm index 86b11c6..af5f7da 100644 --- a/sys/conf/Makefile.arm +++ b/sys/conf/Makefile.arm @@ -66,10 +66,6 @@ SYSTEM_LD_TAIL +=;sed s/" + SIZEOF_HEADERS"// ldscript.$M\ ${KERNEL_KO}.bin; \ rm ${FULLKERNEL}.noheader -.if defined(MFS_IMAGE) -SYSTEM_LD_TAIL += ;sh ${S}/tools/embed_mfs.sh ${KERNEL_KO}.bin ${MFS_IMAGE}; -.endif - FILES_CPU_FUNC = \ $S/$M/$M/cpufunc_asm_arm9.S \ $S/$M/$M/cpufunc_asm_arm10.S \ diff --git a/sys/conf/NOTES b/sys/conf/NOTES index b0619cb..7bc2048 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -2981,9 +2981,10 @@ options MAXFILES=999 # Random number generator # Only ONE of the below two may be used; they are mutually exclusive. -# If neither is present, then the Fortuna algorithm is used. -options RANDOM_YARROW # Yarrow CSPRNG (old default) -#options RANDOM_DUMMY # Dummy CSPRNG that always blocks +# If neither is present, then the Fortuna algorithm is selected. +#options RANDOM_YARROW # Yarrow CSPRNG (old default) +#options RANDOM_LOADABLE # Allow the algorithm to be loaded as + # a module. # For developers. 
options RANDOM_DEBUG # Extra debugging messages diff --git a/sys/conf/files b/sys/conf/files index 531647f..dfe9763 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -83,7 +83,7 @@ cam/ctl/ctl_backend_ramdisk.c optional ctl cam/ctl/ctl_cmd_table.c optional ctl cam/ctl/ctl_frontend.c optional ctl cam/ctl/ctl_frontend_cam_sim.c optional ctl -cam/ctl/ctl_frontend_internal.c optional ctl +cam/ctl/ctl_frontend_ioctl.c optional ctl cam/ctl/ctl_frontend_iscsi.c optional ctl cam/ctl/ctl_scsi_all.c optional ctl cam/ctl/ctl_tpc.c optional ctl @@ -145,6 +145,7 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}" @@ -174,6 +175,7 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "$ cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}" +cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}" cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}" @@ -548,14 +550,14 @@ crypto/des/des_ecb.c optional crypto | ipsec | netsmb crypto/des/des_setkey.c optional crypto | ipsec | netsmb crypto/rc4/rc4.c optional netgraph_mppc_encryption | kgssapi crypto/rijndael/rijndael-alg-fst.c optional crypto | geom_bde | \ - ipsec | random random_yarrow | random !random_yarrow !random_dummy | wlan_ccmp -crypto/rijndael/rijndael-api-fst.c optional geom_bde | random random_yarrow | random !random_yarrow !random_dummy + ipsec | random !random_loadable | wlan_ccmp +crypto/rijndael/rijndael-api-fst.c optional geom_bde | random !random_loadable crypto/rijndael/rijndael-api.c optional crypto | ipsec | wlan_ccmp crypto/sha1.c optional carp | crypto | ipsec | \ netgraph_mppc_encryption | sctp -crypto/sha2/sha2.c optional crypto | geom_bde | ipsec | random random_yarrow | random !random_yarrow !random_dummy | \ +crypto/sha2/sha2.c optional crypto | geom_bde | ipsec | random !random_loadable | \ sctp | zfs -crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | random random_yarrow | random !random_yarrow !random_dummy | \ +crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | random !random_loadable | \ sctp | zfs crypto/siphash/siphash.c optional inet | inet6 crypto/siphash/siphash_test.c optional inet | inet6 @@ -2312,12 +2314,14 @@ rt2860.fw optional rt2860fw | ralfw \ compile-with "${NORMAL_FW}" \ no-obj no-implicit-rule \ clean "rt2860.fw" -dev/random/randomdev_none.c optional !random -dev/random/randomdev.c optional random -dev/random/random_harvestq.c optional random random_yarrow | random 
!random_dummy +dev/random/random_infra.c optional random +dev/random/random_harvestq.c optional random +dev/random/randomdev.c optional random random_yarrow | \ + random !random_yarrow !random_loadable dev/random/yarrow.c optional random random_yarrow -dev/random/fortuna.c optional random !random_yarrow !random_dummy -dev/random/hash.c optional random random_yarrow | random !random_dummy +dev/random/fortuna.c optional random !random_yarrow !random_loadable +dev/random/hash.c optional random random_yarrow | \ + random !random_yarrow !random_loadable dev/rc/rc.c optional rc dev/re/if_re.c optional re dev/rl/if_rl.c optional rl pci diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 2ffe102..8451e00 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -40,7 +40,7 @@ ia32_genassym.o standard \ # ia32_assym.h standard \ dependency "$S/kern/genassym.sh ia32_genassym.o" \ - compile-with "env NM='${NM}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \ + compile-with "env NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "ia32_assym.h" # diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk index 28ea453..137e72c 100644 --- a/sys/conf/kern.post.mk +++ b/sys/conf/kern.post.mk @@ -121,7 +121,7 @@ gdbinit: .endif .endif -${FULLKERNEL}: ${SYSTEM_DEP} vers.o ${MFS_IMAGE} +${FULLKERNEL}: ${SYSTEM_DEP} vers.o @rm -f ${.TARGET} @echo linking ${.TARGET} ${SYSTEM_LD} @@ -133,9 +133,6 @@ ${FULLKERNEL}: ${SYSTEM_DEP} vers.o ${MFS_IMAGE} ${OBJCOPY} --strip-debug ${.TARGET} .endif ${SYSTEM_LD_TAIL} -.if defined(MFS_IMAGE) - sh ${S}/tools/embed_mfs.sh ${FULLKERNEL} ${MFS_IMAGE} -.endif .if !exists(${.OBJDIR}/.depend) ${SYSTEM_OBJS}: assym.s vnode_if.h ${BEFORE_DEPEND:M*.h} ${MFILES:T:S/.m$/.h/} @@ -177,7 +174,7 @@ hack.So: Makefile ./assym.s: assym.s assym.s: $S/kern/genassym.sh genassym.o - NM='${NM}' sh $S/kern/genassym.sh genassym.o > ${.TARGET} + NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh genassym.o > ${.TARGET} genassym.o: $S/$M/$M/genassym.c ${CC} -c ${CFLAGS:N-fno-common} $S/$M/$M/genassym.c @@ -301,6 +298,27 @@ vnode_if_newproto.h: vnode_if_typedef.h: ${AWK} -f $S/tools/vnode_if.awk $S/kern/vnode_if.src -q +.if ${MFS_IMAGE:Uno} != "no" +# Generate an object file from the file system image to embed in the kernel +# via linking. Make sure the contents are in the mfs section and rename the +# start/end/size variables to __start_mfs, __stop_mfs, and mfs_size, +# respectively. +embedfs_${MFS_IMAGE:T:R}.o: ${MFS_IMAGE} + ${OBJCOPY} --input-target binary \ + --output-target ${EMBEDFS_FORMAT.${MACHINE_ARCH}} \ + --binary-architecture ${EMBEDFS_ARCH.${MACHINE_ARCH}} \ + ${MFS_IMAGE} ${.TARGET} + ${OBJCOPY} \ + --rename-section .data=mfs,contents,alloc,load,readonly,data \ + --redefine-sym \ + _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_size=__mfs_root_size \ + --redefine-sym \ + _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_start=mfs_root \ + --redefine-sym \ + _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_end=mfs_root_end \ + ${.TARGET} +.endif + # XXX strictly, everything depends on Makefile because changes to ${PROF} # only appear there, but we don't handle that. 
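
The objcopy step above leaves the kernel with a conventional start/end symbol pair for the embedded image. A sketch of the consuming side, assuming only the symbol names visible in the --redefine-sym arguments; the helper name here is hypothetical:

#include <sys/types.h>

extern u_char mfs_root[];	/* first byte of the embedded image */
extern u_char mfs_root_end[];	/* one past the last byte */

static size_t
mfs_image_size(void)
{

	/*
	 * The size falls out of the start/end pair; objcopy also emits
	 * a renamed size symbol (__mfs_root_size) per the rules above.
	 */
	return ((size_t)(mfs_root_end - mfs_root));
}
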
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk index cf1b127..3783881 100644 --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -191,6 +191,9 @@ SYSTEM_DEP= Makefile ${SYSTEM_OBJS} SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS} SYSTEM_OBJS+= ${SYSTEM_CFILES:.c=.o} SYSTEM_OBJS+= hack.So +.if ${MFS_IMAGE:Uno} != "no" +SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o +.endif SYSTEM_LD= @${LD} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} --no-warn-mismatch \ --warn-common --export-dynamic --dynamic-linker /red/herring \ -o ${.TARGET} -X ${SYSTEM_OBJS} vers.o @@ -222,6 +225,32 @@ MKMODULESENV+= DEBUG_FLAGS="${DEBUG}" .endif MKMODULESENV+= _MPATH="${_MPATH}" +# Architecture and output format arguments for objdump to convert image to +# object file +.if ${MFS_IMAGE:Uno} != "no" + +.if !defined(EMBEDFS_FORMAT.${MACHINE_ARCH}) +EMBEDFS_FORMAT.${MACHINE_ARCH}!= awk -F'"' '/OUTPUT_FORMAT/ {print $$2}' ${LDSCRIPT} +.if empty(EMBEDFS_FORMAT.${MACHINE_ARCH}) +.undef EMBEDFS_FORMAT.${MACHINE_ARCH} +.endif +.endif + +.if !defined(EMBEDFS_ARCH.${MACHINE_ARCH}) +EMBEDFS_ARCH.${MACHINE_ARCH}!= sed -n '/OUTPUT_ARCH/s/.*(\(.*\)).*/\1/p' ${LDSCRIPT} +.if empty(EMBEDFS_ARCH.${MACHINE_ARCH}) +.undef EMBEDFS_ARCH.${MACHINE_ARCH} +.endif +.endif + +EMBEDFS_FORMAT.arm?= elf32-littlearm +EMBEDFS_FORMAT.armv6?= elf32-littlearm +EMBEDFS_FORMAT.mips?= elf32-tradbigmips +EMBEDFS_FORMAT.mipsel?= elf32-tradlittlemips +EMBEDFS_FORMAT.mips64?= elf64-tradbigmips +EMBEDFS_FORMAT.mips64el?= elf64-tradlittlemips +.endif + # Detect kernel config options that force stack frames to be turned on. DDB_ENABLED!= grep DDB opt_ddb.h || true ; echo DTR_ENABLED!= grep KDTRACE_FRAME opt_kdtrace.h || true ; echo diff --git a/sys/conf/options b/sys/conf/options index bf6c4a6..30bbc53 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -711,6 +711,7 @@ DEV_PCI opt_pci.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h +DEV_RANDOM opt_global.h DEV_SPLASH opt_splash.h DEV_VLAN opt_vlan.h @@ -946,13 +947,14 @@ RCTL opt_global.h # The DEBUG option is in global.h as the random harvesting # puts probes all over the place, and it makes little sense # to pollute these headers with an extra include. -# the DUMMY option is in global.h because it is used to -# turn off harvesting all over the kernel. -RANDOM_DEBUG opt_global.h +RANDOM_DEBUG opt_random.h # Which CSPRNG hashes we get. -# These are mutually exclusive. With neither, Fortuna is selected. -RANDOM_DUMMY opt_global.h +# If Yarrow is not chosen, Fortuna is selected. RANDOM_YARROW opt_random.h +# With this, no entropy processor is loaded, but the entropy +# harvesting infrastructure is present. This means an entropy +# processor may be loaded as a module. +RANDOM_LOADABLE opt_random.h # Intel em(4) driver EM_MULTIQUEUE opt_em.h diff --git a/sys/contrib/libnv/nv_impl.h b/sys/contrib/libnv/nv_impl.h index 7928431..b50bdf7 100644 --- a/sys/contrib/libnv/nv_impl.h +++ b/sys/contrib/libnv/nv_impl.h @@ -1,5 +1,6 @@ /*- * Copyright (c) 2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org> * All rights reserved. 
* * This software was developed by Pawel Jakub Dawidek under sponsorship from @@ -39,12 +40,14 @@ struct nvpair; typedef struct nvpair nvpair_t; #endif +#define NV_TYPE_NVLIST_ARRAY_NEXT 254 #define NV_TYPE_NVLIST_UP 255 #define NV_TYPE_FIRST NV_TYPE_NULL -#define NV_TYPE_LAST NV_TYPE_BINARY +#define NV_TYPE_LAST NV_TYPE_DESCRIPTOR_ARRAY -#define NV_FLAG_BIG_ENDIAN 0x80 +#define NV_FLAG_BIG_ENDIAN 0x080 +#define NV_FLAG_IN_ARRAY 0x100 #ifdef _KERNEL #define nv_malloc(size) malloc((size), M_NVLIST, M_WAITOK) @@ -86,6 +89,7 @@ typedef struct nvpair nvpair_t; int *nvlist_descriptors(const nvlist_t *nvl, size_t *nitemsp); size_t nvlist_ndescriptors(const nvlist_t *nvl); +void nvlist_set_flags(nvlist_t *nvl, int flags); nvpair_t *nvlist_first_nvpair(const nvlist_t *nvl); nvpair_t *nvlist_next_nvpair(const nvlist_t *nvl, const nvpair_t *nvp); @@ -96,6 +100,7 @@ void nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp); bool nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp); void nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent); +void nvlist_set_array_next(nvlist_t *nvl, nvpair_t *ele); const nvpair_t *nvlist_get_nvpair(const nvlist_t *nvl, const char *name); @@ -120,18 +125,33 @@ nvpair_t *nvpair_create_stringv(const char *name, const char *valuefmt, va_list nvpair_t *nvpair_create_nvlist(const char *name, const nvlist_t *value); nvpair_t *nvpair_create_descriptor(const char *name, int value); nvpair_t *nvpair_create_binary(const char *name, const void *value, size_t size); +nvpair_t *nvpair_create_bool_array(const char *name, const bool *value, size_t nitems); +nvpair_t *nvpair_create_number_array(const char *name, const uint64_t *value, size_t nitems); +nvpair_t *nvpair_create_string_array(const char *name, const char * const *value, size_t nitems); +nvpair_t *nvpair_create_nvlist_array(const char *name, const nvlist_t * const *value, size_t nitems); +nvpair_t *nvpair_create_descriptor_array(const char *name, const int *value, size_t nitems); nvpair_t *nvpair_move_string(const char *name, char *value); nvpair_t *nvpair_move_nvlist(const char *name, nvlist_t *value); nvpair_t *nvpair_move_descriptor(const char *name, int value); nvpair_t *nvpair_move_binary(const char *name, void *value, size_t size); - -bool nvpair_get_bool(const nvpair_t *nvp); -uint64_t nvpair_get_number(const nvpair_t *nvp); -const char *nvpair_get_string(const nvpair_t *nvp); -const nvlist_t *nvpair_get_nvlist(const nvpair_t *nvp); -int nvpair_get_descriptor(const nvpair_t *nvp); -const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep); +nvpair_t *nvpair_move_bool_array(const char *name, bool *value, size_t nitems); +nvpair_t *nvpair_move_nvlist_array(const char *name, nvlist_t **value, size_t nitems); +nvpair_t *nvpair_move_descriptor_array(const char *name, int *value, size_t nitems); +nvpair_t *nvpair_move_number_array(const char *name, uint64_t *value, size_t nitems); +nvpair_t *nvpair_move_string_array(const char *name, char **value, size_t nitems); + +bool nvpair_get_bool(const nvpair_t *nvp); +uint64_t nvpair_get_number(const nvpair_t *nvp); +const char *nvpair_get_string(const nvpair_t *nvp); +const nvlist_t *nvpair_get_nvlist(const nvpair_t *nvp); +int nvpair_get_descriptor(const nvpair_t *nvp); +const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep); +const bool *nvpair_get_bool_array(const nvpair_t *nvp, size_t *nitemsp); +const uint64_t *nvpair_get_number_array(const nvpair_t *nvp, size_t *nitemsp); +const char * const *nvpair_get_string_array(const nvpair_t *nvp, size_t *nitemsp); +const 
nvlist_t * const *nvpair_get_nvlist_array(const nvpair_t *nvp, size_t *nitemsp); +const int *nvpair_get_descriptor_array(const nvpair_t *nvp, size_t *nitemsp); void nvpair_free(nvpair_t *nvp); diff --git a/sys/contrib/libnv/nvlist.c b/sys/contrib/libnv/nvlist.c index edcd074..cf8281e 100644 --- a/sys/contrib/libnv/nvlist.c +++ b/sys/contrib/libnv/nvlist.c @@ -1,5 +1,6 @@ /*- * Copyright (c) 2009-2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org> * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from @@ -88,7 +89,7 @@ __FBSDID("$FreeBSD$"); #endif #endif -#define NV_FLAG_PRIVATE_MASK (NV_FLAG_BIG_ENDIAN) +#define NV_FLAG_PRIVATE_MASK (NV_FLAG_BIG_ENDIAN | NV_FLAG_IN_ARRAY) #define NV_FLAG_PUBLIC_MASK (NV_FLAG_IGNORE_CASE | NV_FLAG_NO_UNIQUE) #define NV_FLAG_ALL_MASK (NV_FLAG_PRIVATE_MASK | NV_FLAG_PUBLIC_MASK) @@ -98,6 +99,7 @@ struct nvlist { int nvl_error; int nvl_flags; nvpair_t *nvl_parent; + nvpair_t *nvl_array_next; struct nvl_head nvl_head; }; @@ -135,6 +137,7 @@ nvlist_create(int flags) nvl->nvl_error = 0; nvl->nvl_flags = flags; nvl->nvl_parent = NULL; + nvl->nvl_array_next = NULL; TAILQ_INIT(&nvl->nvl_head); nvl->nvl_magic = NVLIST_MAGIC; @@ -157,6 +160,10 @@ nvlist_destroy(nvlist_t *nvl) nvlist_remove_nvpair(nvl, nvp); nvpair_free(nvp); } + if (nvl->nvl_array_next != NULL) + nvpair_free_structure(nvl->nvl_array_next); + nvl->nvl_array_next = NULL; + nvl->nvl_parent = NULL; nvl->nvl_magic = 0; nv_free(nvl); @@ -223,6 +230,59 @@ nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent) nvl->nvl_parent = parent; } +void +nvlist_set_array_next(nvlist_t *nvl, nvpair_t *ele) +{ + + NVLIST_ASSERT(nvl); + + if (ele != NULL) + nvl->nvl_flags |= NV_FLAG_IN_ARRAY; + else + nvl->nvl_flags &= ~NV_FLAG_IN_ARRAY; + + nvl->nvl_array_next = ele; +} + +bool +nvlist_in_array(const nvlist_t *nvl) +{ + + NVLIST_ASSERT(nvl); + + return ((nvl->nvl_flags & NV_FLAG_IN_ARRAY) != 0); +} + +const nvlist_t * +nvlist_get_array_next(const nvlist_t *nvl) +{ + nvpair_t *nvp; + + NVLIST_ASSERT(nvl); + + nvp = nvl->nvl_array_next; + if (nvp == NULL) + return (NULL); + + return (nvpair_get_nvlist(nvp)); +} + +const nvlist_t * +nvlist_get_pararr(const nvlist_t *nvl, void **cookiep) +{ + const nvlist_t *ret; + + ret = nvlist_get_array_next(nvl); + if (ret != NULL) { + if (cookiep != NULL) + *cookiep = NULL; + return (ret); + } + + ret = nvlist_get_parent(nvl, cookiep); + return (ret); +} + bool nvlist_empty(const nvlist_t *nvl) { @@ -239,9 +299,18 @@ nvlist_flags(const nvlist_t *nvl) NVLIST_ASSERT(nvl); PJDLOG_ASSERT(nvl->nvl_error == 0); - PJDLOG_ASSERT((nvl->nvl_flags & ~(NV_FLAG_PUBLIC_MASK)) == 0); - return (nvl->nvl_flags); + return (nvl->nvl_flags & NV_FLAG_PUBLIC_MASK); +} + +void +nvlist_set_flags(nvlist_t *nvl, int flags) +{ + + NVLIST_ASSERT(nvl); + PJDLOG_ASSERT(nvl->nvl_error == 0); + + nvl->nvl_flags = flags; } static void @@ -418,17 +487,129 @@ nvlist_dump(const nvlist_t *nvl, int fd) dprintf(fd, "\n"); break; } + case NV_TYPE_BOOL_ARRAY: + { + const bool *value; + unsigned int ii; + size_t nitems; + + value = nvpair_get_bool_array(nvp, &nitems); + dprintf(fd, " [ "); + for (ii = 0; ii < nitems; ii++) { + dprintf(fd, "%s", value[ii] ? 
"TRUE" : "FALSE"); + if (ii != nitems - 1) + dprintf(fd, ", "); + } + dprintf(fd, " ]\n"); + break; + } + case NV_TYPE_STRING_ARRAY: + { + const char * const *value; + unsigned int ii; + size_t nitems; + + value = nvpair_get_string_array(nvp, &nitems); + dprintf(fd, " [ "); + for (ii = 0; ii < nitems; ii++) { + if (value[ii] == NULL) + dprintf(fd, "NULL"); + else + dprintf(fd, "\"%s\"", value[ii]); + if (ii != nitems - 1) + dprintf(fd, ", "); + } + dprintf(fd, " ]\n"); + break; + } + case NV_TYPE_NUMBER_ARRAY: + { + const uint64_t *value; + unsigned int ii; + size_t nitems; + + value = nvpair_get_number_array(nvp, &nitems); + dprintf(fd, " [ "); + for (ii = 0; ii < nitems; ii++) { + dprintf(fd, "%ju (%jd) (0x%jx)", + value[ii], value[ii], value[ii]); + if (ii != nitems - 1) + dprintf(fd, ", "); + } + dprintf(fd, " ]\n"); + break; + } + case NV_TYPE_DESCRIPTOR_ARRAY: + { + const int *value; + unsigned int ii; + size_t nitems; + + value = nvpair_get_descriptor_array(nvp, &nitems); + dprintf(fd, " [ "); + for (ii = 0; ii < nitems; ii++) { + dprintf(fd, "%d", value[ii]); + if (ii != nitems - 1) + dprintf(fd, ", "); + } + dprintf(fd, " ]\n"); + break; + } + case NV_TYPE_NVLIST_ARRAY: + { + const nvlist_t * const *value; + unsigned int ii; + size_t nitems; + + value = nvpair_get_nvlist_array(nvp, &nitems); + dprintf(fd, " %zu\n", nitems); + tmpnvl = NULL; + tmpnvp = NULL; + for (ii = 0; ii < nitems; ii++) { + if (nvlist_dump_error_check(value[ii], fd, + level + 1)) { + break; + } + + if (tmpnvl == NULL) { + tmpnvp = nvlist_first_nvpair(value[ii]); + if (tmpnvp != NULL) { + tmpnvl = value[ii]; + } else { + dprintf(fd, "%*s,\n", + (level + 1) * 4, ""); + } + } + } + if (tmpnvp != NULL) { + nvl = tmpnvl; + nvp = tmpnvp; + level++; + continue; + } + break; + } default: PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp)); } while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) { - cookie = NULL; - nvl = nvlist_get_parent(nvl, &cookie); - if (nvl == NULL) - return; - nvp = cookie; - level--; + do { + cookie = NULL; + if (nvlist_in_array(nvl)) + dprintf(fd, "%*s,\n", level * 4, ""); + nvl = nvlist_get_pararr(nvl, &cookie); + if (nvl == NULL) + return; + if (nvlist_in_array(nvl) && cookie == NULL) { + nvp = nvlist_first_nvpair(nvl); + } else { + nvp = cookie; + level--; + } + } while (nvp == NULL); + if (nvlist_in_array(nvl) && cookie == NULL) + break; } } } @@ -449,9 +630,11 @@ size_t nvlist_size(const nvlist_t *nvl) { const nvlist_t *tmpnvl; + const nvlist_t * const *nvlarray; const nvpair_t *nvp, *tmpnvp; void *cookie; - size_t size; + size_t size, nitems; + unsigned int ii; NVLIST_ASSERT(nvl); PJDLOG_ASSERT(nvl->nvl_error == 0); @@ -472,16 +655,47 @@ nvlist_size(const nvlist_t *nvl) nvp = tmpnvp; continue; } + } else if (nvpair_type(nvp) == NV_TYPE_NVLIST_ARRAY) { + nvlarray = nvpair_get_nvlist_array(nvp, &nitems); + PJDLOG_ASSERT(nitems > 0); + + size += (nvpair_header_size() + 1) * nitems; + size += sizeof(struct nvlist_header) * nitems; + + tmpnvl = NULL; + tmpnvp = NULL; + for (ii = 0; ii < nitems; ii++) { + PJDLOG_ASSERT(nvlarray[ii]->nvl_error == 0); + tmpnvp = nvlist_first_nvpair(nvlarray[ii]); + if (tmpnvp != NULL) { + tmpnvl = nvlarray[ii]; + break; + } + } + if (tmpnvp != NULL) { + nvp = tmpnvp; + nvl = tmpnvl; + continue; + } + } else { size += nvpair_size(nvp); } while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) { - cookie = NULL; - nvl = nvlist_get_parent(nvl, &cookie); - if (nvl == NULL) - goto out; - nvp = cookie; + do { + cookie = NULL; + nvl = nvlist_get_pararr(nvl, &cookie); + if 
(nvl == NULL) + goto out; + if (nvlist_in_array(nvl) && cookie == NULL) { + nvp = nvlist_first_nvpair(nvl); + } else { + nvp = cookie; + } + } while (nvp == NULL); + if (nvlist_in_array(nvl) && cookie == NULL) + break; } } @@ -508,13 +722,40 @@ nvlist_xdescriptors(const nvlist_t *nvl, int *descs) *descs = nvpair_get_descriptor(nvp); descs++; break; + case NV_TYPE_DESCRIPTOR_ARRAY: + { + const int *value; + size_t nitems; + unsigned int ii; + + value = nvpair_get_descriptor_array(nvp, + &nitems); + for (ii = 0; ii < nitems; ii++) { + *descs = value[ii]; + descs++; + } + break; + } case NV_TYPE_NVLIST: nvl = nvpair_get_nvlist(nvp); nvp = NULL; break; + case NV_TYPE_NVLIST_ARRAY: + { + const nvlist_t * const *value; + size_t nitems; + + value = nvpair_get_nvlist_array(nvp, &nitems); + PJDLOG_ASSERT(value != NULL); + PJDLOG_ASSERT(nitems > 0); + + nvl = value[0]; + nvp = NULL; + break; + } } } - } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL); + } while ((nvl = nvlist_get_pararr(nvl, (void**)&nvp)) != NULL); return (descs); } @@ -564,9 +805,31 @@ nvlist_ndescriptors(const nvlist_t *nvl) nvl = nvpair_get_nvlist(nvp); nvp = NULL; break; + case NV_TYPE_NVLIST_ARRAY: + { + const nvlist_t * const *value; + size_t nitems; + + value = nvpair_get_nvlist_array(nvp, &nitems); + PJDLOG_ASSERT(value != NULL); + PJDLOG_ASSERT(nitems > 0); + + nvl = value[0]; + nvp = NULL; + break; + } + case NV_TYPE_DESCRIPTOR_ARRAY: + { + size_t nitems; + + (void)nvpair_get_descriptor_array(nvp, + &nitems); + ndescs += nitems; + break; + } } } - } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL); + } while ((nvl = nvlist_get_pararr(nvl, (void**)&nvp)) != NULL); return (ndescs); #else @@ -661,24 +924,86 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep) case NV_TYPE_DESCRIPTOR: ptr = nvpair_pack_descriptor(nvp, ptr, fdidxp, &left); break; + case NV_TYPE_DESCRIPTOR_ARRAY: + ptr = nvpair_pack_descriptor_array(nvp, ptr, fdidxp, + &left); + break; #endif case NV_TYPE_BINARY: ptr = nvpair_pack_binary(nvp, ptr, &left); break; + case NV_TYPE_BOOL_ARRAY: + ptr = nvpair_pack_bool_array(nvp, ptr, &left); + break; + case NV_TYPE_NUMBER_ARRAY: + ptr = nvpair_pack_number_array(nvp, ptr, &left); + break; + case NV_TYPE_STRING_ARRAY: + ptr = nvpair_pack_string_array(nvp, ptr, &left); + break; + case NV_TYPE_NVLIST_ARRAY: + { + const nvlist_t * const * value; + size_t nitems; + unsigned int ii; + + tmpnvl = NULL; + value = nvpair_get_nvlist_array(nvp, &nitems); + for (ii = 0; ii < nitems; ii++) { + ptr = nvlist_pack_header(value[ii], ptr, &left); + if (ptr == NULL) + goto out; + tmpnvp = nvlist_first_nvpair(value[ii]); + if (tmpnvp != NULL) { + tmpnvl = value[ii]; + break; + } + ptr = nvpair_pack_nvlist_array_next(ptr, &left); + if (ptr == NULL) + goto out; + } + if (tmpnvl != NULL) { + nvl = tmpnvl; + nvp = tmpnvp; + continue; + } + break; + } default: PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp)); } if (ptr == NULL) goto fail; while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) { - cookie = NULL; - nvl = nvlist_get_parent(nvl, &cookie); - if (nvl == NULL) - goto out; - nvp = cookie; - ptr = nvpair_pack_nvlist_up(ptr, &left); - if (ptr == NULL) - goto fail; + do { + cookie = NULL; + if (nvlist_in_array(nvl)) { + ptr = nvpair_pack_nvlist_array_next(ptr, + &left); + if (ptr == NULL) + goto fail; + } + nvl = nvlist_get_pararr(nvl, &cookie); + if (nvl == NULL) + goto out; + if (nvlist_in_array(nvl) && cookie == NULL) { + nvp = nvlist_first_nvpair(nvl); + ptr = nvlist_pack_header(nvl, ptr, 
+ &left); + if (ptr == NULL) + goto fail; + } else if (nvpair_type((nvpair_t *)cookie) != + NV_TYPE_NVLIST_ARRAY) { + ptr = nvpair_pack_nvlist_up(ptr, &left); + if (ptr == NULL) + goto fail; + nvp = cookie; + } else { + nvp = cookie; + } + } while (nvp == NULL); + if (nvlist_in_array(nvl) && cookie == NULL) + break; } } @@ -741,6 +1066,7 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds, bool *isbep, size_t *leftp) { struct nvlist_header nvlhdr; + int inarrayf; if (*leftp < sizeof(nvlhdr)) goto failed; @@ -762,7 +1088,8 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds, if ((nvlhdr.nvlh_flags & ~NV_FLAG_ALL_MASK) != 0) goto failed; - nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK); + inarrayf = (nvl->nvl_flags & NV_FLAG_IN_ARRAY); + nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK) | inarrayf; ptr += sizeof(nvlhdr); if (isbep != NULL) @@ -780,7 +1107,7 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds, int flags) { const unsigned char *ptr; - nvlist_t *nvl, *retnvl, *tmpnvl; + nvlist_t *nvl, *retnvl, *tmpnvl, *array; nvpair_t *nvp; size_t left; bool isbe; @@ -790,7 +1117,7 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds, left = size; ptr = buf; - tmpnvl = NULL; + tmpnvl = array = NULL; nvl = retnvl = nvlist_create(0); if (nvl == NULL) goto failed; @@ -832,6 +1159,10 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds, ptr = nvpair_unpack_descriptor(isbe, nvp, ptr, &left, fds, nfds); break; + case NV_TYPE_DESCRIPTOR_ARRAY: + ptr = nvpair_unpack_descriptor_array(isbe, nvp, ptr, + &left, fds, nfds); + break; #endif case NV_TYPE_BINARY: ptr = nvpair_unpack_binary(isbe, nvp, ptr, &left); @@ -842,6 +1173,44 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds, nvl = nvpair_nvlist(nvl->nvl_parent); nvpair_free_structure(nvp); continue; + case NV_TYPE_NVLIST_ARRAY_NEXT: + if (nvl->nvl_array_next == NULL) { + if (nvl->nvl_parent == NULL) + goto failed; + nvl = nvpair_nvlist(nvl->nvl_parent); + } else { + nvl = __DECONST(nvlist_t *, + nvlist_get_array_next(nvl)); + ptr = nvlist_unpack_header(nvl, ptr, nfds, + &isbe, &left); + if (ptr == NULL) + goto failed; + } + nvpair_free_structure(nvp); + continue; + case NV_TYPE_BOOL_ARRAY: + ptr = nvpair_unpack_bool_array(isbe, nvp, ptr, &left); + break; + case NV_TYPE_NUMBER_ARRAY: + ptr = nvpair_unpack_number_array(isbe, nvp, ptr, &left); + break; + case NV_TYPE_STRING_ARRAY: + ptr = nvpair_unpack_string_array(isbe, nvp, ptr, &left); + break; + case NV_TYPE_NVLIST_ARRAY: + ptr = nvpair_unpack_nvlist_array(isbe, nvp, ptr, &left, + &array); + if (ptr == NULL) + goto failed; + tmpnvl = array; + while (array != NULL) { + nvlist_set_parent(array, nvp); + array = __DECONST(nvlist_t *, + nvlist_get_array_next(array)); + } + ptr = nvlist_unpack_header(tmpnvl, ptr, nfds, &isbe, + &left); + break; default: PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp)); } @@ -1062,10 +1431,15 @@ NVLIST_EXISTS(bool, BOOL) NVLIST_EXISTS(number, NUMBER) NVLIST_EXISTS(string, STRING) NVLIST_EXISTS(nvlist, NVLIST) +NVLIST_EXISTS(binary, BINARY) +NVLIST_EXISTS(bool_array, BOOL_ARRAY) +NVLIST_EXISTS(number_array, NUMBER_ARRAY) +NVLIST_EXISTS(string_array, STRING_ARRAY) +NVLIST_EXISTS(nvlist_array, NVLIST_ARRAY) #ifndef _KERNEL NVLIST_EXISTS(descriptor, DESCRIPTOR) +NVLIST_EXISTS(descriptor_array, DESCRIPTOR_ARRAY) #endif -NVLIST_EXISTS(binary, BINARY) #undef NVLIST_EXISTS @@ -1198,6 +1572,37 @@ NVLIST_ADD(int, 
descriptor); #undef NVLIST_ADD +#define NVLIST_ADD_ARRAY(vtype, type) \ +void \ +nvlist_add_##type##_array(nvlist_t *nvl, const char *name, vtype value, \ + size_t nitems) \ +{ \ + nvpair_t *nvp; \ + \ + if (nvlist_error(nvl) != 0) { \ + ERRNO_SET(nvlist_error(nvl)); \ + return; \ + } \ + \ + nvp = nvpair_create_##type##_array(name, value, nitems); \ + if (nvp == NULL) { \ + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); \ + ERRNO_SET(nvl->nvl_error); \ + } else { \ + (void)nvlist_move_nvpair(nvl, nvp); \ + } \ +} + +NVLIST_ADD_ARRAY(const bool *, bool) +NVLIST_ADD_ARRAY(const uint64_t *, number) +NVLIST_ADD_ARRAY(const char * const *, string) +NVLIST_ADD_ARRAY(const nvlist_t * const *, nvlist) +#ifndef _KERNEL +NVLIST_ADD_ARRAY(const int *, descriptor) +#endif + +#undef NVLIST_ADD_ARRAY + bool nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp) { @@ -1306,6 +1711,131 @@ nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size) } } +void +nvlist_move_bool_array(nvlist_t *nvl, const char *name, bool *value, + size_t nitems) +{ + nvpair_t *nvp; + + if (nvlist_error(nvl) != 0) { + nv_free(value); + ERRNO_SET(nvlist_error(nvl)); + return; + } + + nvp = nvpair_move_bool_array(name, value, nitems); + if (nvp == NULL) { + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); + ERRNO_SET(nvl->nvl_error); + } else { + (void)nvlist_move_nvpair(nvl, nvp); + } +} + +void +nvlist_move_string_array(nvlist_t *nvl, const char *name, char **value, + size_t nitems) +{ + nvpair_t *nvp; + size_t i; + + if (nvlist_error(nvl) != 0) { + if (value != NULL) { + for (i = 0; i < nitems; i++) + nv_free(value[i]); + nv_free(value); + } + ERRNO_SET(nvlist_error(nvl)); + return; + } + + nvp = nvpair_move_string_array(name, value, nitems); + if (nvp == NULL) { + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); + ERRNO_SET(nvl->nvl_error); + } else { + (void)nvlist_move_nvpair(nvl, nvp); + } +} + +void +nvlist_move_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **value, + size_t nitems) +{ + nvpair_t *nvp; + size_t i; + + if (nvlist_error(nvl) != 0) { + if (value != NULL) { + for (i = 0; i < nitems; i++) { + if (nvlist_get_pararr(value[i], NULL) == NULL) + nvlist_destroy(value[i]); + } + } + nv_free(value); + ERRNO_SET(nvlist_error(nvl)); + return; + } + + nvp = nvpair_move_nvlist_array(name, value, nitems); + if (nvp == NULL) { + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); + ERRNO_SET(nvl->nvl_error); + } else { + (void)nvlist_move_nvpair(nvl, nvp); + } +} + +void +nvlist_move_number_array(nvlist_t *nvl, const char *name, uint64_t *value, + size_t nitems) +{ + nvpair_t *nvp; + + if (nvlist_error(nvl) != 0) { + nv_free(value); + ERRNO_SET(nvlist_error(nvl)); + return; + } + + nvp = nvpair_move_number_array(name, value, nitems); + if (nvp == NULL) { + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); + ERRNO_SET(nvl->nvl_error); + } else { + (void)nvlist_move_nvpair(nvl, nvp); + } +} + +#ifndef _KERNEL +void +nvlist_move_descriptor_array(nvlist_t *nvl, const char *name, int *value, + size_t nitems) +{ + nvpair_t *nvp; + size_t i; + + if (nvlist_error(nvl) != 0) { + if (value != 0) { + for (i = 0; i < nitems; i++) + close(value[i]); + nv_free(value); + } + + ERRNO_SET(nvlist_error(nvl)); + return; + } + + nvp = nvpair_move_descriptor_array(name, value, nitems); + if (nvp == NULL) { + nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); + ERRNO_SET(nvl->nvl_error); + } else { + (void)nvlist_move_nvpair(nvl, nvp); + } +} +#endif + const nvpair_t * nvlist_get_nvpair(const nvlist_t *nvl, const char *name) { @@ -1347,6 +1877,29 @@ 
nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep) return (nvpair_get_binary(nvp, sizep)); } +#define NVLIST_GET_ARRAY(ftype, type, TYPE) \ +ftype \ +nvlist_get_##type##_array(const nvlist_t *nvl, const char *name, \ + size_t *nitems) \ +{ \ + const nvpair_t *nvp; \ + \ + nvp = nvlist_find(nvl, NV_TYPE_##TYPE##_ARRAY, name); \ + if (nvp == NULL) \ + nvlist_report_missing(NV_TYPE_##TYPE##_ARRAY, name); \ + return (nvpair_get_##type##_array(nvp, nitems)); \ +} + +NVLIST_GET_ARRAY(const bool *, bool, BOOL) +NVLIST_GET_ARRAY(const uint64_t *, number, NUMBER) +NVLIST_GET_ARRAY(const char * const *, string, STRING) +NVLIST_GET_ARRAY(const nvlist_t * const *, nvlist, NVLIST) +#ifndef _KERNEL +NVLIST_GET_ARRAY(const int *, descriptor, DESCRIPTOR) +#endif + +#undef NVLIST_GET_ARRAY + #define NVLIST_TAKE(ftype, type, TYPE) \ ftype \ nvlist_take_##type(nvlist_t *nvl, const char *name) \ @@ -1389,6 +1942,31 @@ nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep) return (value); } +#define NVLIST_TAKE_ARRAY(ftype, type, TYPE) \ +ftype \ +nvlist_take_##type##_array(nvlist_t *nvl, const char *name, \ + size_t *nitems) \ +{ \ + nvpair_t *nvp; \ + ftype value; \ + \ + nvp = nvlist_find(nvl, NV_TYPE_##TYPE##_ARRAY, name); \ + if (nvp == NULL) \ + nvlist_report_missing(NV_TYPE_##TYPE##_ARRAY, name); \ + value = (ftype)(intptr_t)nvpair_get_##type##_array(nvp, nitems);\ + nvlist_remove_nvpair(nvl, nvp); \ + nvpair_free_structure(nvp); \ + return (value); \ +} + +NVLIST_TAKE_ARRAY(bool *, bool, BOOL) +NVLIST_TAKE_ARRAY(uint64_t *, number, NUMBER) +NVLIST_TAKE_ARRAY(char **, string, STRING) +NVLIST_TAKE_ARRAY(nvlist_t **, nvlist, NVLIST) +#ifndef _KERNEL +NVLIST_TAKE_ARRAY(int *, descriptor, DESCRIPTOR) +#endif + void nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) { @@ -1420,10 +1998,15 @@ NVLIST_FREE(bool, BOOL) NVLIST_FREE(number, NUMBER) NVLIST_FREE(string, STRING) NVLIST_FREE(nvlist, NVLIST) +NVLIST_FREE(binary, BINARY) +NVLIST_FREE(bool_array, BOOL_ARRAY) +NVLIST_FREE(number_array, NUMBER_ARRAY) +NVLIST_FREE(string_array, STRING_ARRAY) +NVLIST_FREE(nvlist_array, NVLIST_ARRAY) #ifndef _KERNEL NVLIST_FREE(descriptor, DESCRIPTOR) +NVLIST_FREE(descriptor_array, DESCRIPTOR_ARRAY) #endif -NVLIST_FREE(binary, BINARY) #undef NVLIST_FREE diff --git a/sys/contrib/libnv/nvlist_impl.h b/sys/contrib/libnv/nvlist_impl.h index 18ccebf..9952db8 100644 --- a/sys/contrib/libnv/nvlist_impl.h +++ b/sys/contrib/libnv/nvlist_impl.h @@ -1,5 +1,6 @@ /*- * Copyright (c) 2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org> * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from diff --git a/sys/contrib/libnv/nvpair.c b/sys/contrib/libnv/nvpair.c index 7146767..1e3bd0e 100644 --- a/sys/contrib/libnv/nvpair.c +++ b/sys/contrib/libnv/nvpair.c @@ -1,5 +1,6 @@ /*- * Copyright (c) 2009-2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org> * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from @@ -86,6 +87,7 @@ struct nvpair { int nvp_type; uint64_t nvp_data; size_t nvp_datasize; + size_t nvp_nitems; /* Used only for array types. 
*/ nvlist_t *nvp_list; TAILQ_ENTRY(nvpair) nvp_next; }; @@ -99,6 +101,7 @@ struct nvpair_header { uint8_t nvph_type; uint16_t nvph_namesize; uint64_t nvph_datasize; + uint64_t nvph_nitems; } __packed; @@ -109,6 +112,36 @@ nvpair_assert(const nvpair_t *nvp) NVPAIR_ASSERT(nvp); } +static nvpair_t * +nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize, + size_t nitems) +{ + nvpair_t *nvp; + size_t namelen; + + PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST); + + namelen = strlen(name); + if (namelen >= NV_NAME_MAX) { + ERRNO_SET(ENAMETOOLONG); + return (NULL); + } + + nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1); + if (nvp != NULL) { + nvp->nvp_name = (char *)(nvp + 1); + memcpy(nvp->nvp_name, name, namelen); + nvp->nvp_name[namelen] = '\0'; + nvp->nvp_type = type; + nvp->nvp_data = data; + nvp->nvp_datasize = datasize; + nvp->nvp_nitems = nitems; + nvp->nvp_magic = NVPAIR_MAGIC; + } + + return (nvp); +} + nvlist_t * nvpair_nvlist(const nvpair_t *nvp) { @@ -162,6 +195,19 @@ nvpair_remove_nvlist(nvpair_t *nvp) nvlist_set_parent(nvl, NULL); } +static void +nvpair_remove_nvlist_array(nvpair_t *nvp) +{ + nvlist_t **nvlarray; + size_t count, i; + + /* XXX: DECONST is bad, mkay? */ + nvlarray = __DECONST(nvlist_t **, + nvpair_get_nvlist_array(nvp, &count)); + for (i = 0; i < count; i++) + nvlist_set_array_next(nvlarray[i], NULL); +} + void nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl) { @@ -171,6 +217,8 @@ nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl) if (nvpair_type(nvp) == NV_TYPE_NVLIST) nvpair_remove_nvlist(nvp); + else if (nvpair_type(nvp) == NV_TYPE_NVLIST_ARRAY) + nvpair_remove_nvlist_array(nvp); TAILQ_REMOVE(head, nvp, nvp_next); nvp->nvp_list = NULL; @@ -204,16 +252,36 @@ nvpair_clone(const nvpair_t *nvp) case NV_TYPE_NVLIST: newnvp = nvpair_create_nvlist(name, nvpair_get_nvlist(nvp)); break; + case NV_TYPE_BINARY: + data = nvpair_get_binary(nvp, &datasize); + newnvp = nvpair_create_binary(name, data, datasize); + break; + case NV_TYPE_BOOL_ARRAY: + data = nvpair_get_bool_array(nvp, &datasize); + newnvp = nvpair_create_bool_array(name, data, datasize); + break; + case NV_TYPE_NUMBER_ARRAY: + data = nvpair_get_number_array(nvp, &datasize); + newnvp = nvpair_create_number_array(name, data, datasize); + break; + case NV_TYPE_STRING_ARRAY: + data = nvpair_get_string_array(nvp, &datasize); + newnvp = nvpair_create_string_array(name, data, datasize); + break; + case NV_TYPE_NVLIST_ARRAY: + data = nvpair_get_nvlist_array(nvp, &datasize); + newnvp = nvpair_create_nvlist_array(name, data, datasize); + break; #ifndef _KERNEL case NV_TYPE_DESCRIPTOR: newnvp = nvpair_create_descriptor(name, nvpair_get_descriptor(nvp)); break; -#endif - case NV_TYPE_BINARY: - data = nvpair_get_binary(nvp, &datasize); - newnvp = nvpair_create_binary(name, data, datasize); + case NV_TYPE_DESCRIPTOR_ARRAY: + data = nvpair_get_descriptor_array(nvp, &datasize); + newnvp = nvpair_create_descriptor_array(name, data, datasize); break; +#endif default: PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp)); } @@ -250,6 +318,7 @@ nvpair_pack_header(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp) PJDLOG_ASSERT(namesize > 0 && namesize <= UINT16_MAX); nvphdr.nvph_namesize = namesize; nvphdr.nvph_datasize = nvp->nvp_datasize; + nvphdr.nvph_nitems = nvp->nvp_nitems; PJDLOG_ASSERT(*leftp >= sizeof(nvphdr)); memcpy(ptr, &nvphdr, sizeof(nvphdr)); ptr += sizeof(nvphdr); @@ -336,6 +405,32 @@ nvpair_pack_nvlist_up(unsigned char *ptr, size_t 
*leftp) nvphdr.nvph_type = NV_TYPE_NVLIST_UP; nvphdr.nvph_namesize = namesize; nvphdr.nvph_datasize = 0; + nvphdr.nvph_nitems = 0; PJDLOG_ASSERT(*leftp >= sizeof(nvphdr)); memcpy(ptr, &nvphdr, sizeof(nvphdr)); ptr += sizeof(nvphdr); *leftp -= sizeof(nvphdr); + + PJDLOG_ASSERT(*leftp >= namesize); + memcpy(ptr, name, namesize); + ptr += namesize; + *leftp -= namesize; + + return (ptr); +} + +unsigned char * +nvpair_pack_nvlist_array_next(unsigned char *ptr, size_t *leftp) +{ + struct nvpair_header nvphdr; + size_t namesize; + const char *name = ""; + + namesize = 1; + nvphdr.nvph_type = NV_TYPE_NVLIST_ARRAY_NEXT; + nvphdr.nvph_namesize = namesize; + nvphdr.nvph_datasize = 0; + nvphdr.nvph_nitems = 0; PJDLOG_ASSERT(*leftp >= sizeof(nvphdr)); memcpy(ptr, &nvphdr, sizeof(nvphdr)); ptr += sizeof(nvphdr); @@ -396,6 +491,106 @@ nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp) return (ptr); } +unsigned char * +nvpair_pack_bool_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp) +{ + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY); + PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize); + + memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize); + ptr += nvp->nvp_datasize; + *leftp -= nvp->nvp_datasize; + + return (ptr); +} + +unsigned char * +nvpair_pack_number_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp) +{ + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY); + PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize); + + memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize); + ptr += nvp->nvp_datasize; + *leftp -= nvp->nvp_datasize; + + return (ptr); +} + +unsigned char * +nvpair_pack_string_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp) +{ + unsigned int ii; + size_t size, len; + const char * const *array; + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY); + PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize); + + size = 0; + array = nvpair_get_string_array(nvp, NULL); + PJDLOG_ASSERT(array != NULL); + + for (ii = 0; ii < nvp->nvp_nitems; ii++) { + len = strlen(array[ii]) + 1; + PJDLOG_ASSERT(*leftp >= len); + + memcpy(ptr, (const void *)array[ii], len); + size += len; + ptr += len; + *leftp -= len; + } + + PJDLOG_ASSERT(size == nvp->nvp_datasize); + + return (ptr); +} + +#ifndef _KERNEL +unsigned char * +nvpair_pack_descriptor_array(const nvpair_t *nvp, unsigned char *ptr, + int64_t *fdidxp, size_t *leftp) +{ + int64_t value; + const int *array; + unsigned int ii; + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY); + PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize); + + array = nvpair_get_descriptor_array(nvp, NULL); + PJDLOG_ASSERT(array != NULL); + + for (ii = 0; ii < nvp->nvp_nitems; ii++) { + PJDLOG_ASSERT(*leftp >= sizeof(value)); + + value = array[ii]; + if (value != -1) { + /* + * If there is a real descriptor here, we change its + * number to its position in the array of descriptors + * sent via the control message.
+ */ + PJDLOG_ASSERT(fdidxp != NULL); + + value = *fdidxp; + (*fdidxp)++; + } + memcpy(ptr, &value, sizeof(value)); + ptr += sizeof(value); + *leftp -= sizeof(value); + } + + return (ptr); +} +#endif + void nvpair_init_datasize(nvpair_t *nvp) { @@ -430,7 +625,8 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr, goto failed; #endif if (nvphdr.nvph_type > NV_TYPE_LAST && - nvphdr.nvph_type != NV_TYPE_NVLIST_UP) { + nvphdr.nvph_type != NV_TYPE_NVLIST_UP && + nvphdr.nvph_type != NV_TYPE_NVLIST_ARRAY_NEXT) { goto failed; } @@ -467,6 +663,7 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr, nvp->nvp_type = nvphdr.nvph_type; nvp->nvp_data = 0; nvp->nvp_datasize = nvphdr.nvph_datasize; + nvp->nvp_nitems = nvphdr.nvph_nitems; return (ptr); failed: @@ -540,6 +737,7 @@ nvpair_unpack_number(bool isbe, nvpair_t *nvp, const unsigned char *ptr, nvp->nvp_data = be64dec(ptr); else nvp->nvp_data = le64dec(ptr); + ptr += sizeof(uint64_t); *leftp -= sizeof(uint64_t); @@ -670,6 +868,234 @@ nvpair_unpack_binary(bool isbe __unused, nvpair_t *nvp, } const unsigned char * +nvpair_unpack_bool_array(bool isbe __unused, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp) +{ + uint8_t *value; + size_t size; + unsigned int i; + + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY); + + size = sizeof(*value) * nvp->nvp_nitems; + if (nvp->nvp_datasize != size || *leftp < size || + nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) { + ERRNO_SET(EINVAL); + return (NULL); + } + + value = nv_malloc(size); + if (value == NULL) + return (NULL); + + for (i = 0; i < nvp->nvp_nitems; i++) { + value[i] = *(const uint8_t *)ptr; + + ptr += sizeof(*value); + *leftp -= sizeof(*value); + } + + nvp->nvp_data = (uint64_t)(uintptr_t)value; + + return (ptr); +} + +const unsigned char * +nvpair_unpack_number_array(bool isbe, nvpair_t *nvp, const unsigned char *ptr, + size_t *leftp) +{ + uint64_t *value; + size_t size; + unsigned int i; + + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY); + + size = sizeof(*value) * nvp->nvp_nitems; + if (nvp->nvp_datasize != size || *leftp < size || + nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) { + ERRNO_SET(EINVAL); + return (NULL); + } + + value = nv_malloc(size); + if (value == NULL) + return (NULL); + + for (i = 0; i < nvp->nvp_nitems; i++) { + if (isbe) + value[i] = be64dec(ptr); + else + value[i] = le64dec(ptr); + + ptr += sizeof(*value); + *leftp -= sizeof(*value); + } + + nvp->nvp_data = (uint64_t)(uintptr_t)value; + + return (ptr); +} + +const unsigned char * +nvpair_unpack_string_array(bool isbe __unused, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp) +{ + ssize_t size; + size_t len; + const char *tmp; + char **value; + unsigned int ii, j; + + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY); + + if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0 || + nvp->nvp_nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + size = nvp->nvp_datasize; + tmp = (const char *)ptr; + for (ii = 0; ii < nvp->nvp_nitems; ii++) { + len = strnlen(tmp, size - 1) + 1; + size -= len; + if (size < 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + tmp += len; + } + if (size != 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + value = nv_malloc(sizeof(*value) * nvp->nvp_nitems); + if (value == NULL) + return (NULL); + + for (ii = 0; ii < nvp->nvp_nitems; ii++) { + value[ii] = nv_strdup((const char *)ptr); + if (value[ii] == NULL) + goto out; + len = strlen(value[ii]) + 1; + ptr += len; + *leftp -= len; + } + nvp->nvp_data = 
(uint64_t)(uintptr_t)value; + + return (ptr); +out: + for (j = 0; j < ii; j++) + nv_free(value[j]); + nv_free(value); + return (NULL); +} + +#ifndef _KERNEL +const unsigned char * +nvpair_unpack_descriptor_array(bool isbe, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds) +{ + int64_t idx; + size_t size; + unsigned int ii; + int *array; + + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY); + + size = sizeof(idx) * nvp->nvp_nitems; + if (nvp->nvp_datasize != size || *leftp < size || + nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) { + ERRNO_SET(EINVAL); + return (NULL); + } + + array = (int *)nv_malloc(size); + if (array == NULL) + return (NULL); + + for (ii = 0; ii < nvp->nvp_nitems; ii++) { + if (isbe) + idx = be64dec(ptr); + else + idx = le64dec(ptr); + + if (idx < 0) { + ERRNO_SET(EINVAL); + nv_free(array); + return (NULL); + } + + if ((size_t)idx >= nfds) { + ERRNO_SET(EINVAL); + nv_free(array); + return (NULL); + } + + array[ii] = (uint64_t)fds[idx]; + + ptr += sizeof(idx); + *leftp -= sizeof(idx); + } + + nvp->nvp_data = (uint64_t)(uintptr_t)array; + + return (ptr); +} +#endif + +const unsigned char * +nvpair_unpack_nvlist_array(bool isbe __unused, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp, nvlist_t **firstel) +{ + nvlist_t **value; + nvpair_t *tmpnvp; + unsigned int ii, j; + size_t sizeup; + + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST_ARRAY); + + sizeup = sizeof(struct nvpair_header) * nvp->nvp_nitems; + if (nvp->nvp_nitems == 0 || sizeup < nvp->nvp_nitems || + sizeup > *leftp) { + ERRNO_SET(EINVAL); + return (NULL); + } + + value = nv_malloc(nvp->nvp_nitems * sizeof(*value)); + if (value == NULL) + return (NULL); + + for (ii = 0; ii < nvp->nvp_nitems; ii++) { + value[ii] = nvlist_create(0); + if (value[ii] == NULL) + goto fail; + if (ii > 0) { + tmpnvp = nvpair_allocv(" ", NV_TYPE_NVLIST, + (uint64_t)(uintptr_t)value[ii], 0, 0); + if (tmpnvp == NULL) + goto fail; + nvlist_set_array_next(value[ii - 1], tmpnvp); + } + } + nvlist_set_flags(value[nvp->nvp_nitems - 1], NV_FLAG_IN_ARRAY); + + nvp->nvp_data = (uint64_t)(uintptr_t)value; + *firstel = value[0]; + + return (ptr); +fail: + ERRNO_SAVE(); + for (j = 0; j < ii; j++) + nvlist_destroy(value[j]); + nv_free(value); + ERRNO_RESTORE(); + + return (NULL); +} + +const unsigned char * nvpair_unpack(bool isbe, const unsigned char *ptr, size_t *leftp, nvpair_t **nvpp) { @@ -717,34 +1143,6 @@ nvpair_name(const nvpair_t *nvp) return (nvp->nvp_name); } -static nvpair_t * -nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize) -{ - nvpair_t *nvp; - size_t namelen; - - PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST); - - namelen = strlen(name); - if (namelen >= NV_NAME_MAX) { - ERRNO_SET(ENAMETOOLONG); - return (NULL); - } - - nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1); - if (nvp != NULL) { - nvp->nvp_name = (char *)(nvp + 1); - memcpy(nvp->nvp_name, name, namelen); - nvp->nvp_name[namelen] = '\0'; - nvp->nvp_type = type; - nvp->nvp_data = data; - nvp->nvp_datasize = datasize; - nvp->nvp_magic = NVPAIR_MAGIC; - } - - return (nvp); -} - nvpair_t * nvpair_create_stringf(const char *name, const char *valuefmt, ...) { @@ -778,7 +1176,7 @@ nvpair_t * nvpair_create_null(const char *name) { - return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0)); + return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0, 0)); } nvpair_t * @@ -786,14 +1184,14 @@ nvpair_create_bool(const char *name, bool value) { return (nvpair_allocv(name, NV_TYPE_BOOL, value ? 
1 : 0, - sizeof(uint8_t))); + sizeof(uint8_t), 0)); } nvpair_t * nvpair_create_number(const char *name, uint64_t value) { - return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value))); + return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value), 0)); } nvpair_t * @@ -814,7 +1212,7 @@ nvpair_create_string(const char *name, const char *value) size = strlen(value) + 1; nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)data, - size); + size, 0); if (nvp == NULL) nv_free(data); @@ -836,7 +1234,8 @@ nvpair_create_nvlist(const char *name, const nvlist_t *value) if (nvl == NULL) return (NULL); - nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0); + nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0, + 0); if (nvp == NULL) nvlist_destroy(nvl); else @@ -861,7 +1260,7 @@ nvpair_create_descriptor(const char *name, int value) return (NULL); nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value, - sizeof(int64_t)); + sizeof(int64_t), 0); if (nvp == NULL) { ERRNO_SAVE(); close(value); @@ -889,7 +1288,7 @@ nvpair_create_binary(const char *name, const void *value, size_t size) memcpy(data, value, size); nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)data, - size); + size, 0); if (nvp == NULL) nv_free(data); @@ -897,6 +1296,226 @@ nvpair_create_binary(const char *name, const void *value, size_t size) } nvpair_t * +nvpair_create_bool_array(const char *name, const bool *value, size_t nitems) +{ + nvpair_t *nvp; + size_t size; + void *data; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + size = sizeof(value[0]) * nitems; + data = nv_malloc(size); + if (data == NULL) + return (NULL); + + memcpy(data, value, size); + nvp = nvpair_allocv(name, NV_TYPE_BOOL_ARRAY, (uint64_t)(uintptr_t)data, + size, nitems); + if (nvp == NULL) { + ERRNO_SAVE(); + nv_free(data); + ERRNO_RESTORE(); + } + + return (nvp); +} + +nvpair_t * +nvpair_create_number_array(const char *name, const uint64_t *value, + size_t nitems) +{ + nvpair_t *nvp; + size_t size; + void *data; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + size = sizeof(value[0]) * nitems; + data = nv_malloc(size); + if (data == NULL) + return (NULL); + + memcpy(data, value, size); + nvp = nvpair_allocv(name, NV_TYPE_NUMBER_ARRAY, + (uint64_t)(uintptr_t)data, size, nitems); + if (nvp == NULL) { + ERRNO_SAVE(); + nv_free(data); + ERRNO_RESTORE(); + } + + return (nvp); +} + +nvpair_t * +nvpair_create_string_array(const char *name, const char * const *value, + size_t nitems) +{ + nvpair_t *nvp; + unsigned int ii; + size_t datasize, size; + char **data; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + nvp = NULL; + datasize = 0; + data = nv_malloc(sizeof(value[0]) * nitems); + if (data == NULL) + return (NULL); + + for (ii = 0; ii < nitems; ii++) { + if (value[ii] == NULL) { + ERRNO_SET(EINVAL); + goto fail; + } + + size = strlen(value[ii]) + 1; + datasize += size; + data[ii] = nv_strdup(value[ii]); + if (data[ii] == NULL) + goto fail; + } + nvp = nvpair_allocv(name, NV_TYPE_STRING_ARRAY, + (uint64_t)(uintptr_t)data, datasize, nitems); + +fail: + if (nvp == NULL) { + ERRNO_SAVE(); + for (; ii > 0; ii--) + nv_free(data[ii - 1]); + nv_free(data); + ERRNO_RESTORE(); + } + + return (nvp); +} + +nvpair_t * +nvpair_create_nvlist_array(const char *name, const nvlist_t * const *value, + size_t nitems) +{ + unsigned int ii; + nvlist_t **nvls; + nvpair_t *nvp; + int flags; + + nvp = NULL; 
+ nvls = NULL; + ii = 0; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + nvls = nv_malloc(sizeof(value[0]) * nitems); + if (nvls == NULL) + return (NULL); + + for (ii = 0; ii < nitems; ii++) { + if (value[ii] == NULL) { + ERRNO_SET(EINVAL); + goto fail; + } + + nvls[ii] = nvlist_clone(value[ii]); + if (nvls[ii] == NULL) + goto fail; + + if (ii > 0) { + nvp = nvpair_allocv(" ", NV_TYPE_NVLIST, + (uint64_t)(uintptr_t)nvls[ii], 0, 0); + if (nvp == NULL) { + ERRNO_SAVE(); + nvlist_destroy(nvls[ii]); + ERRNO_RESTORE(); + goto fail; + } + nvlist_set_array_next(nvls[ii - 1], nvp); + } + } + flags = nvlist_flags(nvls[nitems - 1]) | NV_FLAG_IN_ARRAY; + nvlist_set_flags(nvls[nitems - 1], flags); + + nvp = nvpair_allocv(name, NV_TYPE_NVLIST_ARRAY, + (uint64_t)(uintptr_t)nvls, 0, nitems); + +fail: + if (nvp == NULL) { + ERRNO_SAVE(); + for (; ii > 0; ii--) + nvlist_destroy(nvls[ii - 1]); + + nv_free(nvls); + ERRNO_RESTORE(); + } else { + for (ii = 0; ii < nitems; ii++) + nvlist_set_parent(nvls[ii], nvp); + } + + return (nvp); +} + +#ifndef _KERNEL +nvpair_t * +nvpair_create_descriptor_array(const char *name, const int *value, + size_t nitems) +{ + unsigned int ii; + nvpair_t *nvp; + int *fds; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + nvp = NULL; + + fds = nv_malloc(sizeof(value[0]) * nitems); + if (fds == NULL) + return (NULL); + for (ii = 0; ii < nitems; ii++) { + if (value[ii] == -1) { + fds[ii] = -1; + } else { + if (!fd_is_valid(value[ii])) { + ERRNO_SET(EBADF); + goto fail; + } + + fds[ii] = fcntl(value[ii], F_DUPFD_CLOEXEC, 0); + if (fds[ii] == -1) + goto fail; + } + } + + nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR_ARRAY, + (uint64_t)(uintptr_t)fds, sizeof(int64_t) * nitems, nitems); + +fail: + if (nvp == NULL) { + ERRNO_SAVE(); + for (; ii > 0; ii--) { + if (fds[ii - 1] != -1) + close(fds[ii - 1]); + } + nv_free(fds); + ERRNO_RESTORE(); + } + + return (nvp); +} +#endif + +nvpair_t * nvpair_move_string(const char *name, char *value) { nvpair_t *nvp; @@ -907,7 +1526,7 @@ nvpair_move_string(const char *name, char *value) } nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)value, - strlen(value) + 1); + strlen(value) + 1, 0); if (nvp == NULL) { ERRNO_SAVE(); nv_free(value); @@ -934,7 +1553,7 @@ nvpair_move_nvlist(const char *name, nvlist_t *value) } nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)value, - 0); + 0, 0); if (nvp == NULL) nvlist_destroy(value); else @@ -955,7 +1574,7 @@ nvpair_move_descriptor(const char *name, int value) } nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value, - sizeof(int64_t)); + sizeof(int64_t), 0); if (nvp == NULL) { ERRNO_SAVE(); close(value); @@ -977,7 +1596,83 @@ nvpair_move_binary(const char *name, void *value, size_t size) } nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)value, - size); + size, 0); + if (nvp == NULL) { + ERRNO_SAVE(); + nv_free(value); + ERRNO_RESTORE(); + } + + return (nvp); +} + +nvpair_t * +nvpair_move_bool_array(const char *name, bool *value, size_t nitems) +{ + nvpair_t *nvp; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + nvp = nvpair_allocv(name, NV_TYPE_BOOL_ARRAY, + (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems); + if (nvp == NULL) { + ERRNO_SAVE(); + nv_free(value); + ERRNO_RESTORE(); + } + + return (nvp); +} + +nvpair_t * +nvpair_move_string_array(const char *name, char **value, size_t nitems) +{ + nvpair_t *nvp; + size_t i, size; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + size = 0; +
for (i = 0; i < nitems; i++) { + if (value[i] == NULL) { + ERRNO_SET(EINVAL); + return (NULL); + } + + size += strlen(value[i]) + 1; + } + + nvp = nvpair_allocv(name, NV_TYPE_STRING_ARRAY, + (uint64_t)(uintptr_t)value, size, nitems); + if (nvp == NULL) { + ERRNO_SAVE(); + for (i = 0; i < nitems; i++) + nv_free(value[i]); + nv_free(value); + ERRNO_RESTORE(); + } + + return (nvp); +} + +nvpair_t * +nvpair_move_number_array(const char *name, uint64_t *value, size_t nitems) +{ + nvpair_t *nvp; + + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + nvp = nvpair_allocv(name, NV_TYPE_NUMBER_ARRAY, + (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems); if (nvp == NULL) { ERRNO_SAVE(); nv_free(value); @@ -987,6 +1682,95 @@ nvpair_move_binary(const char *name, void *value, size_t size) return (nvp); } +nvpair_t * +nvpair_move_nvlist_array(const char *name, nvlist_t **value, size_t nitems) +{ + unsigned int ii; + nvpair_t *nvp; + int flags; + + nvp = NULL; + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + for (ii = 0; ii < nitems; ii++) { + if (value[ii] == NULL || nvlist_error(value[ii]) != 0 || + nvlist_get_pararr(value[ii], NULL) != NULL) { + ERRNO_SET(EINVAL); + goto fail; + } + if (ii > 0) { + nvp = nvpair_allocv(" ", NV_TYPE_NVLIST, + (uint64_t)(uintptr_t)value[ii], 0, 0); + if (nvp == NULL) + goto fail; + nvlist_set_array_next(value[ii - 1], nvp); + } + } + flags = nvlist_flags(value[nitems - 1]) | NV_FLAG_IN_ARRAY; + nvlist_set_flags(value[nitems - 1], flags); + + nvp = nvpair_allocv(name, NV_TYPE_NVLIST_ARRAY, + (uint64_t)(uintptr_t)value, 0, nitems); +fail: + if (nvp == NULL) { + ERRNO_SAVE(); + for (ii = 0; ii < nitems; ii++) { + if (value[ii] != NULL && + nvlist_get_pararr(value[ii], NULL) == NULL) { + nvlist_destroy(value[ii]); + } + } + nv_free(value); + ERRNO_RESTORE(); + } else { + for (ii = 0; ii < nitems; ii++) + nvlist_set_parent(value[ii], nvp); + } + + return (nvp); +} + +#ifndef _KERNEL +nvpair_t * +nvpair_move_descriptor_array(const char *name, int *value, size_t nitems) +{ + nvpair_t *nvp; + size_t i; + + nvp = NULL; + if (value == NULL || nitems == 0) { + ERRNO_SET(EINVAL); + return (NULL); + } + + for (i = 0; i < nitems; i++) { + if (value[i] != -1 && !fd_is_valid(value[i])) { + ERRNO_SET(EBADF); + goto fail; + } + } + + nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR_ARRAY, + (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems); + +fail: + if (nvp == NULL) { + ERRNO_SAVE(); + for (i = 0; i < nitems; i++) { + if (fd_is_valid(value[i])) + close(value[i]); + } + nv_free(value); + ERRNO_RESTORE(); + } + + return (nvp); +} +#endif + bool nvpair_get_bool(const nvpair_t *nvp) { @@ -1046,12 +1830,81 @@ nvpair_get_binary(const nvpair_t *nvp, size_t *sizep) if (sizep != NULL) *sizep = nvp->nvp_datasize; + return ((const void *)(intptr_t)nvp->nvp_data); } +const bool * +nvpair_get_bool_array(const nvpair_t *nvp, size_t *nitems) +{ + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY); + + if (nitems != NULL) + *nitems = nvp->nvp_nitems; + + return ((const bool *)(intptr_t)nvp->nvp_data); +} + +const uint64_t * +nvpair_get_number_array(const nvpair_t *nvp, size_t *nitems) +{ + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY); + + if (nitems != NULL) + *nitems = nvp->nvp_nitems; + + return ((const uint64_t *)(intptr_t)nvp->nvp_data); +} + +const char * const * +nvpair_get_string_array(const nvpair_t *nvp, size_t *nitems) +{ + + NVPAIR_ASSERT(nvp); +
PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY); + + if (nitems != NULL) + *nitems = nvp->nvp_nitems; + + return ((const char * const *)(intptr_t)nvp->nvp_data); +} + +const nvlist_t * const * +nvpair_get_nvlist_array(const nvpair_t *nvp, size_t *nitems) +{ + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST_ARRAY); + + if (nitems != NULL) + *nitems = nvp->nvp_nitems; + + return ((const nvlist_t * const *)((intptr_t)nvp->nvp_data)); +} + +#ifndef _KERNEL +const int * +nvpair_get_descriptor_array(const nvpair_t *nvp, size_t *nitems) +{ + + NVPAIR_ASSERT(nvp); + PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY); + + if (nitems != NULL) + *nitems = nvp->nvp_nitems; + + return ((const int *)(intptr_t)nvp->nvp_data); +} +#endif + void nvpair_free(nvpair_t *nvp) { + size_t i; NVPAIR_ASSERT(nvp); PJDLOG_ASSERT(nvp->nvp_list == NULL); @@ -1062,6 +1915,10 @@ nvpair_free(nvp) case NV_TYPE_DESCRIPTOR: close((int)nvp->nvp_data); break; + case NV_TYPE_DESCRIPTOR_ARRAY: + for (i = 0; i < nvp->nvp_nitems; i++) + close(((int *)(intptr_t)nvp->nvp_data)[i]); + nv_free((int *)(intptr_t)nvp->nvp_data); + break; #endif case NV_TYPE_NVLIST: nvlist_destroy((nvlist_t *)(intptr_t)nvp->nvp_data); @@ -1072,6 +1929,23 @@ case NV_TYPE_BINARY: nv_free((void *)(intptr_t)nvp->nvp_data); break; + case NV_TYPE_NVLIST_ARRAY: + for (i = 0; i < nvp->nvp_nitems; i++) { + nvlist_destroy( + ((nvlist_t **)(intptr_t)nvp->nvp_data)[i]); + } + nv_free(((nvlist_t **)(intptr_t)nvp->nvp_data)); + break; + case NV_TYPE_NUMBER_ARRAY: + nv_free((uint64_t *)(intptr_t)nvp->nvp_data); + break; + case NV_TYPE_BOOL_ARRAY: + nv_free((bool *)(intptr_t)nvp->nvp_data); + break; + case NV_TYPE_STRING_ARRAY: + for (i = 0; i < nvp->nvp_nitems; i++) + nv_free(((char **)(intptr_t)nvp->nvp_data)[i]); + nv_free((char **)(intptr_t)nvp->nvp_data); + break; } nv_free(nvp); } @@ -1106,6 +1980,16 @@ nvpair_type_string(int type) return ("DESCRIPTOR"); case NV_TYPE_BINARY: return ("BINARY"); + case NV_TYPE_BOOL_ARRAY: + return ("BOOL ARRAY"); + case NV_TYPE_NUMBER_ARRAY: + return ("NUMBER ARRAY"); + case NV_TYPE_STRING_ARRAY: + return ("STRING ARRAY"); + case NV_TYPE_NVLIST_ARRAY: + return ("NVLIST ARRAY"); + case NV_TYPE_DESCRIPTOR_ARRAY: + return ("DESCRIPTOR ARRAY"); default: return ("<UNKNOWN>"); } diff --git a/sys/contrib/libnv/nvpair_impl.h b/sys/contrib/libnv/nvpair_impl.h index fed7725..0350b1c 100644 --- a/sys/contrib/libnv/nvpair_impl.h +++ b/sys/contrib/libnv/nvpair_impl.h @@ -1,5 +1,6 @@ /*- * Copyright (c) 2009-2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org> * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from @@ -71,6 +72,15 @@ unsigned char *nvpair_pack_descriptor(const nvpair_t *nvp, unsigned char *ptr, unsigned char *nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp); unsigned char *nvpair_pack_nvlist_up(unsigned char *ptr, size_t *leftp); +unsigned char *nvpair_pack_bool_array(const nvpair_t *nvp, unsigned char *ptr, + size_t *leftp); +unsigned char *nvpair_pack_number_array(const nvpair_t *nvp, unsigned char *ptr, + size_t *leftp); +unsigned char *nvpair_pack_string_array(const nvpair_t *nvp, unsigned char *ptr, + size_t *leftp); +unsigned char *nvpair_pack_descriptor_array(const nvpair_t *nvp, + unsigned char *ptr, int64_t *fdidxp, size_t *leftp); +unsigned char *nvpair_pack_nvlist_array_next(unsigned char *ptr, size_t *leftp); /* Unpack data functions.
*/ const unsigned char *nvpair_unpack_header(bool isbe, nvpair_t *nvp, @@ -89,5 +99,15 @@ const unsigned char *nvpair_unpack_descriptor(bool isbe, nvpair_t *nvp, const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds); const unsigned char *nvpair_unpack_binary(bool isbe, nvpair_t *nvp, const unsigned char *ptr, size_t *leftp); +const unsigned char *nvpair_unpack_bool_array(bool isbe, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp); +const unsigned char *nvpair_unpack_number_array(bool isbe, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp); +const unsigned char *nvpair_unpack_string_array(bool isbe, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp); +const unsigned char *nvpair_unpack_descriptor_array(bool isbe, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds); +const unsigned char *nvpair_unpack_nvlist_array(bool isbe, nvpair_t *nvp, + const unsigned char *ptr, size_t *leftp, nvlist_t **firstel); #endif /* !_NVPAIR_IMPL_H_ */ diff --git a/sys/dev/ata/ata-all.c b/sys/dev/ata/ata-all.c index 52db44d..118e38e 100644 --- a/sys/dev/ata/ata-all.c +++ b/sys/dev/ata/ata-all.c @@ -64,18 +64,15 @@ static void ata_cam_end_transaction(device_t dev, struct ata_request *request); static void ata_cam_request_sense(device_t dev, struct ata_request *request); static int ata_check_ids(device_t dev, union ccb *ccb); static void ata_conn_event(void *context, int dummy); -static void ata_init(void); static void ata_interrupt_locked(void *data); static int ata_module_event_handler(module_t mod, int what, void *arg); static void ata_periodic_poll(void *data); static int ata_str2mode(const char *str); -static void ata_uninit(void); /* global vars */ MALLOC_DEFINE(M_ATA, "ata_generic", "ATA driver generic layer"); int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data) = NULL; devclass_t ata_devclass; -uma_zone_t ata_request_zone; int ata_dma_check_80pin = 1; /* sysctl vars */ @@ -650,12 +647,7 @@ ata_cam_begin_transaction(device_t dev, union ccb *ccb) struct ata_channel *ch = device_get_softc(dev); struct ata_request *request; - if (!(request = ata_alloc_request())) { - device_printf(dev, "FAILURE - out of memory in start\n"); - ccb->ccb_h.status = CAM_REQ_INVALID; - xpt_done(ccb); - return; - } + request = &ch->request; bzero(request, sizeof(*request)); /* setup request */ @@ -794,7 +786,6 @@ ata_cam_process_sense(device_t dev, struct ata_request *request) ccb->ccb_h.status |= CAM_AUTOSENSE_FAIL; } - ata_free_request(request); xpt_done(ccb); /* Do error recovery if needed. */ if (fatalerr) @@ -865,10 +856,8 @@ ata_cam_end_transaction(device_t dev, struct ata_request *request) if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR && (ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0) ata_cam_request_sense(dev, request); - else { - ata_free_request(request); + else xpt_done(ccb); - } /* Do error recovery if needed. 
*/ if (fatalerr) ata_reinit(dev); @@ -1148,18 +1137,3 @@ static moduledata_t ata_moduledata = { "ata", ata_module_event_handler, NULL }; DECLARE_MODULE(ata, ata_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND); MODULE_VERSION(ata, 1); MODULE_DEPEND(ata, cam, 1, 1, 1); - -static void -ata_init(void) -{ - ata_request_zone = uma_zcreate("ata_request", sizeof(struct ata_request), - NULL, NULL, NULL, NULL, 0, 0); -} -SYSINIT(ata_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_init, NULL); - -static void -ata_uninit(void) -{ - uma_zdestroy(ata_request_zone); -} -SYSUNINIT(ata_unregister, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_uninit, NULL); diff --git a/sys/dev/ata/ata-all.h b/sys/dev/ata/ata-all.h index 19cb7ef..cf8ed78 100644 --- a/sys/dev/ata/ata-all.h +++ b/sys/dev/ata/ata-all.h @@ -450,6 +450,7 @@ struct ata_channel { struct ata_cam_device curr[16]; /* Current settings */ int requestsense; /* CCB waiting for SENSE. */ struct callout poll_callout; /* Periodic status poll. */ + struct ata_request request; }; /* disk bay/enclosure related */ @@ -507,14 +508,6 @@ int ata_sata_getrev(device_t dev, int target); int ata_request2fis_h2d(struct ata_request *request, u_int8_t *fis); void ata_pm_identify(device_t dev); -/* macros for alloc/free of struct ata_request */ -extern uma_zone_t ata_request_zone; -#define ata_alloc_request() uma_zalloc(ata_request_zone, M_NOWAIT | M_ZERO) -#define ata_free_request(request) { \ - if (!(request->flags & ATA_R_DANGER2)) \ - uma_zfree(ata_request_zone, request); \ - } - MALLOC_DECLARE(M_ATA); /* misc newbus defines */ diff --git a/sys/dev/ath/if_ath.c b/sys/dev/ath/if_ath.c index 2c935a2b..26bf591 100644 --- a/sys/dev/ath/if_ath.c +++ b/sys/dev/ath/if_ath.c @@ -1473,7 +1473,7 @@ ath_vap_create(struct ieee80211com *ic, const char name[IFNAMSIZ], int unit, const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t mac0[IEEE80211_ADDR_LEN]) { - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_vap *avp; struct ieee80211vap *vap; uint8_t mac[IEEE80211_ADDR_LEN]; @@ -1732,7 +1732,7 @@ ath_vap_delete(struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; struct ath_vap *avp = ATH_VAP(vap); @@ -2340,7 +2340,7 @@ ath_fatal_proc(void *arg, int pending) static void ath_bmiss_vap(struct ieee80211vap *vap) { - struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; /* * Workaround phantom bmiss interrupts by sanity-checking @@ -2361,8 +2361,6 @@ ath_bmiss_vap(struct ieee80211vap *vap) ATH_UNLOCK(sc); if ((vap->iv_flags_ext & IEEE80211_FEXT_SWBMISS) == 0) { - struct ifnet *ifp = vap->iv_ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; u_int64_t lastrx = sc->sc_lastrx; u_int64_t tsf = ath_hal_gettsf64(sc->sc_ah); /* XXX should take a locked ref to iv_bss */ @@ -2851,8 +2849,8 @@ ath_stop(struct ifnet *ifp) int ath_reset(struct ifnet *ifp, ATH_RESET_TYPE reset_type) { - struct ath_softc *sc = ifp->if_softc; struct ieee80211com *ic = ifp->if_l2com; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; HAL_STATUS status; int i; @@ -3045,7 +3043,7 @@ ath_reset_vap(struct ieee80211vap *vap, u_long cmd) { struct ieee80211com *ic = vap->iv_ic; struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; switch (cmd) { @@ -3248,7 +3246,7 @@ static int 
ath_transmit(struct ifnet *ifp, struct mbuf *m) { struct ieee80211com *ic = ifp->if_l2com; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ieee80211_node *ni; struct mbuf *next; struct ath_buf *bf; @@ -3538,8 +3536,7 @@ ath_media_change(struct ifnet *ifp) static void ath_key_update_begin(struct ieee80211vap *vap) { - struct ifnet *ifp = vap->iv_ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; DPRINTF(sc, ATH_DEBUG_KEYCACHE, "%s:\n", __func__); taskqueue_block(sc->sc_tq); @@ -3548,8 +3545,7 @@ static void ath_key_update_end(struct ieee80211vap *vap) { - struct ifnet *ifp = vap->iv_ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; DPRINTF(sc, ATH_DEBUG_KEYCACHE, "%s:\n", __func__); taskqueue_unblock(sc->sc_tq); @@ -4156,7 +4152,7 @@ static struct ieee80211_node * ath_node_alloc(struct ieee80211vap *vap, const uint8_t mac[IEEE80211_ADDR_LEN]) { struct ieee80211com *ic = vap->iv_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; const size_t space = sizeof(struct ath_node) + sc->sc_rc->arc_space; struct ath_node *an; @@ -4183,7 +4179,7 @@ static void ath_node_cleanup(struct ieee80211_node *ni) { struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; DPRINTF(sc, ATH_DEBUG_NODE, "%s: %6D: an %p\n", __func__, ni->ni_macaddr, ":", ATH_NODE(ni)); @@ -4198,7 +4194,7 @@ static void ath_node_free(struct ieee80211_node *ni) { struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; DPRINTF(sc, ATH_DEBUG_NODE, "%s: %6D: an %p\n", __func__, ni->ni_macaddr, ":", ATH_NODE(ni)); @@ -4210,7 +4206,7 @@ static void ath_node_getsignal(const struct ieee80211_node *ni, int8_t *rssi, int8_t *noise) { struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; *rssi = ic->ic_node_getrssi(ni); @@ -4422,7 +4418,7 @@ ath_txq_update(struct ath_softc *sc, int ac) int ath_wme_update(struct ieee80211com *ic) { - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; return !ath_txq_update(sc, WME_AC_BE) || !ath_txq_update(sc, WME_AC_BK) || @@ -5797,7 +5793,7 @@ static void ath_scan_start(struct ieee80211com *ic) { struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; u_int32_t rfilt; @@ -5821,8 +5817,7 @@ ath_scan_start(struct ieee80211com *ic) static void ath_scan_end(struct ieee80211com *ic) { - struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; u_int32_t rfilt; @@ -5862,8 +5857,7 @@ ath_scan_end(struct ieee80211com *ic) static void ath_update_chw(struct ieee80211com *ic) { - struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; DPRINTF(sc, ATH_DEBUG_STATE, "%s: called\n", __func__); ath_set_channel(ic); @@ -5873,8 +5867,7 @@ ath_update_chw(struct ieee80211com *ic) static void ath_set_channel(struct ieee80211com *ic) { - struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; ATH_LOCK(sc); ath_power_set_power_state(sc, HAL_PM_AWAKE); @@ -5916,7 +5909,7
@@ static int ath_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) { struct ieee80211com *ic = vap->iv_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_vap *avp = ATH_VAP(vap); struct ath_hal *ah = sc->sc_ah; struct ieee80211_node *ni = NULL; @@ -6252,7 +6245,7 @@ static void ath_setup_stationkey(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; - struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; ieee80211_keyix keyix, rxkeyix; /* XXX should take a locked ref to vap->iv_bss */ @@ -6285,7 +6278,7 @@ ath_newassoc(struct ieee80211_node *ni, int isnew) { struct ath_node *an = ATH_NODE(ni); struct ieee80211vap *vap = ni->ni_vap; - struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; const struct ieee80211_txparam *tp = ni->ni_txparms; an->an_mcastrix = ath_tx_findrix(sc, tp->mcastrate); @@ -6337,7 +6330,7 @@ static int ath_setregdomain(struct ieee80211com *ic, struct ieee80211_regdomain *reg, int nchans, struct ieee80211_channel chans[]) { - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; HAL_STATUS status; @@ -6361,7 +6354,7 @@ static void ath_getradiocaps(struct ieee80211com *ic, int maxchans, int *nchans, struct ieee80211_channel chans[]) { - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; DPRINTF(sc, ATH_DEBUG_REGDOMAIN, "%s: use rd %u cc %d\n", @@ -6693,8 +6686,8 @@ ath_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { #define IS_RUNNING(ifp) \ ((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) - struct ath_softc *sc = ifp->if_softc; struct ieee80211com *ic = ifp->if_l2com; + struct ath_softc *sc = ic->ic_softc; struct ifreq *ifr = (struct ifreq *)data; const HAL_RATE_TABLE *rt; int error = 0; @@ -6864,7 +6857,7 @@ ath_node_powersave(struct ieee80211_node *ni, int enable) #ifdef ATH_SW_PSQ struct ath_node *an = ATH_NODE(ni); struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_vap *avp = ATH_VAP(ni->ni_vap); /* XXX and no TXQ locks should be held here */ @@ -6931,7 +6924,7 @@ ath_node_set_tim(struct ieee80211_node *ni, int enable) { #ifdef ATH_SW_PSQ struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_node *an = ATH_NODE(ni); struct ath_vap *avp = ATH_VAP(ni->ni_vap); int changed = 0; @@ -7136,7 +7129,7 @@ ath_node_recv_pspoll(struct ieee80211_node *ni, struct mbuf *m) struct ath_node *an; struct ath_vap *avp; struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; int tid; /* Just paranoia */ diff --git a/sys/dev/ath/if_ath_keycache.c b/sys/dev/ath/if_ath_keycache.c index fe99f10..b8a77e8 100644 --- a/sys/dev/ath/if_ath_keycache.c +++ b/sys/dev/ath/if_ath_keycache.c @@ -425,7 +425,7 @@ int ath_key_alloc(struct ieee80211vap *vap, struct ieee80211_key *k, ieee80211_keyix *keyix, ieee80211_keyix *rxkeyix) { - struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; /* * Group key allocation must be handled specially for @@ -493,7 +493,7 @@ ath_key_alloc(struct ieee80211vap *vap, struct ieee80211_key *k, int ath_key_delete(struct ieee80211vap *vap, const struct ieee80211_key *k) { - 
struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; struct ath_hal *ah = sc->sc_ah; const struct ieee80211_cipher *cip = k->wk_cipher; u_int keyix = k->wk_keyix; @@ -538,7 +538,7 @@ int ath_key_set(struct ieee80211vap *vap, const struct ieee80211_key *k, const u_int8_t mac[IEEE80211_ADDR_LEN]) { - struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; return ath_keyset(sc, vap, k, vap->iv_bss); } diff --git a/sys/dev/ath/if_ath_rx.c b/sys/dev/ath/if_ath_rx.c index 2779b7a..e391dd7 100644 --- a/sys/dev/ath/if_ath_rx.c +++ b/sys/dev/ath/if_ath_rx.c @@ -330,7 +330,7 @@ ath_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m, int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf) { struct ieee80211vap *vap = ni->ni_vap; - struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc; + struct ath_softc *sc = vap->iv_ic->ic_softc; uint64_t tsf_beacon_old, tsf_beacon; uint64_t nexttbtt; int64_t tsf_delta; diff --git a/sys/dev/ath/if_ath_tdma.c b/sys/dev/ath/if_ath_tdma.c index fd23db1..d4c9ccd 100644 --- a/sys/dev/ath/if_ath_tdma.c +++ b/sys/dev/ath/if_ath_tdma.c @@ -359,7 +359,7 @@ ath_tdma_update(struct ieee80211_node *ni, #define TU_TO_TSF(_tu) (((u_int64_t)(_tu)) << 10) struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; - struct ath_softc *sc = ic->ic_ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; const HAL_RATE_TABLE *rt = sc->sc_currates; u_int64_t tsf, rstamp, nextslot, nexttbtt, nexttbtt_full; diff --git a/sys/dev/ath/if_ath_tx.c b/sys/dev/ath/if_ath_tx.c index c15b158..916d4cb 100644 --- a/sys/dev/ath/if_ath_tx.c +++ b/sys/dev/ath/if_ath_tx.c @@ -2341,7 +2341,7 @@ ath_raw_xmit(struct ieee80211_node *ni, struct mbuf *m, { struct ieee80211com *ic = ni->ni_ic; struct ifnet *ifp = ic->ic_ifp; - struct ath_softc *sc = ifp->if_softc; + struct ath_softc *sc = ic->ic_softc; struct ath_buf *bf; struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *); int error = 0; @@ -5731,7 +5731,7 @@ int ath_addba_request(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap, int dialogtoken, int baparamset, int batimeout) { - struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc; + struct ath_softc *sc = ni->ni_ic->ic_softc; int tid = tap->txa_tid; struct ath_node *an = ATH_NODE(ni); struct ath_tid *atid = &an->an_tid[tid]; @@ -5809,7 +5809,7 @@ int ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap, int status, int code, int batimeout) { - struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc; + struct ath_softc *sc = ni->ni_ic->ic_softc; int tid = tap->txa_tid; struct ath_node *an = ATH_NODE(ni); struct ath_tid *atid = &an->an_tid[tid]; @@ -5856,7 +5856,7 @@ ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap, void ath_addba_stop(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap) { - struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc; + struct ath_softc *sc = ni->ni_ic->ic_softc; int tid = tap->txa_tid; struct ath_node *an = ATH_NODE(ni); struct ath_tid *atid = &an->an_tid[tid]; @@ -5991,7 +5991,7 @@ void ath_bar_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap, int status) { - struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc; + struct ath_softc *sc = ni->ni_ic->ic_softc; int tid = tap->txa_tid; struct ath_node *an = ATH_NODE(ni); struct ath_tid *atid = &an->an_tid[tid]; @@ -6064,7 +6064,7 @@ void ath_addba_response_timeout(struct ieee80211_node *ni, struct 
ieee80211_tx_ampdu *tap) { - struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc; + struct ath_softc *sc = ni->ni_ic->ic_softc; int tid = tap->txa_tid; struct ath_node *an = ATH_NODE(ni); struct ath_tid *atid = &an->an_tid[tid]; diff --git a/sys/dev/bxe/ecore_hsi.h b/sys/dev/bxe/ecore_hsi.h index 005bb2e..f78f4ea 100644 --- a/sys/dev/bxe/ecore_hsi.h +++ b/sys/dev/bxe/ecore_hsi.h @@ -2536,9 +2536,9 @@ struct shmem2_region { #define SHMEM_EEE_SUPPORTED_MASK 0x000f0000 #define SHMEM_EEE_SUPPORTED_SHIFT 16 #define SHMEM_EEE_ADV_STATUS_MASK 0x00f00000 - #define SHMEM_EEE_100M_ADV (1<<0) - #define SHMEM_EEE_1G_ADV (1<<1) - #define SHMEM_EEE_10G_ADV (1<<2) + #define SHMEM_EEE_100M_ADV (1U<<0) + #define SHMEM_EEE_1G_ADV (1U<<1) + #define SHMEM_EEE_10G_ADV (1U<<2) #define SHMEM_EEE_ADV_STATUS_SHIFT 20 #define SHMEM_EEE_LP_ADV_STATUS_MASK 0x0f000000 #define SHMEM_EEE_LP_ADV_STATUS_SHIFT 24 diff --git a/sys/dev/e1000/e1000_80003es2lan.c b/sys/dev/e1000/e1000_80003es2lan.c index 076e02b..b948bb4 100644 --- a/sys/dev/e1000/e1000_80003es2lan.c +++ b/sys/dev/e1000/e1000_80003es2lan.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2013, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_80003es2lan.h b/sys/dev/e1000/e1000_80003es2lan.h index 3807e46..89b1551 100644 --- a/sys/dev/e1000/e1000_80003es2lan.h +++ b/sys/dev/e1000/e1000_80003es2lan.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2013, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82540.c b/sys/dev/e1000/e1000_82540.c index 141b92e..68f92c6 100644 --- a/sys/dev/e1000/e1000_82540.c +++ b/sys/dev/e1000/e1000_82540.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2011, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82541.c b/sys/dev/e1000/e1000_82541.c index 781aa93..69fcee4 100644 --- a/sys/dev/e1000/e1000_82541.c +++ b/sys/dev/e1000/e1000_82541.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2011, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82541.h b/sys/dev/e1000/e1000_82541.h index 3b6b961..1eebfad 100644 --- a/sys/dev/e1000/e1000_82541.h +++ b/sys/dev/e1000/e1000_82541.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2008, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82542.c b/sys/dev/e1000/e1000_82542.c index 19d5402..a6b3616 100644 --- a/sys/dev/e1000/e1000_82542.c +++ b/sys/dev/e1000/e1000_82542.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82543.c b/sys/dev/e1000/e1000_82543.c index 1c01658..3350f17 100644 --- a/sys/dev/e1000/e1000_82543.c +++ b/sys/dev/e1000/e1000_82543.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2011, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82543.h b/sys/dev/e1000/e1000_82543.h index 60e5c15..0fa813b 100644 --- a/sys/dev/e1000/e1000_82543.h +++ b/sys/dev/e1000/e1000_82543.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2008, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82571.c b/sys/dev/e1000/e1000_82571.c index e209d43..a64ef56 100644 --- a/sys/dev/e1000/e1000_82571.c +++ b/sys/dev/e1000/e1000_82571.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82571.h b/sys/dev/e1000/e1000_82571.h index c76f16f..cda87a2 100644 --- a/sys/dev/e1000/e1000_82571.h +++ b/sys/dev/e1000/e1000_82571.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2010, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82575.c b/sys/dev/e1000/e1000_82575.c index d79db67..8981ae3 100644 --- a/sys/dev/e1000/e1000_82575.c +++ b/sys/dev/e1000/e1000_82575.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_82575.h b/sys/dev/e1000/e1000_82575.h index 6569b98..503fdce 100644 --- a/sys/dev/e1000/e1000_82575.h +++ b/sys/dev/e1000/e1000_82575.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_api.c b/sys/dev/e1000/e1000_api.c index 374ffa6..5db22db 100644 --- a/sys/dev/e1000/e1000_api.c +++ b/sys/dev/e1000/e1000_api.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_api.h b/sys/dev/e1000/e1000_api.h index a2ffa16..e87acc8 100644 --- a/sys/dev/e1000/e1000_api.h +++ b/sys/dev/e1000/e1000_api.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_defines.h b/sys/dev/e1000/e1000_defines.h index 5deada2..9472ca4 100644 --- a/sys/dev/e1000/e1000_defines.h +++ b/sys/dev/e1000/e1000_defines.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_hw.h b/sys/dev/e1000/e1000_hw.h index faf64a3..3ec921e 100644 --- a/sys/dev/e1000/e1000_hw.h +++ b/sys/dev/e1000/e1000_hw.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_i210.c b/sys/dev/e1000/e1000_i210.c index f12c13f..563f11a 100644 --- a/sys/dev/e1000/e1000_i210.c +++ b/sys/dev/e1000/e1000_i210.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_i210.h b/sys/dev/e1000/e1000_i210.h index 2a20ca1..f940915 100644 --- a/sys/dev/e1000/e1000_i210.h +++ b/sys/dev/e1000/e1000_i210.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_ich8lan.c b/sys/dev/e1000/e1000_ich8lan.c index 204c39c..23e7b95 100644 --- a/sys/dev/e1000/e1000_ich8lan.c +++ b/sys/dev/e1000/e1000_ich8lan.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_ich8lan.h b/sys/dev/e1000/e1000_ich8lan.h index f045ebd..9cb79c0 100644 --- a/sys/dev/e1000/e1000_ich8lan.h +++ b/sys/dev/e1000/e1000_ich8lan.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_mac.c b/sys/dev/e1000/e1000_mac.c index b888b34..1c86307 100644 --- a/sys/dev/e1000/e1000_mac.c +++ b/sys/dev/e1000/e1000_mac.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_mac.h b/sys/dev/e1000/e1000_mac.h index 2c1bfe3..1daed9b 100644 --- a/sys/dev/e1000/e1000_mac.h +++ b/sys/dev/e1000/e1000_mac.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_manage.c b/sys/dev/e1000/e1000_manage.c index 8087e65..f319c8b 100644 --- a/sys/dev/e1000/e1000_manage.c +++ b/sys/dev/e1000/e1000_manage.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_manage.h b/sys/dev/e1000/e1000_manage.h index 51f17671..303e99e 100644 --- a/sys/dev/e1000/e1000_manage.h +++ b/sys/dev/e1000/e1000_manage.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2012, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_mbx.c b/sys/dev/e1000/e1000_mbx.c index 55477b2..d9fb9ac 100644 --- a/sys/dev/e1000/e1000_mbx.c +++ b/sys/dev/e1000/e1000_mbx.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_mbx.h b/sys/dev/e1000/e1000_mbx.h index d2aea5c4..fadd849 100644 --- a/sys/dev/e1000/e1000_mbx.h +++ b/sys/dev/e1000/e1000_mbx.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_nvm.c b/sys/dev/e1000/e1000_nvm.c index f702f71..0a1a18d 100644 --- a/sys/dev/e1000/e1000_nvm.c +++ b/sys/dev/e1000/e1000_nvm.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_nvm.h b/sys/dev/e1000/e1000_nvm.h index 34077b2..31f2180 100644 --- a/sys/dev/e1000/e1000_nvm.h +++ b/sys/dev/e1000/e1000_nvm.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2013, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_osdep.c b/sys/dev/e1000/e1000_osdep.c index 75a7b79..2987cda 100644 --- a/sys/dev/e1000/e1000_osdep.c +++ b/sys/dev/e1000/e1000_osdep.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2010, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_osdep.h b/sys/dev/e1000/e1000_osdep.h index 1324110..fc46f48 100644 --- a/sys/dev/e1000/e1000_osdep.h +++ b/sys/dev/e1000/e1000_osdep.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_phy.c b/sys/dev/e1000/e1000_phy.c index f27889c..adb6732 100644 --- a/sys/dev/e1000/e1000_phy.c +++ b/sys/dev/e1000/e1000_phy.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_phy.h b/sys/dev/e1000/e1000_phy.h index 0e5b2e6..d3d563f 100644 --- a/sys/dev/e1000/e1000_phy.h +++ b/sys/dev/e1000/e1000_phy.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_regs.h b/sys/dev/e1000/e1000_regs.h index 952a7dc..da93d75 100644 --- a/sys/dev/e1000/e1000_regs.h +++ b/sys/dev/e1000/e1000_regs.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_vf.c b/sys/dev/e1000/e1000_vf.c index 2cabac9..4af985b 100644 --- a/sys/dev/e1000/e1000_vf.c +++ b/sys/dev/e1000/e1000_vf.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/e1000_vf.h b/sys/dev/e1000/e1000_vf.h index 2a780741..e6f834e 100644 --- a/sys/dev/e1000/e1000_vf.h +++ b/sys/dev/e1000/e1000_vf.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2014, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index 830325b..e36a3d8 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -364,8 +364,14 @@ MODULE_DEPEND(em, netmap, 1, 1, 1); #define CSUM_TSO 0 #endif +#define TSO_WORKAROUND 4 + static SYSCTL_NODE(_hw, OID_AUTO, em, CTLFLAG_RD, 0, "EM driver parameters"); +static int em_disable_crc_stripping = 0; +SYSCTL_INT(_hw_em, OID_AUTO, disable_crc_stripping, CTLFLAG_RDTUN, + &em_disable_crc_stripping, 0, "Disable CRC Stripping"); + static int em_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV); static int em_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR); SYSCTL_INT(_hw_em, OID_AUTO, tx_int_delay, CTLFLAG_RDTUN, &em_tx_int_delay_dflt, @@ -1872,13 +1878,15 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp) struct ether_header *eh; struct ip *ip = NULL; struct tcphdr *tp = NULL; - u32 txd_upper = 0, txd_lower = 0, txd_used = 0; + u32 txd_upper = 0, txd_lower = 0; int ip_off, poff; int nsegs, i, j, first, last = 0; - int error, do_tso, tso_desc = 0, remap = 1; + int error; + bool do_tso, tso_desc, remap = TRUE; m_head = *m_headp; - do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0); + do_tso = (m_head->m_pkthdr.csum_flags & CSUM_TSO); + tso_desc = FALSE; ip_off = poff = 0; /* @@ -1914,74 +1922,82 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp) * for IPv6 yet. */ ip_off = sizeof(struct ether_header); - m_head = m_pullup(m_head, ip_off); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); + if (m_head->m_len < ip_off) { + m_head = m_pullup(m_head, ip_off); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } } eh = mtod(m_head, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { ip_off = sizeof(struct ether_vlan_header); - m_head = m_pullup(m_head, ip_off); + if (m_head->m_len < ip_off) { + m_head = m_pullup(m_head, ip_off); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } + } + } + if (m_head->m_len < ip_off + sizeof(struct ip)) { + m_head = m_pullup(m_head, ip_off + sizeof(struct ip)); if (m_head == NULL) { *m_headp = NULL; return (ENOBUFS); } } - m_head = m_pullup(m_head, ip_off + sizeof(struct ip)); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); - } ip = (struct ip *)(mtod(m_head, char *) + ip_off); poff = ip_off + (ip->ip_hl << 2); - if (do_tso) { - m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); + + if (do_tso || (m_head->m_pkthdr.csum_flags & CSUM_TCP)) { + if (m_head->m_len < poff + sizeof(struct tcphdr)) { + m_head = m_pullup(m_head, poff + + sizeof(struct tcphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } } tp = (struct tcphdr *)(mtod(m_head, char *) + poff); /* * TSO workaround: * pull 4 more bytes of data into it. */ - m_head = m_pullup(m_head, poff + (tp->th_off << 2) + 4); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); + if (m_head->m_len < poff + (tp->th_off << 2)) { + m_head = m_pullup(m_head, poff + + (tp->th_off << 2) + + TSO_WORKAROUND); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } } ip = (struct ip *)(mtod(m_head, char *) + ip_off); - ip->ip_len = 0; - ip->ip_sum = 0; - /* - * The pseudo TCP checksum does not include TCP payload - * length so driver should recompute the checksum here - * what hardware expect to see. This is adherence of - * Microsoft's Large Send specification. 
- */ tp = (struct tcphdr *)(mtod(m_head, char *) + poff); - tp->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, htons(IPPROTO_TCP)); - } else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { - m_head = m_pullup(m_head, poff + sizeof(struct tcphdr)); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); + if (do_tso) { + ip->ip_len = htons(m_head->m_pkthdr.tso_segsz + + (ip->ip_hl << 2) + + (tp->th_off << 2)); + ip->ip_sum = 0; + /* + * The pseudo TCP checksum does not include TCP + * payload length so driver should recompute + * the checksum here what hardware expect to + * see. This is adherence of Microsoft's Large + * Send specification. + */ + tp->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } - tp = (struct tcphdr *)(mtod(m_head, char *) + poff); - m_head = m_pullup(m_head, poff + (tp->th_off << 2)); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); - } - ip = (struct ip *)(mtod(m_head, char *) + ip_off); - tp = (struct tcphdr *)(mtod(m_head, char *) + poff); } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { - m_head = m_pullup(m_head, poff + sizeof(struct udphdr)); - if (m_head == NULL) { - *m_headp = NULL; - return (ENOBUFS); + if (m_head->m_len < poff + sizeof(struct udphdr)) { + m_head = m_pullup(m_head, poff + + sizeof(struct udphdr)); + if (m_head == NULL) { + *m_headp = NULL; + return (ENOBUFS); + } } ip = (struct ip *)(mtod(m_head, char *) + ip_off); } @@ -2027,7 +2043,7 @@ retry: *m_headp = m; /* Try it again, but only once */ - remap = 0; + remap = FALSE; goto retry; } else if (error != 0) { adapter->no_tx_dma_setup++; @@ -2042,13 +2058,13 @@ retry: * it follows a TSO burst, then we need to add a * sentinel descriptor to prevent premature writeback. */ - if ((do_tso == 0) && (txr->tx_tso == TRUE)) { + if ((!do_tso) && (txr->tx_tso == TRUE)) { if (nsegs == 1) tso_desc = TRUE; txr->tx_tso = FALSE; } - if (nsegs > (txr->tx_avail - 2)) { + if (nsegs > (txr->tx_avail - EM_MAX_SCATTER)) { txr->no_desc_avail++; bus_dmamap_unload(txr->txtag, map); return (ENOBUFS); @@ -2088,23 +2104,23 @@ retry: ** If this is the last descriptor, we want to ** split it so we have a small final sentinel */ - if (tso_desc && (j == (nsegs -1)) && (seg_len > 8)) { - seg_len -= 4; + if (tso_desc && (j == (nsegs - 1)) && (seg_len > 8)) { + seg_len -= TSO_WORKAROUND; ctxd->buffer_addr = htole64(seg_addr); ctxd->lower.data = htole32( - adapter->txd_cmd | txd_lower | seg_len); - ctxd->upper.data = - htole32(txd_upper); + adapter->txd_cmd | txd_lower | seg_len); + ctxd->upper.data = htole32(txd_upper); if (++i == adapter->num_tx_desc) i = 0; + /* Now make the sentinel */ - ++txd_used; /* using an extra txd */ + txr->tx_avail--; ctxd = &txr->tx_base[i]; tx_buffer = &txr->tx_buffers[i]; ctxd->buffer_addr = htole64(seg_addr + seg_len); ctxd->lower.data = htole32( - adapter->txd_cmd | txd_lower | 4); + adapter->txd_cmd | txd_lower | TSO_WORKAROUND); ctxd->upper.data = htole32(txd_upper); last = i; @@ -2114,8 +2130,7 @@ retry: ctxd->buffer_addr = htole64(seg_addr); ctxd->lower.data = htole32( adapter->txd_cmd | txd_lower | seg_len); - ctxd->upper.data = - htole32(txd_upper); + ctxd->upper.data = htole32(txd_upper); last = i; if (++i == adapter->num_tx_desc) i = 0; @@ -2126,8 +2141,6 @@ retry: txr->next_avail_desc = i; txr->tx_avail -= nsegs; - if (tso_desc) /* TSO used an extra for sentinel */ - txr->tx_avail -= txd_used; tx_buffer->m_head = m_head; /* @@ -3030,6 +3043,11 @@ em_setup_interface(device_t dev, struct adapter *adapter) 
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); if_setioctlfn(ifp, em_ioctl); if_setgetcounterfn(ifp, em_get_counter); + /* TSO parameters */ + ifp->if_hw_tsomax = EM_TSO_SIZE; + ifp->if_hw_tsomaxsegcount = EM_MAX_SCATTER; + ifp->if_hw_tsomaxsegsize = EM_TSO_SEG_SIZE; + #ifdef EM_MULTIQUEUE /* Multiqueue stack interface */ if_settransmitfn(ifp, em_mq_start); @@ -4514,7 +4532,8 @@ em_initialize_receive_unit(struct adapter *adapter) (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT); /* Strip the CRC */ - rctl |= E1000_RCTL_SECRC; + if (!em_disable_crc_stripping) + rctl |= E1000_RCTL_SECRC; /* Make sure VLAN Filters are off */ rctl &= ~E1000_RCTL_VFE; @@ -4888,8 +4907,8 @@ em_enable_intr(struct adapter *adapter) u32 ims_mask = IMS_ENABLE_MASK; if (hw->mac.type == e1000_82574) { - E1000_WRITE_REG(hw, EM_EIAC, EM_MSIX_MASK); - ims_mask |= EM_MSIX_MASK; + E1000_WRITE_REG(hw, EM_EIAC, adapter->ims); + ims_mask |= adapter->ims; } E1000_WRITE_REG(hw, E1000_IMS, ims_mask); } diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h index be18a6c..8725de3 100644 --- a/sys/dev/e1000/if_em.h +++ b/sys/dev/e1000/if_em.h @@ -266,7 +266,7 @@ #define HW_DEBUGOUT1(S, A) if (DEBUG_HW) printf(S "\n", A) #define HW_DEBUGOUT2(S, A, B) if (DEBUG_HW) printf(S "\n", A, B) -#define EM_MAX_SCATTER 32 +#define EM_MAX_SCATTER 64 #define EM_VFTA_SIZE 128 #define EM_TSO_SIZE (65535 + sizeof(struct ether_vlan_header)) #define EM_TSO_SEG_SIZE 4096 /* Max dma segment size */ diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 9eacc78..a3ea8d0 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2013, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/if_igb.h b/sys/dev/e1000/if_igb.h index f2d0926..a4222e3 100644 --- a/sys/dev/e1000/if_igb.h +++ b/sys/dev/e1000/if_igb.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2013, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index f34010e..7476be5 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2012, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. 
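The em_xmit() rework above only calls m_pullup() when the leading mbuf is actually short, and seeds th_sum with a pseudo-header checksum that omits the TCP payload length, since the controller adds each segment's length itself (the Microsoft Large Send convention the code comment cites). A rough userspace sketch of that seed computation; csum_pseudo() is a hypothetical stand-in for the kernel's in_pseudo():

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>

    /* 16-bit ones'-complement sum over src/dst addresses and protocol.
     * Deliberately NOT inverted and with no length term: the NIC folds
     * in each TSO segment's TCP length before completing the checksum. */
    static uint16_t
    csum_pseudo(uint32_t src_net, uint32_t dst_net, uint8_t proto)
    {
        uint32_t src = ntohl(src_net), dst = ntohl(dst_net);
        uint32_t sum;

        sum = (src >> 16) + (src & 0xffff) +
            (dst >> 16) + (dst & 0xffff) + proto;
        while (sum > 0xffff)                    /* fold carries */
            sum = (sum >> 16) + (sum & 0xffff);
        return (htons((uint16_t)sum));
    }

    int
    main(void)
    {
        uint32_t src = inet_addr("192.0.2.1");  /* example addresses */
        uint32_t dst = inet_addr("192.0.2.2");

        printf("th_sum seed = 0x%04x\n",
            csum_pseudo(src, dst, IPPROTO_TCP));
        return (0);
    }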
Redistribution and use in source and binary forms, with or without @@ -97,7 +97,7 @@ /********************************************************************* * Legacy Em Driver version: *********************************************************************/ -char lem_driver_version[] = "1.0.6"; +char lem_driver_version[] = "1.1.0"; /********************************************************************* * PCI Device ID Table @@ -2913,10 +2913,6 @@ lem_free_transmit_structures(struct adapter *adapter) bus_dma_tag_destroy(adapter->txtag); adapter->txtag = NULL; } -#if __FreeBSD_version >= 800000 - if (adapter->br != NULL) - buf_ring_free(adapter->br, M_DEVBUF); -#endif } /********************************************************************* diff --git a/sys/dev/e1000/if_lem.h b/sys/dev/e1000/if_lem.h index 41447d1..4c43bdd 100644 --- a/sys/dev/e1000/if_lem.h +++ b/sys/dev/e1000/if_lem.h @@ -1,6 +1,6 @@ /****************************************************************************** - Copyright (c) 2001-2011, Intel Corporation + Copyright (c) 2001-2015, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -296,9 +296,6 @@ struct em_int_delay_info { /* Our adapter structure */ struct adapter { if_t ifp; -#if __FreeBSD_version >= 800000 - struct buf_ring *br; -#endif struct e1000_hw hw; /* FreeBSD operating-system-specific structures. */ diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c index e741d28..6cafdaf 100644 --- a/sys/dev/gpio/gpiobus.c +++ b/sys/dev/gpio/gpiobus.c @@ -155,12 +155,16 @@ gpiobus_attach_bus(device_t dev) int gpiobus_detach_bus(device_t dev) { + int err; #ifdef FDT ofw_gpiobus_unregister_provider(dev); #endif + err = bus_generic_detach(dev); + if (err != 0) + return (err); - return (bus_generic_detach(dev)); + return (device_delete_children(dev)); } int @@ -338,11 +342,14 @@ gpiobus_detach(device_t dev) if ((err = device_get_children(dev, &devlist, &ndevs)) != 0) return (err); for (i = 0; i < ndevs; i++) { - device_delete_child(dev, devlist[i]); devi = GPIOBUS_IVAR(devlist[i]); gpiobus_free_ivars(devi); + resource_list_free(&devi->rl); + free(devi, M_DEVBUF); + device_delete_child(dev, devlist[i]); } free(devlist, M_TEMP); + rman_fini(&sc->sc_intr_rman); if (sc->sc_pins) { for (i = 0; i < sc->sc_npins; i++) { if (sc->sc_pins[i].name != NULL) @@ -442,7 +449,7 @@ gpiobus_add_child(device_t dev, u_int order, const char *name, int unit) devi = malloc(sizeof(struct gpiobus_ivar), M_DEVBUF, M_NOWAIT | M_ZERO); if (devi == NULL) { device_delete_child(dev, child); - return (0); + return (NULL); } resource_list_init(&devi->rl); device_set_ivars(child, devi); @@ -461,8 +468,11 @@ gpiobus_hinted_child(device_t bus, const char *dname, int dunit) child = BUS_ADD_CHILD(bus, 0, dname, dunit); devi = GPIOBUS_IVAR(child); resource_int_value(dname, dunit, "pins", &pins); - if (gpiobus_parse_pins(sc, child, pins)) + if (gpiobus_parse_pins(sc, child, pins)) { + resource_list_free(&devi->rl); + free(devi, M_DEVBUF); device_delete_child(bus, child); + } if (resource_int_value(dname, dunit, "irq", &irq) == 0) { if (bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1) != 0) device_printf(bus, diff --git a/sys/dev/gpio/gpioled.c b/sys/dev/gpio/gpioled.c index 01710c2..e699128 100644 --- a/sys/dev/gpio/gpioled.c +++ b/sys/dev/gpio/gpioled.c @@ -255,3 +255,4 @@ static driver_t gpioled_driver = { }; DRIVER_MODULE(gpioled, gpiobus, gpioled_driver, gpioled_devclass, 0, 0); +MODULE_DEPEND(gpioled, gpiobus, 1, 1, 1); diff --git a/sys/dev/md/md.c 
b/sys/dev/md/md.c index a82d81d..c19f7fe 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -89,6 +89,7 @@ #include <sys/vnode.h> #include <geom/geom.h> +#include <geom/geom_int.h> #include <vm/vm.h> #include <vm/vm_param.h> @@ -121,9 +122,12 @@ SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, #define MD_ROOT_FSTYPE "ufs" #endif -#if defined(MD_ROOT) && defined(MD_ROOT_SIZE) +#if defined(MD_ROOT) /* * Preloaded image gets put here. + */ +#if defined(MD_ROOT_SIZE) +/* * Applications that patch the object with the image can determine * the size looking at the start and end markers (strings), * so we want them contiguous. @@ -135,6 +139,14 @@ static struct { .start = "MFS Filesystem goes here", .end = "MFS Filesystem had better STOP here", }; +const int mfs_root_size = sizeof(mfs_root.start); +#else +extern volatile u_char __weak_symbol mfs_root; +extern volatile u_char __weak_symbol mfs_root_end; +__GLOBL(mfs_root); +__GLOBL(mfs_root_end); +#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root)) +#endif #endif static g_init_t g_md_init; @@ -1552,6 +1564,9 @@ md_preloaded(u_char *image, size_t length, const char *name) if (name != NULL) { printf("%s%d: Preloaded image <%s> %zd bytes at %p\n", MD_NAME, sc->unit, name, length, image); + } else { + printf("%s%d: Embedded image %zd bytes at %p\n", + MD_NAME, sc->unit, length, image); } } @@ -1571,10 +1586,13 @@ g_md_init(struct g_class *mp __unused) sx_init(&md_sx, "MD config lock"); g_topology_unlock(); md_uh = new_unrhdr(0, INT_MAX, NULL); -#ifdef MD_ROOT_SIZE - sx_xlock(&md_sx); - md_preloaded(mfs_root.start, sizeof(mfs_root.start), NULL); - sx_xunlock(&md_sx); +#ifdef MD_ROOT + if (mfs_root_size != 0) { + sx_xlock(&md_sx); + md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size, + NULL); + sx_xunlock(&md_sx); + } #endif /* XXX: are preload_* static or do they need Giant ? 
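The md(4) hunk above makes the preloaded MFS image optional at link time: when MD_ROOT_SIZE is not set, mfs_root and mfs_root_end are weak symbols, and the image size falls out of the pointer difference, which is zero when nothing was linked in. A standalone sketch of the same trick, using GCC/Clang attribute syntax in place of the kernel's __weak_symbol/__GLOBL and invented blob_start/blob_end names:

    #include <stdio.h>
    #include <stdint.h>

    /* Optionally provided by another object file at link time. */
    extern volatile unsigned char blob_start __attribute__((weak));
    extern volatile unsigned char blob_end __attribute__((weak));

    int
    main(void)
    {
        /* Unresolved weak symbols resolve to address 0, so an absent
         * image yields a size of 0. */
        uintptr_t size = (uintptr_t)(&blob_end - &blob_start);

        if (size == 0)
            printf("no image embedded\n");
        else
            printf("embedded image: %ju bytes at %p\n",
                (uintmax_t)size, (void *)&blob_start);
        return (0);
    }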
*/ while ((mod = preload_search_next_name(mod)) != NULL) { @@ -1660,9 +1678,11 @@ g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, "read-only"); sbuf_printf(sb, "%s<type>%s</type>\n", indent, type); - if (mp->type == MD_VNODE && mp->vnode != NULL) - sbuf_printf(sb, "%s<file>%s</file>\n", - indent, mp->file); + if (mp->type == MD_VNODE && mp->vnode != NULL) { + sbuf_printf(sb, "%s<file>", indent); + g_conf_printf_escaped(sb, "%s", mp->file); + sbuf_printf(sb, "</file>\n"); + } } } } diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c index 2aafba4..0b03931 100644 --- a/sys/dev/random/fortuna.c +++ b/sys/dev/random/fortuna.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include <dev/random/fortuna.h> #else /* !_KERNEL */ #include <inttypes.h> +#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -124,9 +125,7 @@ static uint8_t zero_region[RANDOM_ZERO_BLOCKSIZE]; static void random_fortuna_pre_read(void); static void random_fortuna_read(uint8_t *, u_int); -static void random_fortuna_write(uint8_t *, u_int); -static void random_fortuna_reseed(void); -static int random_fortuna_seeded(void); +static bool random_fortuna_seeded(void); static void random_fortuna_process_event(struct harvest_event *); static void random_fortuna_init_alg(void *); static void random_fortuna_deinit_alg(void *); @@ -139,8 +138,6 @@ struct random_algorithm random_alg_context = { .ra_deinit_alg = random_fortuna_deinit_alg, .ra_pre_read = random_fortuna_pre_read, .ra_read = random_fortuna_read, - .ra_write = random_fortuna_write, - .ra_reseed = random_fortuna_reseed, .ra_seeded = random_fortuna_seeded, .ra_event_processor = random_fortuna_process_event, .ra_poolcount = RANDOM_FORTUNA_NPOOLS, @@ -420,43 +417,7 @@ random_fortuna_read(uint8_t *buf, u_int bytecount) RANDOM_RESEED_UNLOCK(); } -/* Internal function to hand external entropy to the PRNG. */ -void -random_fortuna_write(uint8_t *buf, u_int count) -{ - static u_int destination = 0; - struct harvest_event event; - struct randomdev_hash hash; - uint32_t entropy_data[RANDOM_KEYSIZE_WORDS], timestamp; - int i; - - /* Extra timing here is helpful to scrape scheduler timing entropy */ - randomdev_hash_init(&hash); - timestamp = (uint32_t)get_cyclecount(); - randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp)); - randomdev_hash_iterate(&hash, buf, count); - timestamp = (uint32_t)get_cyclecount(); - randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp)); - randomdev_hash_finish(&hash, entropy_data); - explicit_bzero(&hash, sizeof(hash)); - for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) { - event.he_somecounter = (uint32_t)get_cyclecount(); - event.he_size = sizeof(event.he_entropy); - event.he_bits = event.he_size/8; - event.he_source = RANDOM_CACHED; - event.he_destination = destination++; /* Harmless cheating */ - memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy)); - random_fortuna_process_event(&event); - } - explicit_bzero(entropy_data, sizeof(entropy_data)); -} - -void -random_fortuna_reseed(void) -{ -} - -int +bool random_fortuna_seeded(void) { diff --git a/sys/dev/random/other_algorithm.c b/sys/dev/random/other_algorithm.c new file mode 100644 index 0000000..740e879 --- /dev/null +++ b/sys/dev/random/other_algorithm.c @@ -0,0 +1,209 @@ +/*- + * Copyright (c) 2015 Mark R V Murray + * All rights reserved. 
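random_fortuna_write() is not lost: randomdev.c, later in this diff, re-implements the same hash-and-chop loop once as randomdev_accumulate() instead of once per algorithm. The shape of that loop, sketched in userspace with a toy FNV-1a hash standing in for randomdev_hash's real digest and an invented event struct:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct event { uint8_t entropy[8]; uint32_t dest; };  /* toy harvest event */

    static uint64_t
    fnv1a(const void *buf, size_t len, uint64_t h)
    {
        const uint8_t *p = buf;

        while (len--) { h ^= *p++; h *= 0x100000001b3ULL; }
        return (h);
    }

    int
    main(void)
    {
        static uint32_t destination;
        uint8_t digest[32], input[] = "user-supplied entropy";
        uint64_t h = 0xcbf29ce484222325ULL;
        size_t i;

        /* Hash the written buffer (the kernel also folds in two
         * cycle-counter reads around it). */
        h = fnv1a(input, sizeof(input), h);
        for (i = 0; i < sizeof(digest); i++)    /* toy expansion only */
            digest[i] = (uint8_t)(h >> (8 * (i % 8)));

        /* Chop the digest into event-sized chunks, one event each. */
        for (i = 0; i < sizeof(digest); i += sizeof(((struct event *)0)->entropy)) {
            struct event ev = { .dest = destination++ };

            memcpy(ev.entropy, digest + i, sizeof(ev.entropy));
            printf("event %u: first byte %02x\n", ev.dest, ev.entropy[0]);
        }
        return (0);
    }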
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/*- + * This is a skeleton for folks who wish to build a loadable module + * containing an alternative entropy-processing algorithm for random(4). + * + * The functions below should be completed with the appropriate code, + * and the nearby yarrow.c and fortuna.c may be consulted for examples + * of working code. + * + * The author is willing to provide reasonable help to those wishing to + * write such a module for themselves. Please use the markm@ FreeBSD + * email address, and ensure that you are developing this on a suitably + * supported branch (This is currently 11-CURRENT, and will be no + * older than 11-STABLE in the future). + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/random.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <machine/cpu.h> + +#include <crypto/rijndael/rijndael-api-fst.h> +#include <crypto/sha2/sha2.h> + +#include <dev/random/hash.h> +#include <dev/random/randomdev.h> +#include <dev/random/random_harvestq.h> +#include <dev/random/uint128.h> +#include <dev/random/other_algorithm.h> + +static void random_other_pre_read(void); +static void random_other_read(uint8_t *, u_int); +static bool random_other_seeded(void); +static void random_other_process_event(struct harvest_event *); +static void random_other_init_alg(void *); +static void random_other_deinit_alg(void *); + +/* + * RANDOM_OTHER_NPOOLS is used when reading hardware random + * number sources to ensure that each pool gets one read sample + * per loop iteration. Yarrow has 2 such pools (FAST and SLOW), + * and fortuna has 32 (0-31). The RNG used prior to Yarrow and + * ported from Linux had just 1 pool. + */ +#define RANDOM_OTHER_NPOOLS 1 + +struct random_algorithm random_alg_context = { + .ra_ident = "other", + .ra_init_alg = random_other_init_alg, + .ra_deinit_alg = random_other_deinit_alg, + .ra_pre_read = random_other_pre_read, + .ra_read = random_other_read, + .ra_seeded = random_other_seeded, + .ra_event_processor = random_other_process_event, + .ra_poolcount = RANDOM_OTHER_NPOOLS, +}; + +/* Use a mutex to protect your reseed variables? 
*/ +static mtx_t other_mtx; + +/* + * void random_other_init_alg(void *unused __unused) + * + * Do algorithm-specific initialisation here. + */ +void +random_other_init_alg(void *unused __unused) +{ + + RANDOM_RESEED_INIT_LOCK(); + /* + * Do set-up work here! + */ +} + +/* + * void random_other_deinit_alg(void *unused __unused) + * + * Do algorithm-specific deinitialisation here. + */ +static void +random_other_deinit_alg(void *unused __unused) +{ + + /* + * Do tear-down work here! + */ + RANDOM_RESEED_DEINIT_LOCK(); +} + +/* + * void random_other_pre_read(void) + * + * Do any pre-read preparation you need to. This will be called + * before >=1 calls to random_other_read() corresponding to one + * read(2). + * + * This routine will be called periodically while the generator is + * still blocked and a read is being attempted, giving you an + * opportunity to unblock. + */ +static void +random_other_pre_read(void) +{ + + RANDOM_RESEED_LOCK(); + /* + * Do pre-read housekeeping work here! + * You may use this as a chance to unblock the generator. + */ + RANDOM_RESEED_UNLOCK(); +} + +/* + * void random_other_read(uint8_t *buf, u_int count) + * + * Generate <count> bytes of output into <*buf>. + * You may use the fact that <count> will be a multiple of + * RANDOM_BLOCKSIZE for optimization purposes. + * + * This function will always be called with your generator + * unblocked and ready. If you are not ready to generate + * output here, then feel free to KASSERT() or panic(). + */ +static void +random_other_read(uint8_t *buf, u_int count) +{ + + RANDOM_RESEED_LOCK(); + /* + * Do random-number generation work here! + */ + RANDOM_RESEED_UNLOCK(); +} + +/* + * bool random_other_seeded(void) + * + * Return true if your generator is ready to generate + * output, and false otherwise. + */ +static bool +random_other_seeded(void) +{ + bool seeded = false; + + /* + * Find out if your generator is seeded here! + */ + return (seeded); +} + +/* + * void random_other_process_event(struct harvest_event *event) + * + * Process one stochastic event <*event> into your entropy + * processor. + * + * The structure of the event may change, so it is easier to + * just grab the whole thing into your accumulation system. + * You may pick-and-choose bits, but please don't complain + * when/if these change. + */ +static void +random_other_process_event(struct harvest_event *event) +{ + + RANDOM_RESEED_LOCK(); + /* + * Do entropy accumulation work here! + * You may use this as a chance to unblock the generator. + */ + RANDOM_RESEED_UNLOCK(); +} diff --git a/sys/dev/random/other_algorithm.h b/sys/dev/random/other_algorithm.h new file mode 100644 index 0000000..8ca2bb8 --- /dev/null +++ b/sys/dev/random/other_algorithm.h @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2015 Mark R V Murray + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
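The skeleton above boils down to filling in a method table: random(4) talks to its backend only through struct random_algorithm's function pointers, so an alternative algorithm is just another instance of the table. A self-contained miniature of the pattern, with invented names:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Invented miniature of struct random_algorithm: the consumer only
     * ever calls through these pointers, so swapping backends means
     * swapping one table. */
    struct alg {
        const char *ident;
        void (*pre_read)(void);
        void (*read)(unsigned char *, unsigned);
        bool (*seeded)(void);
    };

    static void demo_pre_read(void) { /* housekeeping before a read */ }
    static void demo_read(unsigned char *buf, unsigned n) { memset(buf, 0xA5, n); }
    static bool demo_seeded(void) { return (true); }

    static const struct alg alg_ctx = {
        .ident = "demo",
        .pre_read = demo_pre_read,
        .read = demo_read,
        .seeded = demo_seeded,
    };

    int
    main(void)
    {
        unsigned char buf[8] = { 0 };

        alg_ctx.pre_read();
        if (alg_ctx.seeded())
            alg_ctx.read(buf, sizeof(buf));
        printf("%s: first byte 0x%02x\n", alg_ctx.ident, buf[0]);
        return (0);
    }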
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * This is a skeleton for folks who wish to build a loadable module + * containing an alternative entropy-processing algorithm for random(4). + * + * The functions below should be completed with the appropriate code, + * and the nearby yarrow.c and fortuna.c may be consulted for examples + * of working code. + * + * The author is willing to provide reasonable help to those wishing to + * write such a module for themselves. Please use the markm@ FreeBSD + * email address, and ensure that you are developing this on a suitably + * supported branch (This is currently 11-CURRENT, and will be no + * older than 11-STABLE in the future). + */ + +#ifndef SYS_DEV_RANDOM_OTHER_H_INCLUDED +#define SYS_DEV_RANDOM_OTHER_H_INCLUDED + +#ifdef _KERNEL +typedef struct mtx mtx_t; +#define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&other_mtx, "reseed mutex", NULL, MTX_DEF) +#define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&other_mtx) +#define RANDOM_RESEED_LOCK(x) mtx_lock(&other_mtx) +#define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&other_mtx) +#define RANDOM_RESEED_ASSERT_LOCK_OWNED(x) mtx_assert(&other_mtx, MA_OWNED) +#else +#define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&other_mtx, mtx_plain) +#define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&other_mtx) +#define RANDOM_RESEED_LOCK(x) mtx_lock(&other_mtx) +#define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&other_mtx) +#define RANDOM_RESEED_ASSERT_LOCK_OWNED(x) +#endif + +#endif /* SYS_DEV_RANDOM_OTHER_H_INCLUDED */ diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c index 34a809b..255136c 100644 --- a/sys/dev/random/random_harvestq.c +++ b/sys/dev/random/random_harvestq.c @@ -47,12 +47,21 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/unistd.h> +#if defined(RANDOM_LOADABLE) +#include <sys/lock.h> +#include <sys/sx.h> +#endif + +#include <machine/atomic.h> #include <machine/cpu.h> #include <dev/random/randomdev.h> #include <dev/random/random_harvestq.h> static void random_kthread(void); +static void random_sources_feed(void); + +static u_int read_rate; /* List for the dynamic sysctls */ static struct sysctl_ctx_list random_clist; @@ -66,7 +75,7 @@ static struct sysctl_ctx_list random_clist; #define RANDOM_RING_MAX 1024 #define RANDOM_ACCUM_MAX 8 -/* 1 to let the kernel thread run, 0 to terminate */ +/* 1 to let the kernel thread run, 0 to terminate, -1 to mark completion */ volatile int random_kthread_control; /* @@ -123,13 +132,18 @@ static struct kproc_desc random_proc_kp = { &harvest_context.hc_kthread_proc, }; - /* Pass the given event straight through to Fortuna/Yarrow/Whatever. 
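The read_rate counter declared in the random_harvestq.c hunk above is a demand counter: readers bump it, and the harvest thread drains it with atomic_readandclear_32() to decide how much to pull from hardware sources per pass. The same accumulate-and-drain idiom in C11 atomics, with invented function names:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned read_rate;

    /* Called by readers: record demand. */
    static void
    rate_increment(unsigned chunk)
    {
        atomic_fetch_add(&read_rate, chunk);
    }

    /* Called by the feed loop: consume and reset demand in one step. */
    static unsigned
    rate_drain(void)
    {
        return (atomic_exchange(&read_rate, 0));
    }

    int
    main(void)
    {
        rate_increment(3);
        rate_increment(5);
        printf("drained %u, counter now %u\n", rate_drain(),
            atomic_load(&read_rate));
        return (0);
    }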
*/ static __inline void random_harvestq_fast_process_event(struct harvest_event *event) { - if (random_alg_context.ra_event_processor) - random_alg_context.ra_event_processor(event); +#if defined(RANDOM_LOADABLE) + RANDOM_CONFIG_S_LOCK(); + if (p_random_alg_context) +#endif + p_random_alg_context->ra_event_processor(event); +#if defined(RANDOM_LOADABLE) + RANDOM_CONFIG_S_UNLOCK(); +#endif } static void @@ -163,12 +177,58 @@ random_kthread(void) /* XXX: FIX!! This is a *great* place to pass hardware/live entropy to random(9) */ tsleep_sbt(&harvest_context.hc_kthread_proc, 0, "-", SBT_1S/10, 0, C_PREL(1)); } + random_kthread_control = -1; wakeup(&harvest_context.hc_kthread_proc); kproc_exit(0); /* NOTREACHED */ } +/* This happens well after SI_SUB_RANDOM */ SYSINIT(random_device_h_proc, SI_SUB_CREATE_INIT, SI_ORDER_ANY, kproc_start, &random_proc_kp); +/* + * Run through all fast sources reading entropy for the given + * number of rounds, which should be a multiple of the number + * of entropy accumulation pools in use; 2 for Yarrow and 32 + * for Fortuna. + */ +static void +random_sources_feed(void) +{ + uint32_t entropy[HARVESTSIZE]; + struct random_sources *rrs; + u_int i, n, local_read_rate; + + /* + * Step over all of live entropy sources, and feed their output + * to the system-wide RNG. + */ +#if defined(RANDOM_LOADABLE) + RANDOM_CONFIG_S_LOCK(); + if (p_random_alg_context) { + /* It's an indenting error. Yeah, Yeah. */ +#endif + local_read_rate = atomic_readandclear_32(&read_rate); + LIST_FOREACH(rrs, &source_list, rrs_entries) { + for (i = 0; i < p_random_alg_context->ra_poolcount*(local_read_rate + 1); i++) { + n = rrs->rrs_source->rs_read(entropy, sizeof(entropy)); + KASSERT((n > 0 && n <= sizeof(entropy)), ("very bad return from rs_read (= %d) in %s", n, __func__)); + random_harvest_direct(entropy, n, (n*8)/2, rrs->rrs_source->rs_source); + } + } + explicit_bzero(entropy, sizeof(entropy)); +#if defined(RANDOM_LOADABLE) + } + RANDOM_CONFIG_S_UNLOCK(); +#endif +} + +void +read_rate_increment(u_int chunk) +{ + + atomic_add_32(&read_rate, chunk); +} + /* ARGSUSED */ RANDOM_CHECK_UINT(harvestmask, 0, RANDOM_HARVEST_EVERYTHING_MASK); @@ -317,7 +377,8 @@ random_harvestq_deinit(void *unused __unused) /* Command the hash/reseed thread to end and wait for it to finish */ random_kthread_control = 0; - tsleep(&harvest_context.hc_kthread_proc, 0, "harvqterm", 0); + while (random_kthread_control >= 0) + tsleep(&harvest_context.hc_kthread_proc, 0, "harvqterm", hz/5); sysctl_ctx_free(&random_clist); } SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_SECOND, random_harvestq_deinit, NULL); @@ -412,3 +473,5 @@ random_harvest_direct(const void *entropy, u_int size, u_int bits, enum random_e random_harvestq_fast_process_event(&event); explicit_bzero(&event, sizeof(event)); } + +MODULE_VERSION(random_harvestq, 1); diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h index f1de86f..421b592 100644 --- a/sys/dev/random/random_harvestq.h +++ b/sys/dev/random/random_harvestq.h @@ -43,6 +43,8 @@ struct harvest_event { uint8_t he_source; /* origin of the entropy */ } __packed; +void read_rate_increment(u_int); + #define RANDOM_HARVESTQ_BOOT_ENTROPY_FILE "/boot/entropy" #define RANDOM_HARVEST_INIT_LOCK(x) mtx_init(&harvest_context.hc_mtx, "entropy harvest mutex", NULL, MTX_SPIN) diff --git a/sys/dev/random/random_infra.c b/sys/dev/random/random_infra.c new file mode 100644 index 0000000..d31b84b --- /dev/null +++ b/sys/dev/random/random_infra.c @@ -0,0 +1,128 @@ +/*- + * 
Copyright (c) 2015 Mark R V Murray + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/random.h> +#include <sys/sysctl.h> + +#if defined(RANDOM_LOADABLE) +#include <sys/lock.h> +#include <sys/sx.h> +#endif + +#include <dev/random/randomdev.h> + +/* Set up the sysctl root node for the entropy device */ +SYSCTL_NODE(_kern, OID_AUTO, random, CTLFLAG_RW, 0, "Cryptographically Secure Random Number Generator"); + +MALLOC_DEFINE(M_ENTROPY, "entropy", "Entropy harvesting buffers and data structures"); + +struct sources_head source_list = LIST_HEAD_INITIALIZER(source_list); + +#if defined(RANDOM_LOADABLE) +struct random_algorithm *p_random_alg_context = NULL; +#else /* !defined(RANDOM_LOADABLE) */ +struct random_algorithm *p_random_alg_context = &random_alg_context; +#endif /* defined(RANDOM_LOADABLE) */ + +#if defined(RANDOM_LOADABLE) + +struct random_readers { + int (*read_random_uio)(struct uio *, bool); + u_int (*read_random)(void *, u_int); +} random_reader_context = { + (int (*)(struct uio *, bool))nullop, + (u_int (*)(void *, u_int))nullop, +}; + +struct sx randomdev_config_lock; + +static void +random_infra_sysinit(void *dummy __unused) +{ + + RANDOM_CONFIG_INIT_LOCK(); +} +SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_FIRST, random_infra_sysinit, NULL); + +void +random_infra_init(int (*p_random_read_uio)(struct uio *, bool), u_int (*p_random_read)(void *, u_int)) +{ + + RANDOM_CONFIG_X_LOCK(); + random_reader_context.read_random_uio = p_random_read_uio; + random_reader_context.read_random = p_random_read; + RANDOM_CONFIG_X_UNLOCK(); +} + +void +random_infra_uninit(void) +{ + + RANDOM_CONFIG_X_LOCK(); + random_reader_context.read_random_uio = (int (*)(struct uio *, bool))nullop; + random_reader_context.read_random = (u_int (*)(void *, u_int))nullop; + RANDOM_CONFIG_X_UNLOCK(); +} + +static void +random_infra_sysuninit(void *dummy __unused) +{ + + RANDOM_CONFIG_DEINIT_LOCK(); +} +SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_FIRST, random_infra_sysuninit, NULL); + +int +read_random_uio(struct uio *uio, bool nonblock) +{ + int retval; + + 
RANDOM_CONFIG_S_LOCK(); + retval = random_reader_context.read_random_uio(uio, nonblock); + RANDOM_CONFIG_S_UNLOCK(); + return (retval); +} + +u_int +read_random(void *buf, u_int len) +{ + u_int retval; + + RANDOM_CONFIG_S_LOCK(); + retval = random_reader_context.read_random(buf, len); + RANDOM_CONFIG_S_UNLOCK(); + return (retval); +} + +#endif /* defined(RANDOM_LOADABLE) */ diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c index 5c20c5d..f20a462 100644 --- a/sys/dev/random/randomdev.c +++ b/sys/dev/random/randomdev.c @@ -56,14 +56,18 @@ __FBSDID("$FreeBSD$"); #include <dev/random/randomdev.h> #include <dev/random/random_harvestq.h> -#include "opt_random.h" +#define RANDOM_UNIT 0 -#if defined(RANDOM_DUMMY) && defined(RANDOM_YARROW) -#error "Cannot define both RANDOM_DUMMY and RANDOM_YARROW" +#if defined(RANDOM_LOADABLE) +#define READ_RANDOM_UIO _read_random_uio +#define READ_RANDOM _read_random +static int READ_RANDOM_UIO(struct uio *, bool); +static u_int READ_RANDOM(void *, u_int); +#else +#define READ_RANDOM_UIO read_random_uio +#define READ_RANDOM read_random #endif -#define RANDOM_UNIT 0 - /* Return the largest number >= x that is a multiple of m */ #define CEIL_TO_MULTIPLE(x, m) ((((x) + (m) - 1)/(m))*(m)) @@ -84,68 +88,31 @@ static struct cdevsw random_cdevsw = { /* For use with make_dev(9)/destroy_dev(9). */ static struct cdev *random_dev; -/* Set up the sysctl root node for the entropy device */ -SYSCTL_NODE(_kern, OID_AUTO, random, CTLFLAG_RW, 0, "Cryptographically Secure Random Number Generator"); - -MALLOC_DEFINE(M_ENTROPY, "entropy", "Entropy harvesting buffers and data structures"); - -#if defined(RANDOM_DUMMY) - -/*- - * Dummy "always block" pseudo algorithm, used when there is no real - * random(4) driver to provide a CSPRNG. 
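Under RANDOM_LOADABLE, read_random() and read_random_uio() above are thin wrappers that call through random_reader_context while holding the config lock shared; the algorithm module installs real readers at load time and the nullop stubs come back at unload. A pthread sketch of that swap-under-rwlock pattern, all names hypothetical:

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_rwlock_t cfg_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Default stub, like the kernel's nullop cast: reads produce nothing. */
    static unsigned
    read_none(void *buf, unsigned len)
    {
        (void)buf; (void)len;
        return (0);
    }

    static unsigned (*reader)(void *, unsigned) = read_none;

    /* Module load/unload path: exclusive lock while swapping the pointer. */
    static void
    install_reader(unsigned (*fn)(void *, unsigned))
    {
        pthread_rwlock_wrlock(&cfg_lock);
        reader = (fn != NULL) ? fn : read_none;
        pthread_rwlock_unlock(&cfg_lock);
    }

    /* Consumer path: shared lock so the backend cannot vanish mid-call. */
    static unsigned
    read_bytes(void *buf, unsigned len)
    {
        unsigned n;

        pthread_rwlock_rdlock(&cfg_lock);
        n = reader(buf, len);
        pthread_rwlock_unlock(&cfg_lock);
        return (n);
    }

    static unsigned
    fake_rng(void *buf, unsigned len)
    {
        memset(buf, 4, len);    /* chosen by fair dice roll */
        return (len);
    }

    int
    main(void)
    {
        char b[4];

        printf("before install: %u bytes\n", read_bytes(b, sizeof(b)));
        install_reader(fake_rng);
        printf("after install: %u bytes\n", read_bytes(b, sizeof(b)));
        return (0);
    }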
- */ - -static u_int -dummy_random_zero(void) -{ - - return (0); -} - -static void -dummy_random(void) -{ -} - -struct random_algorithm random_alg_context = { - .ra_ident = "Dummy", - .ra_init_alg = NULL, - .ra_deinit_alg = NULL, - .ra_pre_read = dummy_random, - .ra_read = (random_alg_read_t *)dummy_random_zero, - .ra_write = (random_alg_write_t *)dummy_random_zero, - .ra_reseed = dummy_random, - .ra_seeded = (random_alg_seeded_t *)dummy_random_zero, - .ra_event_processor = NULL, - .ra_poolcount = 0, -}; - -#else /* !defined(RANDOM_DUMMY) */ - -LIST_HEAD(sources_head, random_sources); -static struct sources_head source_list = LIST_HEAD_INITIALIZER(source_list); -static u_int read_rate; - static void random_alg_context_ra_init_alg(void *data) { - random_alg_context.ra_init_alg(data); + p_random_alg_context = &random_alg_context; + p_random_alg_context->ra_init_alg(data); +#if defined(RANDOM_LOADABLE) + random_infra_init(READ_RANDOM_UIO, READ_RANDOM); +#endif } static void random_alg_context_ra_deinit_alg(void *data) { - random_alg_context.ra_deinit_alg(data); +#if defined(RANDOM_LOADABLE) + random_infra_uninit(); +#endif + p_random_alg_context->ra_deinit_alg(data); + p_random_alg_context = NULL; } SYSINIT(random_device, SI_SUB_RANDOM, SI_ORDER_THIRD, random_alg_context_ra_init_alg, NULL); SYSUNINIT(random_device, SI_SUB_RANDOM, SI_ORDER_THIRD, random_alg_context_ra_deinit_alg, NULL); -#endif /* defined(RANDOM_DUMMY) */ - static struct selinfo rsel; /* @@ -156,28 +123,28 @@ static int randomdev_read(struct cdev *dev __unused, struct uio *uio, int flags) { - return (read_random_uio(uio, (flags & O_NONBLOCK) != 0)); + return (READ_RANDOM_UIO(uio, (flags & O_NONBLOCK) != 0)); } int -read_random_uio(struct uio *uio, bool nonblock) +READ_RANDOM_UIO(struct uio *uio, bool nonblock) { uint8_t *random_buf; int error, spamcount; ssize_t read_len, total_read, c; random_buf = malloc(PAGE_SIZE, M_ENTROPY, M_WAITOK); - random_alg_context.ra_pre_read(); + p_random_alg_context->ra_pre_read(); error = 0; spamcount = 0; /* (Un)Blocking logic */ - while (!random_alg_context.ra_seeded()) { + while (!p_random_alg_context->ra_seeded()) { if (nonblock) { error = EWOULDBLOCK; break; } /* keep tapping away at the pre-read until we seed/unblock. */ - random_alg_context.ra_pre_read(); + p_random_alg_context->ra_pre_read(); /* Only bother the console every 10 seconds or so */ if (spamcount == 0) printf("random: %s unblock wait\n", __func__); @@ -187,10 +154,7 @@ read_random_uio(struct uio *uio, bool nonblock) break; } if (error == 0) { -#if !defined(RANDOM_DUMMY) - /* XXX: FIX!! Next line as an atomic operation? */ - read_rate += (uio->uio_resid + sizeof(uint32_t))/sizeof(uint32_t); -#endif + read_rate_increment((uio->uio_resid + sizeof(uint32_t))/sizeof(uint32_t)); total_read = 0; while (uio->uio_resid && !error) { read_len = uio->uio_resid; @@ -203,7 +167,7 @@ read_random_uio(struct uio *uio, bool nonblock) read_len = CEIL_TO_MULTIPLE(read_len, RANDOM_BLOCKSIZE); /* Work in chunks page-sized or less */ read_len = MIN(read_len, PAGE_SIZE); - random_alg_context.ra_read(random_buf, read_len); + p_random_alg_context->ra_read(random_buf, read_len); c = MIN(uio->uio_resid, read_len); error = uiomove(random_buf, c, uio); total_read += c; @@ -224,19 +188,16 @@ read_random_uio(struct uio *uio, bool nonblock) * RANDOM_BLOCKSIZE bytes. 
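Worth pinning down from the read path above: CEIL_TO_MULTIPLE() rounds a request up to the generator's block size (despite its comment saying "largest", it returns the smallest multiple of m that is >= x), and the copy loop then proceeds in chunks of at most PAGE_SIZE. A quick standalone check of the rounding macro:

    #include <assert.h>
    #include <stdio.h>

    /* Smallest multiple of m that is >= x (m > 0). */
    #define CEIL_TO_MULTIPLE(x, m) ((((x) + (m) - 1)/(m))*(m))

    int
    main(void)
    {
        assert(CEIL_TO_MULTIPLE(1, 16) == 16);
        assert(CEIL_TO_MULTIPLE(16, 16) == 16);
        assert(CEIL_TO_MULTIPLE(17, 16) == 32);
        printf("CEIL_TO_MULTIPLE(100, 16) = %d\n",
            CEIL_TO_MULTIPLE(100, 16));
        return (0);
    }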
*/ u_int -read_random(void *random_buf, u_int len) +READ_RANDOM(void *random_buf, u_int len) { u_int read_len; uint8_t local_buf[len + RANDOM_BLOCKSIZE]; KASSERT(random_buf != NULL, ("No suitable random buffer in %s", __func__)); - random_alg_context.ra_pre_read(); + p_random_alg_context->ra_pre_read(); /* (Un)Blocking logic; if not seeded, return nothing. */ - if (random_alg_context.ra_seeded()) { -#if !defined(RANDOM_DUMMY) - /* XXX: FIX!! Next line as an atomic operation? */ - read_rate += (len + sizeof(uint32_t))/sizeof(uint32_t); -#endif + if (p_random_alg_context->ra_seeded()) { + read_rate_increment((len + sizeof(uint32_t))/sizeof(uint32_t)); if (len > 0) { /* * Belt-and-braces. @@ -244,7 +205,7 @@ read_random(void *random_buf, u_int len) * which is what the underlying generator is expecting. */ read_len = CEIL_TO_MULTIPLE(len, RANDOM_BLOCKSIZE); - random_alg_context.ra_read(local_buf, read_len); + p_random_alg_context->ra_read(local_buf, read_len); memcpy(random_buf, local_buf, len); } } else @@ -252,6 +213,37 @@ read_random(void *random_buf, u_int len) return (len); } +static __inline void +randomdev_accumulate(uint8_t *buf, u_int count) +{ + static u_int destination = 0; + static struct harvest_event event; + static struct randomdev_hash hash; + static uint32_t entropy_data[RANDOM_KEYSIZE_WORDS]; + uint32_t timestamp; + int i; + + /* Extra timing here is helpful to scrape scheduler jitter entropy */ + randomdev_hash_init(&hash); + timestamp = (uint32_t)get_cyclecount(); + randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp)); + randomdev_hash_iterate(&hash, buf, count); + timestamp = (uint32_t)get_cyclecount(); + randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp)); + randomdev_hash_finish(&hash, entropy_data); + explicit_bzero(&hash, sizeof(hash)); + for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) { + event.he_somecounter = (uint32_t)get_cyclecount(); + event.he_size = sizeof(event.he_entropy); + event.he_bits = event.he_size/8; + event.he_source = RANDOM_CACHED; + event.he_destination = destination++; /* Harmless cheating */ + memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy)); + p_random_alg_context->ra_event_processor(&event); + } + explicit_bzero(entropy_data, sizeof(entropy_data)); +} + /* ARGSUSED */ static int randomdev_write(struct cdev *dev __unused, struct uio *uio, int flags __unused) @@ -267,7 +259,7 @@ randomdev_write(struct cdev *dev __unused, struct uio *uio, int flags __unused) error = uiomove(random_buf, c, uio); if (error) break; - random_alg_context.ra_write(random_buf, c); + randomdev_accumulate(random_buf, c); tsleep(&random_alg_context, 0, "randwr", hz/10); } if (nbytes != uio->uio_resid && (error == ERESTART || error == EINTR)) @@ -283,7 +275,7 @@ randomdev_poll(struct cdev *dev __unused, int events, struct thread *td __unused { if (events & (POLLIN | POLLRDNORM)) { - if (random_alg_context.ra_seeded()) + if (p_random_alg_context->ra_seeded()) events &= (POLLIN | POLLRDNORM); else selrecord(td, &rsel); @@ -325,9 +317,6 @@ randomdev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr __unused, void random_source_register(struct random_source *rsource) { -#if defined(RANDOM_DUMMY) - (void)rsource; -#else /* !defined(RANDOM_DUMMY) */ struct random_sources *rrs; KASSERT(rsource != NULL, ("invalid input to %s", __func__)); @@ -337,15 +326,11 @@ random_source_register(struct random_source *rsource) printf("random: registering fast source %s\n", rsource->rs_ident); 
LIST_INSERT_HEAD(&source_list, rrs, rrs_entries); -#endif /* defined(RANDOM_DUMMY) */ } void random_source_deregister(struct random_source *rsource) { -#if defined(RANDOM_DUMMY) - (void)rsource; -#else /* !defined(RANDOM_DUMMY) */ struct random_sources *rrs = NULL; KASSERT(rsource != NULL, ("invalid input to %s", __func__)); @@ -356,41 +341,6 @@ random_source_deregister(struct random_source *rsource) } if (rrs != NULL) free(rrs, M_ENTROPY); -#endif /* defined(RANDOM_DUMMY) */ -} - -#if !defined(RANDOM_DUMMY) -/* - * Run through all fast sources reading entropy for the given - * number of rounds, which should be a multiple of the number - * of entropy accumulation pools in use; 2 for Yarrow and 32 - * for Fortuna. - * - * BEWARE!!! - * This function runs inside the RNG thread! Don't do anything silly! - */ -void -random_sources_feed(void) -{ - uint32_t entropy[HARVESTSIZE]; - struct random_sources *rrs; - u_int i, n, local_read_rate; - - /* - * Step over all of live entropy sources, and feed their output - * to the system-wide RNG. - */ - /* XXX: FIX!! Next lines as an atomic operation? */ - local_read_rate = read_rate; - read_rate = RANDOM_ALG_READ_RATE_MINIMUM; - LIST_FOREACH(rrs, &source_list, rrs_entries) { - for (i = 0; i < random_alg_context.ra_poolcount*local_read_rate; i++) { - n = rrs->rrs_source->rs_read(entropy, sizeof(entropy)); - KASSERT((n > 0 && n <= sizeof(entropy)), ("very bad return from rs_read (= %d) in %s", n, __func__)); - random_harvest_direct(entropy, n, (n*8)/2, rrs->rrs_source->rs_source); - } - } - explicit_bzero(entropy, sizeof(entropy)); } static int @@ -414,7 +364,6 @@ random_source_handler(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_kern_random, OID_AUTO, random_sources, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, random_source_handler, "A", "List of active fast entropy sources."); -#endif /* !defined(RANDOM_DUMMY) */ /* ARGSUSED */ static int @@ -449,3 +398,5 @@ static moduledata_t randomdev_mod = { DECLARE_MODULE(random_device, randomdev_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); MODULE_VERSION(random_device, 1); +MODULE_DEPEND(random_device, crypto, 1, 1, 1); +MODULE_DEPEND(random_device, random_harvestq, 1, 1, 1); diff --git a/sys/dev/random/randomdev.h b/sys/dev/random/randomdev.h index 799efb1..0f3b359 100644 --- a/sys/dev/random/randomdev.h +++ b/sys/dev/random/randomdev.h @@ -55,16 +55,15 @@ random_check_uint_##name(SYSCTL_HANDLER_ARGS) \ MALLOC_DECLARE(M_ENTROPY); -#define RANDOM_ALG_READ_RATE_MINIMUM 32 - #endif /* _KERNEL */ struct harvest_event; +typedef void random_alg_init_t(void *); +typedef void random_alg_deinit_t(void *); typedef void random_alg_pre_read_t(void); typedef void random_alg_read_t(uint8_t *, u_int); -typedef void random_alg_write_t(uint8_t *, u_int); -typedef int random_alg_seeded_t(void); +typedef bool random_alg_seeded_t(void); typedef void random_alg_reseed_t(void); typedef void random_alg_eventprocessor_t(struct harvest_event *); @@ -81,13 +80,11 @@ struct random_algorithm { void (*ra_deinit_alg)(void *); random_alg_pre_read_t *ra_pre_read; random_alg_read_t *ra_read; - random_alg_write_t *ra_write; - random_alg_reseed_t *ra_reseed; random_alg_seeded_t *ra_seeded; random_alg_eventprocessor_t *ra_event_processor; }; -extern struct random_algorithm random_alg_context; +extern struct random_algorithm random_alg_context, *p_random_alg_context; #ifdef _KERNEL @@ -97,22 +94,33 @@ extern struct random_algorithm random_alg_context; * upon request. 
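random_source_register()/deregister() above are now unconditional and keep fast sources on a plain queue(3) list that random_sources_feed() walks. A small standalone example of the same LIST_* usage (the source struct and idents here are stand-ins):

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct src {
        LIST_ENTRY(src) entries;
        const char *ident;
    };

    static LIST_HEAD(src_head, src) source_list =
        LIST_HEAD_INITIALIZER(source_list);

    int
    main(void)
    {
        const char *names[] = { "rdrand", "nehemiah" };  /* example idents */
        struct src *s;
        size_t i;

        /* Register: allocate and link, newest first. */
        for (i = 0; i < sizeof(names)/sizeof(names[0]); i++) {
            if ((s = calloc(1, sizeof(*s))) == NULL)
                abort();
            s->ident = names[i];
            LIST_INSERT_HEAD(&source_list, s, entries);
        }
        LIST_FOREACH(s, &source_list, entries)
            printf("registered fast source %s\n", s->ident);
        /* Deregister: unlink before freeing. */
        while (!LIST_EMPTY(&source_list)) {
            s = LIST_FIRST(&source_list);
            LIST_REMOVE(s, entries);
            free(s);
        }
        return (0);
    }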
*/ struct random_source { - const char *rs_ident; - enum random_entropy_source rs_source; - random_source_read_t *rs_read; + const char *rs_ident; + enum random_entropy_source rs_source; + random_source_read_t *rs_read; }; -#if !defined(RANDOM_DUMMY) struct random_sources { - LIST_ENTRY(random_sources) rrs_entries; - struct random_source *rrs_source; + LIST_ENTRY(random_sources) rrs_entries; + struct random_source *rrs_source; }; -#endif /* !defined(RANDOM_DUMMY) */ + +LIST_HEAD(sources_head, random_sources); +extern struct sources_head source_list; void random_source_register(struct random_source *); void random_source_deregister(struct random_source *); -void random_sources_feed(void); +#if defined(RANDOM_LOADABLE) +extern struct sx randomdev_config_lock; +#define RANDOM_CONFIG_INIT_LOCK(x) sx_init(&randomdev_config_lock, "configuration change lock") +#define RANDOM_CONFIG_X_LOCK(x) sx_xlock(&randomdev_config_lock) +#define RANDOM_CONFIG_X_UNLOCK(x) sx_xunlock(&randomdev_config_lock) +#define RANDOM_CONFIG_S_LOCK(x) sx_slock(&randomdev_config_lock) +#define RANDOM_CONFIG_S_UNLOCK(x) sx_sunlock(&randomdev_config_lock) +#define RANDOM_CONFIG_DEINIT_LOCK(x) sx_destroy(&randomdev_config_lock) +void random_infra_init(int (*)(struct uio *, bool), u_int (*)(void *, u_int)); +void random_infra_uninit(void); +#endif #endif /* _KERNEL */ diff --git a/sys/dev/random/randomdev_none.c b/sys/dev/random/randomdev_none.c deleted file mode 100644 index ee5cbf2..0000000 --- a/sys/dev/random/randomdev_none.c +++ /dev/null @@ -1,72 +0,0 @@ -/*- - * Copyright (c) 2015 Mark R V Murray - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer - * in this position and unchanged. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/malloc.h> -#include <sys/random.h> -#include <sys/systm.h> - -#include <dev/random/randomdev.h> - -#include "opt_random.h" - -#if defined(RANDOM_DUMMY) || defined(RANDOM_YARROW) -#error "Cannot define any of RANDOM_DUMMY and RANDOM_YARROW without 'device random'" -#endif - -/*- - * Dummy "not even here" device. Stub out all routines that the kernel would need. 
- */ - -/* ARGSUSED */ -u_int -read_random(void *random_buf __unused, u_int len __unused) -{ - - return (0); -} - -/* ARGSUSED */ -void -random_harvest_direct(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused) -{ -} - -/* ARGSUSED */ -void -random_harvest_queue(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused) -{ -} - -/* ARGSUSED */ -void -random_harvest_fast(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused) -{ -} diff --git a/sys/dev/random/unit_test.c b/sys/dev/random/unit_test.c index 7ae5716..fac4c8d 100644 --- a/sys/dev/random/unit_test.c +++ b/sys/dev/random/unit_test.c @@ -46,6 +46,7 @@ Where <alg> is YARROW or FORTUNA. #include <sys/types.h> #include <inttypes.h> +#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <threads.h> @@ -172,35 +173,6 @@ RunHarvester(void *arg __unused) } static int -WriteCSPRNG(void *threadid) -{ - uint8_t *buf; - int i; - - printf("Thread #1 starts\n"); - - for (i = 0; ; i++) { - if (stopseeding) - break; - buf = malloc(4096); - if (i % 1000 == 0) - printf("Thread write 1 - %d\n", i); - if (buf != NULL) { - printf("Thread 1 writing.\n"); - random_alg_context.ra_write(buf, i); - free(buf); - } - usleep(1000000); - } - - printf("Thread #1 ends\n"); - - thrd_exit(0); - - return (0); -} - -static int ReadCSPRNG(void *threadid) { size_t tid, zsize; @@ -271,7 +243,7 @@ main(int argc, char *argv[]) for (t = 0; t < NUM_THREADS; t++) { printf("In main: creating thread %ld\n", t); - rc = thrd_create(&threads[t], (t == 0 ? RunHarvester : (t == 1 ? WriteCSPRNG : ReadCSPRNG)), NULL); + rc = thrd_create(&threads[t], (t == 0 ? 
RunHarvester : ReadCSPRNG), NULL); if (rc != thrd_success) { printf("ERROR; return code from thrd_create() is %d\n", rc); exit(-1); diff --git a/sys/dev/random/yarrow.c b/sys/dev/random/yarrow.c index d6ebd46..2ef15a4 100644 --- a/sys/dev/random/yarrow.c +++ b/sys/dev/random/yarrow.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include <dev/random/yarrow.h> #else /* !_KERNEL */ #include <inttypes.h> +#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <stdint.h> @@ -92,7 +93,7 @@ static struct yarrow_state { u_int ysp_thresh; /* pool reseed threshhold */ struct randomdev_hash ysp_hash; /* accumulated entropy */ } ys_pool[RANDOM_YARROW_NPOOLS];/* pool[0] is fast, pool[1] is slow */ - int ys_seeded; + bool ys_seeded; /* Reseed lock */ mtx_t ys_mtx; } yarrow_state; @@ -108,9 +109,7 @@ RANDOM_CHECK_UINT(slowoverthresh, 1, 5); static void random_yarrow_pre_read(void); static void random_yarrow_read(uint8_t *, u_int); -static void random_yarrow_write(uint8_t *, u_int); -static void random_yarrow_reseed(void); -static int random_yarrow_seeded(void); +static bool random_yarrow_seeded(void); static void random_yarrow_process_event(struct harvest_event *); static void random_yarrow_init_alg(void *); static void random_yarrow_deinit_alg(void *); @@ -123,8 +122,6 @@ struct random_algorithm random_alg_context = { .ra_deinit_alg = random_yarrow_deinit_alg, .ra_pre_read = random_yarrow_pre_read, .ra_read = random_yarrow_read, - .ra_write = random_yarrow_write, - .ra_reseed = random_yarrow_reseed, .ra_seeded = random_yarrow_seeded, .ra_event_processor = random_yarrow_process_event, .ra_poolcount = RANDOM_YARROW_NPOOLS, @@ -141,7 +138,7 @@ random_yarrow_init_alg(void *unused __unused) RANDOM_RESEED_INIT_LOCK(); /* Start unseeded, therefore blocked. */ - yarrow_state.ys_seeded = 0; + yarrow_state.ys_seeded = false; #ifdef _KERNEL /* * Yarrow parameters. Do not adjust these unless you have @@ -266,12 +263,14 @@ random_yarrow_reseed_internal(u_int fastslow) RANDOM_RESEED_ASSERT_LOCK_OWNED(); #ifdef RANDOM_DEBUG /* WARNING! This is dangerously tedious to do with mutexes held! */ - printf("random: %s %s seeded = %d\n", __func__, (fastslow == RANDOM_YARROW_FAST ? "RANDOM_YARROW_FAST" : "RANDOM_YARROW_SLOW"), yarrow_state.ys_seeded); - printf("random: %s - fast - thresh %d,1 - ", __func__, yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_thresh); + printf("random: %s ", __func__); + printf("type/pool = %s ", fastslow == RANDOM_YARROW_FAST ? "RANDOM_YARROW_FAST" : "RANDOM_YARROW_SLOW"); + printf("seeded = %s\n", yarrow_state.ys_seeded ? 
"true" : "false"); + printf("random: fast - thresh %d,1 - ", yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_thresh); for (i = RANDOM_START; i < ENTROPYSOURCE; i++) printf(" %d", yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_source_bits[i]); printf("\n"); - printf("random: %s - slow - thresh %d,%d - ", __func__, yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_thresh, yarrow_state.ys_slowoverthresh); + printf("random: slow - thresh %d,%d - ", yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_thresh, yarrow_state.ys_slowoverthresh); for (i = RANDOM_START; i < ENTROPYSOURCE; i++) printf(" %d", yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_source_bits[i]); printf("\n"); @@ -338,7 +337,7 @@ random_yarrow_reseed_internal(u_int fastslow) #endif /* Unblock the device if it was blocked due to being unseeded */ if (!yarrow_state.ys_seeded) { - yarrow_state.ys_seeded = 1; + yarrow_state.ys_seeded = true; randomdev_unblock(); } } @@ -395,47 +394,7 @@ random_yarrow_read(uint8_t *buf, u_int bytecount) RANDOM_RESEED_UNLOCK(); } -/* Internal function to hand external entropy to the PRNG. */ -void -random_yarrow_write(uint8_t *buf, u_int count) -{ - static u_int destination = 0; - static struct harvest_event event; - struct randomdev_hash hash; - uint32_t entropy_data[RANDOM_KEYSIZE_WORDS], timestamp; - int i; - - /* Extra timing here is helpful to scrape scheduler timing entropy */ - randomdev_hash_init(&hash); - timestamp = (uint32_t)get_cyclecount(); - randomdev_hash_iterate(&hash, ×tamp, sizeof(timestamp)); - randomdev_hash_iterate(&hash, buf, count); - timestamp = (uint32_t)get_cyclecount(); - randomdev_hash_iterate(&hash, ×tamp, sizeof(timestamp)); - randomdev_hash_finish(&hash, entropy_data); - explicit_bzero(&hash, sizeof(hash)); - for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) { - event.he_somecounter = (uint32_t)get_cyclecount(); - event.he_size = sizeof(event.he_entropy); - event.he_bits = event.he_size/8; - event.he_source = RANDOM_CACHED; - event.he_destination = destination++; /* Harmless cheating */ - memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy)); - random_yarrow_process_event(&event); - } - explicit_bzero(entropy_data, sizeof(entropy_data)); -} - -void -random_yarrow_reseed(void) -{ - - RANDOM_RESEED_LOCK(); - random_yarrow_reseed_internal(RANDOM_YARROW_SLOW); - RANDOM_RESEED_UNLOCK(); -} - -int +bool random_yarrow_seeded(void) { diff --git a/sys/dev/usb/controller/dwc_otg.c b/sys/dev/usb/controller/dwc_otg.c index bd3e51b..e018ab5 100644 --- a/sys/dev/usb/controller/dwc_otg.c +++ b/sys/dev/usb/controller/dwc_otg.c @@ -1,7 +1,7 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015 Daisuke Aoyama. All rights reserved. - * Copyright (c) 2012 Hans Petter Selasky. All rights reserved. + * Copyright (c) 2012-2015 Hans Petter Selasky. All rights reserved. * Copyright (c) 2010-2011 Aleksandr Rybalko. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -597,14 +597,18 @@ dwc_otg_clear_hcint(struct dwc_otg_softc *sc, uint8_t x) } static uint8_t -dwc_otg_host_check_fifo_empty(struct dwc_otg_softc *sc, struct dwc_otg_td *td) +dwc_otg_host_check_tx_fifo_empty(struct dwc_otg_softc *sc, struct dwc_otg_td *td) { uint32_t temp; temp = DWC_OTG_READ_4(sc, DOTG_GINTSTS); - if (td->ep_type == UE_INTERRUPT || - td->ep_type == UE_ISOCHRONOUS) { + if (td->ep_type == UE_ISOCHRONOUS) { + /* + * NOTE: USB INTERRUPT transactions are executed like + * USB CONTROL transactions! 
See the setup standard + * chain function for more information. + */ if (!(temp & GINTSTS_PTXFEMP)) { DPRINTF("Periodic TX FIFO is not empty\n"); if (!(sc->sc_irq_mask & GINTMSK_PTXFEMPMSK)) { @@ -631,8 +635,10 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc, struct dwc_otg_td *td, uint8_t is_out) { uint8_t x; + uint8_t y; + uint8_t z; - if (td->channel < DWC_OTG_MAX_CHANNELS) + if (td->channel[0] < DWC_OTG_MAX_CHANNELS) return (0); /* already allocated */ /* check if device is suspended */ @@ -641,20 +647,42 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc, /* compute needed TX FIFO size */ if (is_out != 0) { - if (dwc_otg_host_check_fifo_empty(sc, td) != 0) + if (dwc_otg_host_check_tx_fifo_empty(sc, td) != 0) return (1); /* busy - cannot transfer data */ } - - for (x = 0; x != sc->sc_host_ch_max; x++) { + z = td->max_packet_count; + for (x = y = 0; x != sc->sc_host_ch_max; x++) { /* check if channel is allocated */ if (sc->sc_chan_state[x].allocated != 0) continue; /* check if channel is still enabled */ if (sc->sc_chan_state[x].wait_halted != 0) continue; + /* store channel number */ + td->channel[y++] = x; + /* check if we got all channels */ + if (y == z) + break; + } + if (y != z) { + /* reset channel variable */ + td->channel[0] = DWC_OTG_MAX_CHANNELS; + td->channel[1] = DWC_OTG_MAX_CHANNELS; + td->channel[2] = DWC_OTG_MAX_CHANNELS; + /* wait a bit */ + dwc_otg_enable_sof_irq(sc); + return (1); /* busy - not enough channels */ + } + + for (y = 0; y != z; y++) { + x = td->channel[y]; + /* set allocated */ sc->sc_chan_state[x].allocated = 1; + /* set wait halted */ + sc->sc_chan_state[x].wait_halted = 1; + /* clear interrupts */ dwc_otg_clear_hcint(sc, x); @@ -663,29 +691,22 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc, /* set active channel */ sc->sc_active_rx_ep |= (1 << x); - - /* set channel */ - td->channel = x; - - return (0); /* allocated */ } - /* wait a bit */ - dwc_otg_enable_sof_irq(sc); - return (1); /* busy */ + return (0); /* allocated */ } static void -dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td) +dwc_otg_host_channel_free_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td, uint8_t index) { uint32_t hcchar; uint8_t x; - if (td->channel >= DWC_OTG_MAX_CHANNELS) + if (td->channel[index] >= DWC_OTG_MAX_CHANNELS) return; /* already freed */ /* free channel */ - x = td->channel; - td->channel = DWC_OTG_MAX_CHANNELS; + x = td->channel[index]; + td->channel[index] = DWC_OTG_MAX_CHANNELS; DPRINTF("CH=%d\n", x); @@ -704,26 +725,42 @@ dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td) /* clear active channel */ sc->sc_active_rx_ep &= ~(1 << x); + /* check if already halted */ + if (sc->sc_chan_state[x].wait_halted == 0) + return; + /* disable host channel */ hcchar = DWC_OTG_READ_4(sc, DOTG_HCCHAR(x)); if (hcchar & HCCHAR_CHENA) { DPRINTF("Halting channel %d\n", x); DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(x), hcchar | HCCHAR_CHDIS); - sc->sc_chan_state[x].wait_halted = 1; /* don't write HCCHAR until the channel is halted */ + } else { + sc->sc_chan_state[x].wait_halted = 0; } } static void +dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td) +{ + uint8_t x; + for (x = 0; x != td->max_packet_count; x++) + dwc_otg_host_channel_free_sub(sc, td, x); +} + +static void dwc_otg_host_dump_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) { + uint8_t x; /* dump any pending messages */ - if (sc->sc_last_rx_status != 0) { - if (td->channel < DWC_OTG_MAX_CHANNELS && - td->channel == 
GRXSTSRD_CHNUM_GET(sc->sc_last_rx_status)) { - dwc_otg_common_rx_ack(sc); - } + if (sc->sc_last_rx_status == 0) + return; + for (x = 0; x != td->max_packet_count; x++) { + if (td->channel[x] >= DWC_OTG_MAX_CHANNELS || + td->channel[x] != GRXSTSRD_CHNUM_GET(sc->sc_last_rx_status)) + continue; + dwc_otg_common_rx_ack(sc); } } @@ -737,13 +774,13 @@ dwc_otg_host_setup_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) dwc_otg_host_dump_rx(sc, td); - if (td->channel < DWC_OTG_MAX_CHANNELS) { - hcint = sc->sc_chan_state[td->channel].hcint; + if (td->channel[0] < DWC_OTG_MAX_CHANNELS) { + hcint = sc->sc_chan_state[td->channel[0]].hcint; DPRINTF("CH=%d ST=%d HCINT=0x%08x HCCHAR=0x%08x HCTSIZ=0x%08x\n", - td->channel, td->state, hcint, - DWC_OTG_READ_4(sc, DOTG_HCCHAR(td->channel)), - DWC_OTG_READ_4(sc, DOTG_HCTSIZ(td->channel))); + td->channel[0], td->state, hcint, + DWC_OTG_READ_4(sc, DOTG_HCCHAR(td->channel[0])), + DWC_OTG_READ_4(sc, DOTG_HCTSIZ(td->channel[0]))); } else { hcint = 0; goto check_state; @@ -753,12 +790,12 @@ dwc_otg_host_setup_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) HCINT_ACK | HCINT_NYET)) { /* give success bits priority over failure bits */ } else if (hcint & HCINT_STALL) { - DPRINTF("CH=%d STALL\n", td->channel); + DPRINTF("CH=%d STALL\n", td->channel[0]); td->error_stall = 1; td->error_any = 1; goto complete; } else if (hcint & HCINT_ERRORS) { - DPRINTF("CH=%d ERROR\n", td->channel); + DPRINTF("CH=%d ERROR\n", td->channel[0]); td->errcnt++; if (td->hcsplt != 0 || td->errcnt >= 3) { td->error_any = 1; @@ -863,23 +900,23 @@ send_pkt: usbd_copy_out(td->pc, 0, &req, sizeof(req)); - DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel), + DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel[0]), (sizeof(req) << HCTSIZ_XFERSIZE_SHIFT) | (1 << HCTSIZ_PKTCNT_SHIFT) | (HCTSIZ_PID_SETUP << HCTSIZ_PID_SHIFT)); - DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel), td->hcsplt); + DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel[0]), td->hcsplt); hcchar = td->hcchar; hcchar &= ~(HCCHAR_EPDIR_IN | HCCHAR_EPTYPE_MASK); hcchar |= UE_CONTROL << HCCHAR_EPTYPE_SHIFT; /* must enable channel before writing data to FIFO */ - DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel), hcchar); + DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel[0]), hcchar); /* transfer data into FIFO */ bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl, - DOTG_DFIFO(td->channel), (uint32_t *)&req, sizeof(req) / 4); + DOTG_DFIFO(td->channel[0]), (uint32_t *)&req, sizeof(req) / 4); /* wait until next slot before trying complete split */ td->tt_complete_slot = sc->sc_last_frame_num + 1; @@ -916,17 +953,17 @@ send_cpkt: td->hcsplt |= HCSPLT_COMPSPLT; td->state = DWC_CHAN_ST_WAIT_C_ANE; - DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel), + DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel[0]), (HCTSIZ_PID_SETUP << HCTSIZ_PID_SHIFT)); - DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel), td->hcsplt); + DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel[0]), td->hcsplt); hcchar = td->hcchar; hcchar &= ~(HCCHAR_EPDIR_IN | HCCHAR_EPTYPE_MASK); hcchar |= UE_CONTROL << HCCHAR_EPTYPE_SHIFT; /* must enable channel before writing data to FIFO */ - DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel), hcchar); + DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel[0]), hcchar); busy: return (1); /* busy */ @@ -1060,50 +1097,51 @@ dwc_otg_host_rate_check_interrupt(struct dwc_otg_softc *sc, struct dwc_otg_td *t static uint8_t dwc_otg_host_rate_check(struct dwc_otg_softc *sc, struct dwc_otg_td *td) { + uint8_t frame_num = (uint8_t)sc->sc_last_frame_num; + if (td->ep_type == UE_ISOCHRONOUS) { /* non TT 
isochronous traffic */ - if ((td->tmr_val != 0) || - (sc->sc_last_frame_num & (td->tmr_res - 1))) { + if (frame_num & (td->tmr_res - 1)) goto busy; - } - td->tmr_val = 1; /* executed */ + if ((frame_num ^ td->tmr_val) & td->tmr_res) + goto busy; + td->tmr_val = td->tmr_res + sc->sc_last_frame_num; td->toggle = 0; - + return (0); } else if (td->ep_type == UE_INTERRUPT) { if (!td->tt_scheduled) goto busy; td->tt_scheduled = 0; + return (0); } else if (td->did_nak != 0) { - uint8_t frame_num = (uint8_t)sc->sc_last_frame_num; /* check if we should pause sending queries for 125us */ if (td->tmr_res == frame_num) { /* wait a bit */ dwc_otg_enable_sof_irq(sc); goto busy; } - /* query for data one more time */ - td->tmr_res = frame_num; - td->did_nak = 0; } else if (td->set_toggle) { td->set_toggle = 0; td->toggle = 1; } + /* query for data one more time */ + td->tmr_res = frame_num; + td->did_nak = 0; return (0); busy: return (1); } static uint8_t -dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td) +dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td, + uint8_t channel) { uint32_t count; - uint8_t channel; /* check endpoint status */ if (sc->sc_last_rx_status == 0) goto busy; - channel = td->channel; if (channel >= DWC_OTG_MAX_CHANNELS) goto busy; @@ -1128,21 +1166,22 @@ dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td) /* get the packet byte count */ count = GRXSTSRD_BCNT_GET(sc->sc_last_rx_status); - /* check for isochronous transfer or high-speed bandwidth endpoint */ - if (td->ep_type == UE_ISOCHRONOUS || td->max_packet_count > 1) { - if ((sc->sc_last_rx_status & GRXSTSRD_DPID_MASK) != GRXSTSRD_DPID_DATA0) { + /* check for ISOCHRONOUS endpoint */ + if (td->ep_type == UE_ISOCHRONOUS) { + if ((sc->sc_last_rx_status & GRXSTSRD_DPID_MASK) != + GRXSTSRD_DPID_DATA0) { + /* more data to be received */ td->tt_xactpos = HCSPLT_XACTPOS_MIDDLE; } else { + /* all data received */ td->tt_xactpos = HCSPLT_XACTPOS_BEGIN; - /* verify the packet byte count */ - if (count < td->max_packet_size) { + if (count != td->remainder) { /* we have a short packet */ td->short_pkt = 1; td->got_short = 1; } } - td->toggle = 0; } else { /* verify the packet byte count */ if (count != td->max_packet_size) { @@ -1194,15 +1233,17 @@ complete: static uint8_t dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) { - uint32_t hcint; + uint32_t hcint = 0; uint32_t hcchar; uint8_t delta; uint8_t channel; + uint8_t x; - channel = td->channel; - - if (channel < DWC_OTG_MAX_CHANNELS) { - hcint = sc->sc_chan_state[channel].hcint; + for (x = 0; x != td->max_packet_count; x++) { + channel = td->channel[x]; + if (channel >= DWC_OTG_MAX_CHANNELS) + continue; + hcint |= sc->sc_chan_state[channel].hcint; DPRINTF("CH=%d ST=%d HCINT=0x%08x HCCHAR=0x%08x HCTSIZ=0x%08x\n", channel, td->state, hcint, @@ -1230,19 +1271,17 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) } /* check channels for data, if any */ - if (dwc_otg_host_data_rx_sub(sc, td)) + if (dwc_otg_host_data_rx_sub(sc, td, channel)) goto complete; /* refresh interrupt status */ - hcint = sc->sc_chan_state[channel].hcint; + hcint |= sc->sc_chan_state[channel].hcint; if (hcint & (HCINT_ERRORS | HCINT_RETRY | HCINT_ACK | HCINT_NYET)) { if (!(hcint & HCINT_ERRORS)) td->errcnt = 0; } - } else { - hcint = 0; } switch (td->state) { @@ -1269,6 +1308,8 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) td->toggle ^= 1; goto receive_pkt; } + } else if (td->ep_type 
== UE_ISOCHRONOUS) { + goto complete; } td->did_nak = 1; td->tt_scheduled = 0; @@ -1292,12 +1333,12 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) if (td->ep_type == UE_ISOCHRONOUS) { /* check if we are complete */ - if ((td->remainder == 0) || - (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN)) { + if (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN) { goto complete; + } else { + /* get more packets */ + goto busy; } - /* get another packet */ - goto receive_pkt; } else { /* check if we are complete */ if ((td->remainder == 0) || (td->got_short != 0)) { @@ -1365,8 +1406,7 @@ receive_pkt: } /* complete split */ td->hcsplt |= HCSPLT_COMPSPLT; - } else if (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN && - dwc_otg_host_rate_check(sc, td)) { + } else if (dwc_otg_host_rate_check(sc, td)) { td->state = DWC_CHAN_ST_WAIT_C_PKT; goto busy; } @@ -1377,8 +1417,6 @@ receive_pkt: goto busy; } - channel = td->channel; - /* set toggle, if any */ if (td->set_toggle) { td->set_toggle = 0; @@ -1387,28 +1425,31 @@ receive_pkt: td->state = DWC_CHAN_ST_WAIT_ANE; - /* receive one packet */ - DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), - (td->max_packet_size << HCTSIZ_XFERSIZE_SHIFT) | - (1 << HCTSIZ_PKTCNT_SHIFT) | - (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) : - (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT))); + for (x = 0; x != td->max_packet_count; x++) { + channel = td->channel[x]; - DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt); + /* receive one packet */ + DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), + (td->max_packet_size << HCTSIZ_XFERSIZE_SHIFT) | + (1 << HCTSIZ_PKTCNT_SHIFT) | + (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) : + (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT))); - hcchar = td->hcchar; - hcchar |= HCCHAR_EPDIR_IN; + DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt); - /* receive complete split ASAP */ - if ((sc->sc_last_frame_num & 1) != 0 && - (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS)) - hcchar |= HCCHAR_ODDFRM; - else - hcchar &= ~HCCHAR_ODDFRM; + hcchar = td->hcchar; + hcchar |= HCCHAR_EPDIR_IN; - /* must enable channel before data can be received */ - DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar); + /* receive complete split ASAP */ + if ((sc->sc_last_frame_num & 1) != 0 && + td->ep_type == UE_ISOCHRONOUS) + hcchar |= HCCHAR_ODDFRM; + else + hcchar &= ~HCCHAR_ODDFRM; + /* must enable channel before data can be received */ + DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar); + } /* wait until next slot before trying complete split */ td->tt_complete_slot = sc->sc_last_frame_num + 1; @@ -1437,7 +1478,7 @@ receive_spkt: goto busy; } - channel = td->channel; + channel = td->channel[0]; td->hcsplt &= ~HCSPLT_COMPSPLT; td->state = DWC_CHAN_ST_WAIT_S_ANE; @@ -1450,7 +1491,7 @@ receive_spkt: /* send after next SOF event */ if ((sc->sc_last_frame_num & 1) == 0 && - (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS)) + td->ep_type == UE_ISOCHRONOUS) td->hcchar |= HCCHAR_ODDFRM; else td->hcchar &= ~HCCHAR_ODDFRM; @@ -1605,10 +1646,12 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) uint32_t hcchar; uint8_t delta; uint8_t channel; + uint8_t x; dwc_otg_host_dump_rx(sc, td); - channel = td->channel; + /* check that last channel is complete */ + channel = td->channel[td->npkt]; if (channel < DWC_OTG_MAX_CHANNELS) { hcint = sc->sc_chan_state[channel].hcint; @@ -1658,7 +1701,11 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) td->offset += td->tx_bytes; td->remainder -= td->tx_bytes; td->toggle ^= 1; - 
td->did_nak = 0; + /* check if next response will be a NAK */ + if (hcint & HCINT_NYET) + td->did_nak = 1; + else + td->did_nak = 0; td->tt_scheduled = 0; /* check remainder */ @@ -1715,33 +1762,13 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td) goto send_cpkt; case DWC_CHAN_ST_TX_WAIT_ISOC: - - /* Check if isochronous OUT traffic is complete */ + /* Check if ISOCHRONOUS OUT traffic is complete */ if ((hcint & HCINT_HCH_DONE_MASK) == 0) break; td->offset += td->tx_bytes; td->remainder -= td->tx_bytes; - - if (td->hcsplt != 0 || td->remainder == 0) - goto complete; - - /* check for next packet */ - if (td->max_packet_count > 1) - td->tt_xactpos++; - - /* free existing channel, if any */ - dwc_otg_host_channel_free(sc, td); - - td->state = DWC_CHAN_ST_TX_PKT_ISOC; - - /* FALLTHROUGH */ - - case DWC_CHAN_ST_TX_PKT_ISOC: - if (dwc_otg_host_channel_alloc(sc, td, 1)) - break; - channel = td->channel; - goto send_isoc_pkt; + goto complete; default: break; } @@ -1775,8 +1802,6 @@ send_pkt: goto busy; } - channel = td->channel; - /* set toggle, if any */ if (td->set_toggle) { td->set_toggle = 0; @@ -1784,8 +1809,7 @@ send_pkt: } if (td->ep_type == UE_ISOCHRONOUS) { -send_isoc_pkt: - /* Isochronous OUT transfers don't have any ACKs */ + /* ISOCHRONOUS OUT transfers don't have any ACKs */ td->state = DWC_CHAN_ST_TX_WAIT_ISOC; td->hcsplt &= ~HCSPLT_COMPSPLT; if (td->hcsplt != 0) { @@ -1799,123 +1823,110 @@ send_isoc_pkt: /* Update transaction position */ td->hcsplt &= ~HCSPLT_XACTPOS_MASK; td->hcsplt |= (HCSPLT_XACTPOS_ALL << HCSPLT_XACTPOS_SHIFT); - } else { - /* send one packet at a time */ - count = td->max_packet_size; - if (td->remainder < count) { - /* we have a short packet */ - td->short_pkt = 1; - count = td->remainder; - } } } else if (td->hcsplt != 0) { - td->hcsplt &= ~HCSPLT_COMPSPLT; - /* Wait for ACK/NAK/ERR from TT */ td->state = DWC_CHAN_ST_WAIT_S_ANE; - - /* send one packet at a time */ - count = td->max_packet_size; - if (td->remainder < count) { - /* we have a short packet */ - td->short_pkt = 1; - count = td->remainder; - } } else { /* Wait for ACK/NAK/STALL from device */ td->state = DWC_CHAN_ST_WAIT_ANE; + } + + td->tx_bytes = 0; + + for (x = 0; x != td->max_packet_count; x++) { + uint32_t rem_bytes; + + channel = td->channel[x]; /* send one packet at a time */ count = td->max_packet_size; - if (td->remainder < count) { + rem_bytes = td->remainder - td->tx_bytes; + if (rem_bytes < count) { /* we have a short packet */ td->short_pkt = 1; - count = td->remainder; - } - } - - /* check for High-Speed multi-packets */ - if ((td->hcsplt == 0) && (td->max_packet_count > 1)) { - if (td->npkt == 0) { - if (td->remainder >= (3 * td->max_packet_size)) - td->npkt = 3; - else if (td->remainder >= (2 * td->max_packet_size)) - td->npkt = 2; - else - td->npkt = 1; - - if (td->npkt > td->max_packet_count) - td->npkt = td->max_packet_count; - - td->tt_xactpos = 1; /* overload */ + count = rem_bytes; } - if (td->tt_xactpos == td->npkt) { - if (td->npkt == 1) { + if (count == rem_bytes) { + /* last packet */ + switch (x) { + case 0: DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), (count << HCTSIZ_XFERSIZE_SHIFT) | (1 << HCTSIZ_PKTCNT_SHIFT) | - (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)); - } else if (td->npkt == 2) { + (td->toggle ? 
(HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) : + (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT))); + break; + case 1: DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), (count << HCTSIZ_XFERSIZE_SHIFT) | (1 << HCTSIZ_PKTCNT_SHIFT) | (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT)); - } else { + break; + default: DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), (count << HCTSIZ_XFERSIZE_SHIFT) | (1 << HCTSIZ_PKTCNT_SHIFT) | (HCTSIZ_PID_DATA2 << HCTSIZ_PID_SHIFT)); + break; } - td->npkt = 0; - } else { + } else if (td->ep_type == UE_ISOCHRONOUS && + td->max_packet_count > 1) { + /* ISOCHRONOUS multi packet */ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), (count << HCTSIZ_XFERSIZE_SHIFT) | (1 << HCTSIZ_PKTCNT_SHIFT) | (HCTSIZ_PID_MDATA << HCTSIZ_PID_SHIFT)); + } else { + /* TODO: HCTSIZ_DOPNG */ + /* standard BULK/INTERRUPT/CONTROL packet */ + DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), + (count << HCTSIZ_XFERSIZE_SHIFT) | + (1 << HCTSIZ_PKTCNT_SHIFT) | + (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) : + (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT))); } - } else { - /* TODO: HCTSIZ_DOPNG */ - DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel), - (count << HCTSIZ_XFERSIZE_SHIFT) | - (1 << HCTSIZ_PKTCNT_SHIFT) | - (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) : - (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT))); - } + DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt); - DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt); + hcchar = td->hcchar; + hcchar &= ~HCCHAR_EPDIR_IN; - hcchar = td->hcchar; - hcchar &= ~HCCHAR_EPDIR_IN; + /* send after next SOF event */ + if ((sc->sc_last_frame_num & 1) == 0 && + td->ep_type == UE_ISOCHRONOUS) + hcchar |= HCCHAR_ODDFRM; + else + hcchar &= ~HCCHAR_ODDFRM; - /* send after next SOF event */ - if ((sc->sc_last_frame_num & 1) == 0 && - (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS)) - hcchar |= HCCHAR_ODDFRM; - else - hcchar &= ~HCCHAR_ODDFRM; + /* must enable before writing data to FIFO */ + DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar); - /* must enable before writing data to FIFO */ - DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar); + if (count != 0) { + /* clear topmost word before copy */ + sc->sc_tx_bounce_buffer[(count - 1) / 4] = 0; - if (count != 0) { + /* copy out data */ + usbd_copy_out(td->pc, td->offset + td->tx_bytes, + sc->sc_tx_bounce_buffer, count); - /* clear topmost word before copy */ - sc->sc_tx_bounce_buffer[(count - 1) / 4] = 0; + /* transfer data into FIFO */ + bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl, + DOTG_DFIFO(channel), + sc->sc_tx_bounce_buffer, (count + 3) / 4); + } - /* copy out data */ - usbd_copy_out(td->pc, td->offset, - sc->sc_tx_bounce_buffer, count); + /* store number of bytes transmitted */ + td->tx_bytes += count; - /* transfer data into FIFO */ - bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl, - DOTG_DFIFO(channel), - sc->sc_tx_bounce_buffer, (count + 3) / 4); + /* store last packet index */ + td->npkt = x; + + /* check for last packet */ + if (count == rem_bytes) + break; } - - /* store number of bytes transmitted */ - td->tx_bytes = count; goto busy; send_cpkt: @@ -1941,7 +1952,7 @@ send_cpkt: goto busy; } - channel = td->channel; + channel = td->channel[0]; td->hcsplt |= HCSPLT_COMPSPLT; td->state = DWC_CHAN_ST_WAIT_C_ANE; @@ -1956,7 +1967,7 @@ send_cpkt: /* receive complete split ASAP */ if ((sc->sc_last_frame_num & 1) != 0 && - (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS)) + td->ep_type == UE_ISOCHRONOUS) hcchar |= HCCHAR_ODDFRM; else hcchar &= ~HCCHAR_ODDFRM; @@ -2383,9 +2394,6 @@ 
dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc) if ((td->hcchar & HCCHAR_EPDIR_IN) != 0) continue; - /* execute more frames */ - td->tmr_val = 0; - sc->sc_needsof = 1; if (td->hcsplt == 0 || td->tt_scheduled != 0) @@ -2417,9 +2425,6 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc) if ((td->hcchar & HCCHAR_EPDIR_IN) == 0) continue; - /* execute more frames */ - td->tmr_val = 0; - sc->sc_needsof = 1; if (td->hcsplt == 0 || td->tt_scheduled != 0) @@ -2513,10 +2518,10 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc) TAILQ_CONCAT(&head, &sc->sc_bus.intr_q.head, wait_entry); TAILQ_CONCAT(&sc->sc_bus.intr_q.head, &head, wait_entry); - /* put non-TT BULK transfers last */ + /* put non-TT non-ISOCHRONOUS transfers last */ TAILQ_FOREACH_SAFE(xfer, &sc->sc_bus.intr_q.head, wait_entry, xfer_next) { td = xfer->td_transfer_cache; - if (td == NULL || td->hcsplt != 0 || td->ep_type != UE_BULK) + if (td == NULL || td->hcsplt != 0 || td->ep_type == UE_ISOCHRONOUS) continue; TAILQ_REMOVE(&sc->sc_bus.intr_q.head, xfer, wait_entry); TAILQ_INSERT_TAIL(&head, xfer, wait_entry); @@ -2551,11 +2556,19 @@ static void dwc_otg_interrupt_poll_locked(struct dwc_otg_softc *sc) { struct usb_xfer *xfer; - uint32_t count = 0; + uint32_t count; uint32_t temp; uint8_t got_rx_status; uint8_t x; + if (sc->sc_flags.status_device_mode == 0) { + /* + * Update host transfer schedule, so that new + * transfers can be issued: + */ + dwc_otg_update_host_transfer_schedule_locked(sc); + } + count = 0; repeat: if (++count == 16) { /* give other interrupts a chance */ @@ -2659,12 +2672,6 @@ repeat: sc->sc_irq_mask &= ~GINTMSK_RXFLVLMSK; DWC_OTG_WRITE_4(sc, DOTG_GINTMSK, sc->sc_irq_mask); } - - if (sc->sc_flags.status_device_mode == 0 && sc->sc_xfer_complete == 0) { - /* update host transfer schedule, so that new transfers can be issued */ - if (dwc_otg_update_host_transfer_schedule_locked(sc)) - goto repeat; - } } static void @@ -2944,12 +2951,6 @@ dwc_otg_interrupt(void *arg) /* complete FIFOs, if any */ dwc_otg_interrupt_complete_locked(sc); - - if (sc->sc_flags.status_device_mode == 0) { - /* update host transfer schedule, so that new transfers can be issued */ - if (dwc_otg_update_host_transfer_schedule_locked(sc)) - dwc_otg_interrupt_poll_locked(sc); - } } USB_BUS_SPIN_UNLOCK(&sc->sc_bus); USB_BUS_UNLOCK(&sc->sc_bus); @@ -2982,7 +2983,9 @@ dwc_otg_setup_standard_chain_sub(struct dwc_otg_std_temp *temp) td->set_toggle = 0; td->got_short = 0; td->did_nak = 0; - td->channel = DWC_OTG_MAX_CHANNELS; + td->channel[0] = DWC_OTG_MAX_CHANNELS; + td->channel[1] = DWC_OTG_MAX_CHANNELS; + td->channel[2] = DWC_OTG_MAX_CHANNELS; td->state = 0; td->errcnt = 0; td->tt_scheduled = 0; @@ -3247,8 +3250,10 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer) td->tmr_val = sc->sc_tmr_val + ival; td->tmr_res = ival; } else if (td->ep_type == UE_ISOCHRONOUS) { - td->tmr_val = 0; td->tmr_res = 1; + td->tmr_val = sc->sc_last_frame_num; + if (td->hcchar & HCCHAR_EPDIR_IN) + td->tmr_val++; } else { td->tmr_val = 0; td->tmr_res = (uint8_t)sc->sc_last_frame_num; @@ -3258,10 +3263,8 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer) hcsplt = 0; if (td->ep_type == UE_INTERRUPT) { uint32_t ival; -#if 0 hcchar |= ((xfer->max_packet_count & 3) << HCCHAR_MC_SHIFT); -#endif ival = xfer->interval / DWC_OTG_HOST_TIMER_RATE; if (ival == 0) ival = 1; @@ -3272,8 +3275,11 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer) } else if (td->ep_type == UE_ISOCHRONOUS) { hcchar |= 
((xfer->max_packet_count & 3) << HCCHAR_MC_SHIFT); - td->tmr_val = 0; td->tmr_res = 1 << usbd_xfer_get_fps_shift(xfer); + td->tmr_val = sc->sc_last_frame_num; + if (td->hcchar & HCCHAR_EPDIR_IN) + td->tmr_val += td->tmr_res; + } else { td->tmr_val = 0; td->tmr_res = (uint8_t)sc->sc_last_frame_num; @@ -3330,6 +3336,19 @@ dwc_otg_start_standard_chain(struct usb_xfer *xfer) dwc_otg_xfer_do_fifo(sc, xfer); if (dwc_otg_xfer_do_complete_locked(sc, xfer)) goto done; + } else { + struct dwc_otg_td *td = xfer->td_transfer_cache; + if (td->ep_type == UE_ISOCHRONOUS && + (td->hcchar & HCCHAR_EPDIR_IN) == 0) { + /* + * Need to start ISOCHRONOUS OUT transfer ASAP + * because execution is delayed by one 125us + * microframe: + */ + dwc_otg_xfer_do_fifo(sc, xfer); + if (dwc_otg_xfer_do_complete_locked(sc, xfer)) + goto done; + } } /* put transfer on interrupt queue */ @@ -3950,11 +3969,6 @@ dwc_otg_do_poll(struct usb_bus *bus) USB_BUS_SPIN_LOCK(&sc->sc_bus); dwc_otg_interrupt_poll_locked(sc); dwc_otg_interrupt_complete_locked(sc); - if (sc->sc_flags.status_device_mode == 0) { - /* update host transfer schedule, so that new transfers can be issued */ - if (dwc_otg_update_host_transfer_schedule_locked(sc)) - dwc_otg_interrupt_poll_locked(sc); - } USB_BUS_SPIN_UNLOCK(&sc->sc_bus); USB_BUS_UNLOCK(&sc->sc_bus); } @@ -4728,6 +4742,9 @@ dwc_otg_xfer_setup(struct usb_setup_params *parm) /* init TD */ td->max_packet_size = xfer->max_packet_size; td->max_packet_count = xfer->max_packet_count; + /* range check */ + if (td->max_packet_count == 0 || td->max_packet_count > 3) + td->max_packet_count = 1; td->ep_no = ep_no; td->ep_type = ep_type; td->obj_next = last_obj; @@ -4766,12 +4783,13 @@ dwc_otg_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc, return; } } else { - if (udev->speed == USB_SPEED_HIGH) { - if ((UGETW(edesc->wMaxPacketSize) >> 11) & 3) { - /* high bandwidth endpoint - not tested */ - DPRINTF("High Bandwidth Endpoint - not tested\n"); - return; - } + if (udev->speed == USB_SPEED_HIGH && + (edesc->wMaxPacketSize[1] & 0x18) != 0 && + (edesc->bmAttributes & UE_XFERTYPE) != UE_ISOCHRONOUS) { + /* not supported */ + DPRINTFN(-1, "Non-isochronous high bandwidth " + "endpoint not supported\n"); + return; } } if ((edesc->bmAttributes & UE_XFERTYPE) == UE_ISOCHRONOUS) diff --git a/sys/dev/usb/controller/dwc_otg.h b/sys/dev/usb/controller/dwc_otg.h index 39c9529..f5e9887 100644 --- a/sys/dev/usb/controller/dwc_otg.h +++ b/sys/dev/usb/controller/dwc_otg.h @@ -69,7 +69,7 @@ struct dwc_otg_td { uint8_t tmr_val; uint8_t ep_no; uint8_t ep_type; - uint8_t channel; + uint8_t channel[3]; uint8_t tt_index; /* TT data */ uint8_t tt_start_slot; /* TT data */ uint8_t tt_complete_slot; /* TT data */ @@ -80,8 +80,7 @@ struct dwc_otg_td { #define DWC_CHAN_ST_WAIT_S_ANE 2 #define DWC_CHAN_ST_WAIT_C_ANE 3 #define DWC_CHAN_ST_WAIT_C_PKT 4 -#define DWC_CHAN_ST_TX_PKT_ISOC 5 -#define DWC_CHAN_ST_TX_WAIT_ISOC 6 +#define DWC_CHAN_ST_TX_WAIT_ISOC 5 uint8_t error_any:1; uint8_t error_stall:1; uint8_t alt_next:1; diff --git a/sys/dev/usb/controller/usb_controller.c b/sys/dev/usb/controller/usb_controller.c index 92ea6c5..9f7ce24 100644 --- a/sys/dev/usb/controller/usb_controller.c +++ b/sys/dev/usb/controller/usb_controller.c @@ -231,7 +231,8 @@ usb_detach(device_t dev) /* Get rid of USB callback processes */ usb_proc_free(USB_BUS_GIANT_PROC(bus)); - usb_proc_free(USB_BUS_NON_GIANT_PROC(bus)); + usb_proc_free(USB_BUS_NON_GIANT_ISOC_PROC(bus)); + usb_proc_free(USB_BUS_NON_GIANT_BULK_PROC(bus)); /* Get rid of USB 
explore process */ @@ -395,7 +396,8 @@ usb_bus_explore(struct usb_proc_msg *pm) */ usb_proc_rewakeup(USB_BUS_CONTROL_XFER_PROC(bus)); usb_proc_rewakeup(USB_BUS_GIANT_PROC(bus)); - usb_proc_rewakeup(USB_BUS_NON_GIANT_PROC(bus)); + usb_proc_rewakeup(USB_BUS_NON_GIANT_ISOC_PROC(bus)); + usb_proc_rewakeup(USB_BUS_NON_GIANT_BULK_PROC(bus)); #endif USB_BUS_UNLOCK(bus); @@ -860,9 +862,13 @@ usb_attach_sub(device_t dev, struct usb_bus *bus) &bus->bus_mtx, device_get_nameunit(dev), USB_PRI_MED)) { device_printf(dev, "WARNING: Creation of USB Giant " "callback process failed.\n"); - } else if (usb_proc_create(USB_BUS_NON_GIANT_PROC(bus), + } else if (usb_proc_create(USB_BUS_NON_GIANT_ISOC_PROC(bus), + &bus->bus_mtx, device_get_nameunit(dev), USB_PRI_HIGHEST)) { + device_printf(dev, "WARNING: Creation of USB non-Giant ISOC " + "callback process failed.\n"); + } else if (usb_proc_create(USB_BUS_NON_GIANT_BULK_PROC(bus), &bus->bus_mtx, device_get_nameunit(dev), USB_PRI_HIGH)) { - device_printf(dev, "WARNING: Creation of USB non-Giant " + device_printf(dev, "WARNING: Creation of USB non-Giant BULK " "callback process failed.\n"); } else if (usb_proc_create(USB_BUS_EXPLORE_PROC(bus), &bus->bus_mtx, device_get_nameunit(dev), USB_PRI_MED)) { diff --git a/sys/dev/usb/usb_bus.h b/sys/dev/usb/usb_bus.h index bdd1681..3ceeb1e 100644 --- a/sys/dev/usb/usb_bus.h +++ b/sys/dev/usb/usb_bus.h @@ -57,19 +57,26 @@ struct usb_bus { struct root_hold_token *bus_roothold; #endif +/* convenience macros */ +#define USB_BUS_TT_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus) +#define USB_BUS_CS_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus) + #if USB_HAVE_PER_BUS_PROCESS #define USB_BUS_GIANT_PROC(bus) (&(bus)->giant_callback_proc) -#define USB_BUS_NON_GIANT_PROC(bus) (&(bus)->non_giant_callback_proc) +#define USB_BUS_NON_GIANT_ISOC_PROC(bus) (&(bus)->non_giant_isoc_callback_proc) +#define USB_BUS_NON_GIANT_BULK_PROC(bus) (&(bus)->non_giant_bulk_callback_proc) #define USB_BUS_EXPLORE_PROC(bus) (&(bus)->explore_proc) #define USB_BUS_CONTROL_XFER_PROC(bus) (&(bus)->control_xfer_proc) - /* - * There are two callback processes. One for Giant locked - * callbacks. One for non-Giant locked callbacks. This should - * avoid congestion and reduce response time in most cases. + * There are three callback processes. One for Giant locked + * callbacks. One for non-Giant locked non-periodic callbacks + * and one for non-Giant locked periodic callbacks. This + * should avoid congestion and reduce response time in most + * cases. 
*/ struct usb_process giant_callback_proc; - struct usb_process non_giant_callback_proc; + struct usb_process non_giant_isoc_callback_proc; + struct usb_process non_giant_bulk_callback_proc; /* Explore process */ struct usb_process explore_proc; diff --git a/sys/dev/usb/usb_device.c b/sys/dev/usb/usb_device.c index 5ffc07f..13e2c14 100644 --- a/sys/dev/usb/usb_device.c +++ b/sys/dev/usb/usb_device.c @@ -2181,7 +2181,7 @@ usb_free_device(struct usb_device *udev, uint8_t flag) * anywhere: */ USB_BUS_LOCK(udev->bus); - usb_proc_mwait(USB_BUS_NON_GIANT_PROC(udev->bus), + usb_proc_mwait(USB_BUS_CS_PROC(udev->bus), &udev->cs_msg[0], &udev->cs_msg[1]); USB_BUS_UNLOCK(udev->bus); diff --git a/sys/dev/usb/usb_hub.c b/sys/dev/usb/usb_hub.c index 2f1459c..a54fa2e 100644 --- a/sys/dev/usb/usb_hub.c +++ b/sys/dev/usb/usb_hub.c @@ -346,7 +346,7 @@ uhub_tt_buffer_reset_async_locked(struct usb_device *child, struct usb_endpoint } up->req_reset_tt = req; /* get reset transfer started */ - usb_proc_msignal(USB_BUS_NON_GIANT_PROC(udev->bus), + usb_proc_msignal(USB_BUS_TT_PROC(udev->bus), &hub->tt_msg[0], &hub->tt_msg[1]); } #endif @@ -1579,7 +1579,7 @@ uhub_detach(device_t dev) #if USB_HAVE_TT_SUPPORT /* Make sure our TT messages are not queued anywhere */ USB_BUS_LOCK(bus); - usb_proc_mwait(USB_BUS_NON_GIANT_PROC(bus), + usb_proc_mwait(USB_BUS_TT_PROC(bus), &hub->tt_msg[0], &hub->tt_msg[1]); USB_BUS_UNLOCK(bus); #endif diff --git a/sys/dev/usb/usb_pf.c b/sys/dev/usb/usb_pf.c index 468eafb..82ad8e4 100644 --- a/sys/dev/usb/usb_pf.c +++ b/sys/dev/usb/usb_pf.c @@ -221,7 +221,13 @@ usbpf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) ubus = ifp->if_softc; unit = ifp->if_dunit; + /* + * Lock USB before clearing the "ifp" pointer, to avoid + * clearing the pointer in the middle of a TAP operation: + */ + USB_BUS_LOCK(ubus); ubus->ifp = NULL; + USB_BUS_UNLOCK(ubus); bpfdetach(ifp); if_detach(ifp); if_free(ifp); diff --git a/sys/dev/usb/usb_process.h b/sys/dev/usb/usb_process.h index c12cdc4..dd20afd 100644 --- a/sys/dev/usb/usb_process.h +++ b/sys/dev/usb/usb_process.h @@ -34,6 +34,7 @@ #endif /* defines */ +#define USB_PRI_HIGHEST PI_SWI(SWI_TTY) #define USB_PRI_HIGH PI_SWI(SWI_NET) #define USB_PRI_MED PI_SWI(SWI_CAMBIO) diff --git a/sys/dev/usb/usb_transfer.c b/sys/dev/usb/usb_transfer.c index 5650790..783a96c 100644 --- a/sys/dev/usb/usb_transfer.c +++ b/sys/dev/usb/usb_transfer.c @@ -872,6 +872,19 @@ done: } } +static uint8_t +usbd_transfer_setup_has_bulk(const struct usb_config *setup_start, + uint16_t n_setup) +{ + while (n_setup--) { + uint8_t type = setup_start[n_setup].type; + if (type == UE_BULK || type == UE_BULK_INTR || + type == UE_TYPE_ANY) + return (1); + } + return (0); +} + /*------------------------------------------------------------------------* * usbd_transfer_setup - setup an array of USB transfers * @@ -1013,9 +1026,12 @@ usbd_transfer_setup(struct usb_device *udev, else if (xfer_mtx == &Giant) info->done_p = USB_BUS_GIANT_PROC(udev->bus); + else if (usbd_transfer_setup_has_bulk(setup_start, n_setup)) + info->done_p = + USB_BUS_NON_GIANT_BULK_PROC(udev->bus); else info->done_p = - USB_BUS_NON_GIANT_PROC(udev->bus); + USB_BUS_NON_GIANT_ISOC_PROC(udev->bus); } /* reset sizes */ @@ -2280,10 +2296,8 @@ usbd_callback_ss_done_defer(struct usb_xfer *xfer) * will have a Lock Order Reversal, LOR, if we try to * proceed ! 
*/ - if (usb_proc_msignal(info->done_p, - &info->done_m[0], &info->done_m[1])) { - /* ignore */ - } + (void) usb_proc_msignal(info->done_p, + &info->done_m[0], &info->done_m[1]); } else { /* clear second recurse flag */ pq->recurse_2 = 0; @@ -2307,23 +2321,26 @@ usbd_callback_wrapper(struct usb_xfer_queue *pq) struct usb_xfer_root *info = xfer->xroot; USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); - if (!mtx_owned(info->xfer_mtx) && !SCHEDULER_STOPPED()) { + if ((pq->recurse_3 != 0 || mtx_owned(info->xfer_mtx) == 0) && + SCHEDULER_STOPPED() == 0) { /* * Cases that end up here: * * 5) HW interrupt done callback or other source. + * 6) HW completed transfer during callback */ - DPRINTFN(3, "case 5\n"); + DPRINTFN(3, "case 5 and 6\n"); /* * We have to postpone the callback due to the fact we * will have a Lock Order Reversal, LOR, if we try to - * proceed ! + * proceed! + * + * Postponing the callback also ensures that other USB + * transfer queues get a chance. */ - if (usb_proc_msignal(info->done_p, - &info->done_m[0], &info->done_m[1])) { - /* ignore */ - } + (void) usb_proc_msignal(info->done_p, + &info->done_m[0], &info->done_m[1]); return; } /* @@ -2381,8 +2398,11 @@ usbd_callback_wrapper(struct usb_xfer_queue *pq) } #if USB_HAVE_PF - if (xfer->usb_state != USB_ST_SETUP) + if (xfer->usb_state != USB_ST_SETUP) { + USB_BUS_LOCK(info->bus); usbpf_xfertap(xfer, USBPF_XFERTAP_DONE); + USB_BUS_UNLOCK(info->bus); + } #endif /* call processing routine */ (xfer->callback) (xfer, xfer->error); @@ -2694,7 +2714,7 @@ usbd_pipe_start(struct usb_xfer_queue *pq) } else if (udev->ctrl_xfer[1]) { info = udev->ctrl_xfer[1]->xroot; usb_proc_msignal( - USB_BUS_NON_GIANT_PROC(info->bus), + USB_BUS_CS_PROC(info->bus), &udev->cs_msg[0], &udev->cs_msg[1]); } else { /* should not happen */ @@ -3019,9 +3039,11 @@ usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer) if (!pq->recurse_1) { - do { + /* clear third recurse flag */ + pq->recurse_3 = 0; - /* set both recurse flags */ + do { + /* set two first recurse flags */ pq->recurse_1 = 1; pq->recurse_2 = 1; @@ -3040,6 +3062,12 @@ usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer) (pq->command) (pq); DPRINTFN(6, "cb %p (leave)\n", pq->curr); + /* + * Set third recurse flag to indicate + * recursion happened: + */ + pq->recurse_3 = 1; + } while (!pq->recurse_2); /* clear first recurse flag */ @@ -3315,7 +3343,8 @@ usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max) USB_BUS_CONTROL_XFER_PROC(udev->bus)->up_msleep = 0; USB_BUS_EXPLORE_PROC(udev->bus)->up_msleep = 0; USB_BUS_GIANT_PROC(udev->bus)->up_msleep = 0; - USB_BUS_NON_GIANT_PROC(udev->bus)->up_msleep = 0; + USB_BUS_NON_GIANT_ISOC_PROC(udev->bus)->up_msleep = 0; + USB_BUS_NON_GIANT_BULK_PROC(udev->bus)->up_msleep = 0; /* poll USB hardware */ (udev->bus->methods->xfer_poll) (udev->bus); diff --git a/sys/dev/usb/usbdi.h b/sys/dev/usb/usbdi.h index 09b0ca7..ecd5a81 100644 --- a/sys/dev/usb/usbdi.h +++ b/sys/dev/usb/usbdi.h @@ -128,6 +128,8 @@ struct usb_xfer_queue { void (*command) (struct usb_xfer_queue *pq); uint8_t recurse_1:1; uint8_t recurse_2:1; + uint8_t recurse_3:1; + uint8_t reserved:5; }; /* diff --git a/sys/dev/vt/hw/efifb/efifb.c b/sys/dev/vt/hw/efifb/efifb.c index ec029c8..4184f77 100644 --- a/sys/dev/vt/hw/efifb/efifb.c +++ b/sys/dev/vt/hw/efifb/efifb.c @@ -96,7 +96,6 @@ vt_efifb_probe(struct vt_device *vd) static int vt_efifb_init(struct vt_device *vd) { - int depth, d; struct fb_info *info; struct efi_fb *efifb; caddr_t kmdp; @@ -116,16 +115,13 @@ 
vt_efifb_init(struct vt_device *vd) info->fb_height = efifb->fb_height; info->fb_width = efifb->fb_width; - depth = fls(efifb->fb_mask_red); - d = fls(efifb->fb_mask_green); - depth = d > depth ? d : depth; - d = fls(efifb->fb_mask_blue); - depth = d > depth ? d : depth; - d = fls(efifb->fb_mask_reserved); - depth = d > depth ? d : depth; - info->fb_depth = depth; + info->fb_depth = fls(efifb->fb_mask_red | efifb->fb_mask_green | + efifb->fb_mask_blue | efifb->fb_mask_reserved); + /* Round to a multiple of the bits in a byte. */ + info->fb_bpp = (info->fb_depth + NBBY - 1) & ~(NBBY - 1); - info->fb_stride = efifb->fb_stride * (depth / 8); + /* Stride in bytes, not pixels */ + info->fb_stride = efifb->fb_stride * (info->fb_bpp / NBBY); vt_generate_cons_palette(info->fb_cmap, COLOR_FORMAT_RGB, efifb->fb_mask_red, ffs(efifb->fb_mask_red) - 1, @@ -137,16 +133,6 @@ vt_efifb_init(struct vt_device *vd) info->fb_vbase = (intptr_t)pmap_mapdev_attr(info->fb_pbase, info->fb_size, VM_MEMATTR_WRITE_COMBINING); - /* Get pixel storage size. */ - info->fb_bpp = info->fb_stride / info->fb_width * 8; - - /* - * Early FB driver work with static window buffer, so reduce to minimal - * size, buffer or screen. - */ - info->fb_width = MIN(info->fb_width, VT_FB_DEFAULT_WIDTH); - info->fb_height = MIN(info->fb_height, VT_FB_DEFAULT_HEIGHT); - vt_fb_init(vd); return (CN_INTERNAL); diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c index 0b7ebe4..4661f35 100644 --- a/sys/dev/vt/hw/vga/vt_vga.c +++ b/sys/dev/vt/hw/vga/vt_vga.c @@ -883,9 +883,9 @@ vga_bitblt_text_txtmode(struct vt_device *vd, const struct vt_window *vw, /* Convert colors to VGA attributes. */ attr = bg << 4 | fg; - MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 0, + MEM_WRITE1(sc, (row * 80 + col) * 2 + 0, ch); - MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 1, + MEM_WRITE1(sc, (row * 80 + col) * 2 + 1, attr); } } @@ -1226,8 +1226,6 @@ vga_init(struct vt_device *vd) # error "Architecture not yet supported!" #endif - bus_space_map(sc->vga_fb_tag, VGA_MEM_BASE, VGA_MEM_SIZE, 0, - &sc->vga_fb_handle); bus_space_map(sc->vga_reg_tag, VGA_REG_BASE, VGA_REG_SIZE, 0, &sc->vga_reg_handle); @@ -1236,9 +1234,13 @@ vga_init(struct vt_device *vd) vd->vd_flags |= VDF_TEXTMODE; vd->vd_width = 80; vd->vd_height = 25; + bus_space_map(sc->vga_fb_tag, VGA_TXT_BASE, VGA_TXT_SIZE, 0, + &sc->vga_fb_handle); } else { vd->vd_width = VT_VGA_WIDTH; vd->vd_height = VT_VGA_HEIGHT; + bus_space_map(sc->vga_fb_tag, VGA_MEM_BASE, VGA_MEM_SIZE, 0, + &sc->vga_fb_handle); } if (vga_initialize(vd, textmode) != 0) return (CN_DEAD); diff --git a/sys/dev/vt/hw/vga/vt_vga_reg.h b/sys/dev/vt/hw/vga/vt_vga_reg.h index 5bfb8ce..cf33a37 100644 --- a/sys/dev/vt/hw/vga/vt_vga_reg.h +++ b/sys/dev/vt/hw/vga/vt_vga_reg.h @@ -49,6 +49,8 @@ #define VGA_MEM_BASE 0xA0000 #define VGA_MEM_SIZE 0x10000 +#define VGA_TXT_BASE 0xB8000 +#define VGA_TXT_SIZE 0x08000 #define VGA_REG_BASE 0x3c0 #define VGA_REG_SIZE 0x10+0x0c diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c index 99da892..702df42 100644 --- a/sys/dev/vt/vt_core.c +++ b/sys/dev/vt/vt_core.c @@ -264,8 +264,9 @@ vt_update_static(void *dummy) if (!vty_enabled(VTY_VT)) return; if (main_vd->vd_driver != NULL) - printf("VT: running with driver \"%s\".\n", - main_vd->vd_driver->vd_name); + printf("VT(%s): %s %ux%u\n", main_vd->vd_driver->vd_name, + (main_vd->vd_flags & VDF_TEXTMODE) ? 
"text" : "resolution", + main_vd->vd_width, main_vd->vd_height); else printf("VT: init without driver.\n"); diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c index 2f972b8..302c017 100644 --- a/sys/dev/xen/netfront/netfront.c +++ b/sys/dev/xen/netfront/netfront.c @@ -280,8 +280,6 @@ struct netfront_info { struct callout xn_stat_ch; u_long rx_pfn_array[NET_RX_RING_SIZE]; - multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; - mmu_update_t rx_mmu[NET_RX_RING_SIZE]; struct ifmedia sc_media; bool xn_resume; @@ -882,13 +880,6 @@ refill: gnttab_grant_foreign_transfer_ref(ref, otherend_id, pfn); sc->rx_pfn_array[nr_flips] = pfn; - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Remove this page before passing - * back to Xen. - */ - MULTI_update_va_mapping(&sc->rx_mcl[i], - vaddr, 0, 0); - } nr_flips++; } else { gnttab_grant_foreign_access_ref(ref, @@ -918,25 +909,6 @@ refill: reservation.extent_order = 0; reservation.address_bits = 0; reservation.domid = DOMID_SELF; - - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* After all PTEs have been zapped, flush the TLB. */ - sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = - UVMF_TLB_FLUSH|UVMF_ALL; - - /* Give away a batch of pages. */ - sc->rx_mcl[i].op = __HYPERVISOR_memory_op; - sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation; - sc->rx_mcl[i].args[1] = (u_long)&reservation; - /* Zap PTEs and give away pages in one big multicall. */ - (void)HYPERVISOR_multicall(sc->rx_mcl, i+1); - - if (__predict_false(sc->rx_mcl[i].result != i || - HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation) != i)) - panic("%s: unable to reduce memory " - "reservation\n", __func__); - } } else { wmb(); } @@ -961,7 +933,6 @@ xn_rxeof(struct netfront_info *np) struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; RING_IDX i, rp; - multicall_entry_t *mcl; struct mbuf *m; struct mbufq rxq, errq; int err, pages_flipped = 0, work_to_do; @@ -1022,19 +993,6 @@ xn_rxeof(struct netfront_info *np) #ifdef notyet balloon_update_driver_allowance(-pages_flipped); #endif - /* Do all the remapping work, and M->P updates, in one big - * hypercall. - */ - if (!!xen_feature(XENFEAT_auto_translated_physmap)) { - mcl = np->rx_mcl + pages_flipped; - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (u_long)np->rx_mmu; - mcl->args[1] = pages_flipped; - mcl->args[2] = 0; - mcl->args[3] = DOMID_SELF; - (void)HYPERVISOR_multicall(np->rx_mcl, - pages_flipped + 1); - } } mbufq_drain(&errq); @@ -1273,8 +1231,6 @@ xennet_get_responses(struct netfront_info *np, int *pages_flipped_p) { int pages_flipped = *pages_flipped_p; - struct mmu_update *mmu; - struct multicall_entry *mcl; struct netif_rx_response *rx = &rinfo->rx; struct netif_extra_info *extras = rinfo->extras; struct mbuf *m, *m0, *m_prev; @@ -1346,22 +1302,6 @@ xennet_get_responses(struct netfront_info *np, goto next; } - if (!xen_feature( XENFEAT_auto_translated_physmap)) { - /* Remap the page. 
*/ - void *vaddr = mtod(m, void *); - uint32_t pfn; - - mcl = np->rx_mcl + pages_flipped; - mmu = np->rx_mmu + pages_flipped; - - MULTI_update_va_mapping(mcl, (u_long)vaddr, - (((vm_paddr_t)mfn) << PAGE_SHIFT) | PG_RW | - PG_V | PG_M | PG_A, 0); - pfn = (uintptr_t)m->m_ext.ext_arg1; - mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) | - MMU_MACHPHYS_UPDATE; - mmu->val = pfn; - } pages_flipped++; } else { ret = gnttab_end_foreign_access_ref(ref); diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c index d1ade4a..c0e05b9 100644 --- a/sys/fs/nfsserver/nfs_nfsdstate.c +++ b/sys/fs/nfsserver/nfs_nfsdstate.c @@ -401,9 +401,12 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp, } /* For NFSv4.1, mark that we found a confirmed clientid. */ - if ((nd->nd_flag & ND_NFSV41) != 0) + if ((nd->nd_flag & ND_NFSV41) != 0) { + clientidp->lval[0] = clp->lc_clientid.lval[0]; + clientidp->lval[1] = clp->lc_clientid.lval[1]; + confirmp->lval[0] = 0; /* Ignored by client */ confirmp->lval[1] = 1; - else { + } else { /* * id and verifier match, so update the net address info * and get rid of any existing callback authentication diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh index 1cbc32b..521c7a2 100644 --- a/sys/kern/genassym.sh +++ b/sys/kern/genassym.sh @@ -10,7 +10,7 @@ usage() work() { - ${NM:='nm'} "$1" | ${AWK:='awk'} ' + ${NM:='nm'} ${NMFLAGS} "$1" | ${AWK:='awk'} ' / C .*sign$/ { sign = substr($1, length($1) - 3, 4) sub("^0*", "", sign) diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 3310d1d..d84c26f 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -981,6 +981,10 @@ proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, switch (idtype) { case P_ALL: + if (p->p_procdesc != NULL) { + PROC_UNLOCK(p); + return (0); + } break; case P_PID: if (p->p_pid != (pid_t)id) { diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index d09f0b6..432e38a 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -133,6 +133,8 @@ SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, sysctl_kern_timecounter_adjprecision, "I", "Allowed time interval deviation in percents"); +static int tc_chosen; /* Non-zero if a specific tc was chosen via sysctl. */ + static void tc_windup(void); static void cpu_tick_calibrate(int); @@ -1197,10 +1199,13 @@ tc_init(struct timecounter *tc) "quality", CTLFLAG_RD, &(tc->tc_quality), 0, "goodness of time counter"); /* - * Never automatically use a timecounter with negative quality. + * Do not automatically switch if the current tc was specifically + * chosen. Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be - * worse since this timecounter may not be monotonous. + * worse since this timecounter may not be monotonic. */ + if (tc_chosen) + return; if (tc->tc_quality < 0) return; if (tc->tc_quality < timecounter->tc_quality) @@ -1433,9 +1438,12 @@ sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) strlcpy(newname, tc->tc_name, sizeof(newname)); error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); - if (error != 0 || req->newptr == NULL || - strcmp(newname, tc->tc_name) == 0) + if (error != 0 || req->newptr == NULL) return (error); + /* Record that the tc in use now was specifically chosen. 
*/ + tc_chosen = 1; + if (strcmp(newname, tc->tc_name) == 0) + return (0); for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; @@ -1464,7 +1472,7 @@ SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW, "Timecounter hardware selected"); -/* Report or change the active timecounter hardware. */ +/* Report the available timecounter hardware. */ static int sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS) { diff --git a/sys/modules/Makefile b/sys/modules/Makefile index bd52356..21009a9 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -296,6 +296,9 @@ SUBDIR= \ ${_qlxgbe} \ ral \ ${_ralfw} \ + ${_random_fortuna} \ + ${_random_yarrow} \ + ${_random_other} \ rc4 \ ${_rdma} \ ${_rdrand_rng} \ @@ -398,6 +401,9 @@ _autofs= autofs .if exists(${.CURDIR}/../opencrypto) _crypto= crypto _cryptodev= cryptodev +_random_fortuna=random_fortuna +_random_yarrow= random_yarrow +_random_other= random_other .endif .endif diff --git a/sys/modules/am335x_dmtpps/Makefile b/sys/modules/am335x_dmtpps/Makefile new file mode 100644 index 0000000..4d9deac --- /dev/null +++ b/sys/modules/am335x_dmtpps/Makefile @@ -0,0 +1,8 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../arm/ti/am335x + +KMOD= am335x_dmtpps +SRCS= am335x_dmtpps.c + +.include <bsd.kmod.mk> diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile index e97ec38..c74f000 100644 --- a/sys/modules/ctl/Makefile +++ b/sys/modules/ctl/Makefile @@ -11,7 +11,7 @@ SRCS+= ctl_backend_ramdisk.c SRCS+= ctl_cmd_table.c SRCS+= ctl_frontend.c SRCS+= ctl_frontend_cam_sim.c -SRCS+= ctl_frontend_internal.c +SRCS+= ctl_frontend_ioctl.c SRCS+= ctl_frontend_iscsi.c SRCS+= ctl_scsi_all.c SRCS+= ctl_tpc.c diff --git a/sys/modules/gpio/gpiobus/Makefile b/sys/modules/gpio/gpiobus/Makefile index e868cba..2a3f86d 100644 --- a/sys/modules/gpio/gpiobus/Makefile +++ b/sys/modules/gpio/gpiobus/Makefile @@ -32,8 +32,9 @@ .PATH: ${.CURDIR}/../../../dev/gpio/ KMOD= gpiobus -SRCS= gpiobus.c -SRCS+= device_if.h bus_if.h gpio_if.h gpiobus_if.h opt_platform.h +SRCS= gpiobus.c gpioc.c +SRCS+= gpio_if.c gpio_if.h gpiobus_if.c gpiobus_if.h +SRCS+= device_if.h bus_if.h opt_platform.h CFLAGS+= -I. 
-I${.CURDIR}/../../../dev/gpio/ diff --git a/sys/modules/random_fortuna/Makefile b/sys/modules/random_fortuna/Makefile new file mode 100644 index 0000000..d28ae4d --- /dev/null +++ b/sys/modules/random_fortuna/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../dev/random + +KMOD = random_fortuna +SRCS = randomdev.c hash.c fortuna.c +SRCS += opt_param.h bus_if.h device_if.h +SRCS += opt_ddb.h +CFLAGS += -DRANDOM_LOADABLE + +.include <bsd.kmod.mk> diff --git a/sys/modules/random_other/Makefile b/sys/modules/random_other/Makefile new file mode 100644 index 0000000..6ce586b --- /dev/null +++ b/sys/modules/random_other/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../dev/random + +KMOD = random_OTHER +SRCS = randomdev.c hash.c other_algorithm.c +SRCS += opt_param.h bus_if.h device_if.h +SRCS += opt_ddb.h +CFLAGS += -DRANDOM_LOADABLE + +.include <bsd.kmod.mk> diff --git a/sys/modules/random_yarrow/Makefile b/sys/modules/random_yarrow/Makefile new file mode 100644 index 0000000..1750af4 --- /dev/null +++ b/sys/modules/random_yarrow/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../dev/random + +KMOD = random_yarrow +SRCS = randomdev.c hash.c yarrow.c +SRCS += opt_param.h bus_if.h device_if.h +SRCS += opt_ddb.h +CFLAGS += -DRANDOM_LOADABLE + +.include <bsd.kmod.mk> diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c index 64aafb1..1af4ffc 100644 --- a/sys/net/ieee8023ad_lacp.c +++ b/sys/net/ieee8023ad_lacp.c @@ -522,7 +522,7 @@ lacp_port_create(struct lagg_port *lgp) int error; boolean_t active = TRUE; /* XXX should be configurable */ - boolean_t fast = FALSE; /* XXX should be configurable */ + boolean_t fast = FALSE; /* Configurable via ioctl */ link_init_sdl(ifp, (struct sockaddr *)&sdl, IFT_ETHER); sdl.sdl_alen = ETHER_ADDR_LEN; diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h index e814f83..8f0f51a 100644 --- a/sys/net/ieee8023ad_lacp.h +++ b/sys/net/ieee8023ad_lacp.h @@ -251,6 +251,7 @@ struct lacp_softc { u_int32_t lsc_tx_test; } lsc_debug; u_int32_t lsc_strict_mode; + boolean_t lsc_fast_timeout; /* if set, fast timeout */ }; #define LACP_TYPE_ACTORINFO 1 diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index dcd005a..b623493 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -1257,6 +1257,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; + if (lsc->lsc_fast_timeout != 0) + ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; ro->ro_active = sc->sc_active; } else { @@ -1292,6 +1294,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: + case LAGG_OPT_LACP_TIMEOUT: + case -LAGG_OPT_LACP_TIMEOUT: valid = lacp = 1; break; default: @@ -1320,6 +1324,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; + struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; @@ -1342,6 +1347,20 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; + case LAGG_OPT_LACP_TIMEOUT: + LACP_LOCK(lsc); + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) + lp->lp_state |= LACP_STATE_TIMEOUT; + LACP_UNLOCK(lsc); + lsc->lsc_fast_timeout = 1; + break; + case -LAGG_OPT_LACP_TIMEOUT: + LACP_LOCK(lsc); + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) + lp->lp_state &= ~LACP_STATE_TIMEOUT; + LACP_UNLOCK(lsc); + 
lsc->lsc_fast_timeout = 0; + break; } } LAGG_WUNLOCK(sc); diff --git a/sys/net/if_lagg.h b/sys/net/if_lagg.h index a45fa16..bb5ea23 100644 --- a/sys/net/if_lagg.h +++ b/sys/net/if_lagg.h @@ -150,6 +150,7 @@ struct lagg_reqopts { #define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */ #define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ #define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */ +#define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */ u_int ro_count; /* number of ports */ u_int ro_active; /* active port count */ u_int ro_flapping; /* number of flapping */ diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index f90925a..263c197 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -130,6 +130,13 @@ static void arptimer(void *); static void in_arpinput(struct mbuf *); #endif +static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, + struct ifnet *ifp, int bridged, struct llentry *la); +static void arp_update_lle(struct arphdr *ah, struct ifnet *ifp, + struct llentry *la); +static void arp_mark_lle_reachable(struct llentry *la); + + static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, @@ -302,57 +309,37 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, } /* - * Resolve an IP address into an ethernet address. - * On input: - * ifp is the interface we use - * is_gw != if @dst represents gateway to some destination - * m is the mbuf. May be NULL if we don't have a packet. - * dst is the next hop, - * desten is where we want the address. - * flags returns lle entry flags. + * Resolve an IP address into an ethernet address - heavy version. + * Used internally by arpresolve(). + * We have already checked that we can't use existing lle without + * modification so we have to acquire LLE_EXCLUSIVE lle lock. * * On success, desten and flags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. 
*/ -int -arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, +static int +arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags) { - struct llentry *la = 0; - u_int flags = 0; + struct llentry *la = NULL; struct mbuf *curr = NULL; struct mbuf *next = NULL; - int create, error, renew; + int error, renew; if (pflags != NULL) *pflags = 0; - create = 0; - if (m != NULL) { - if (m->m_flags & M_BCAST) { - /* broadcast */ - (void)memcpy(desten, - ifp->if_broadcastaddr, ifp->if_addrlen); - return (0); - } - if (m->m_flags & M_MCAST) { - /* multicast */ - ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); - return (0); - } + if (create == 0) { + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); + IF_AFDATA_RUNLOCK(ifp); } -retry: - IF_AFDATA_RLOCK(ifp); - la = lla_lookup(LLTABLE(ifp), flags, dst); - IF_AFDATA_RUNLOCK(ifp); - if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0) - && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { + if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { create = 1; - flags |= LLE_EXCLUSIVE; IF_AFDATA_WLOCK(ifp); - la = lla_create(LLTABLE(ifp), flags, dst); + la = lla_create(LLTABLE(ifp), 0, dst); IF_AFDATA_WUNLOCK(ifp); } if (la == NULL) { @@ -382,10 +369,7 @@ retry: if (pflags != NULL) *pflags = la->la_flags; - if (flags & LLE_EXCLUSIVE) - LLE_WUNLOCK(la); - else - LLE_RUNLOCK(la); + LLE_WUNLOCK(la); if (renew == 1) arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); @@ -393,20 +377,7 @@ retry: return (0); } - if (la->la_flags & LLE_STATIC) { /* should not happen! */ - log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n", - inet_ntoa(SIN(dst)->sin_addr)); - m_freem(m); - error = EINVAL; - goto done; - } - renew = (la->la_asked == 0 || la->la_expire != time_uptime); - if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) { - flags |= LLE_EXCLUSIVE; - LLE_RUNLOCK(la); - goto retry; - } /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping @@ -431,11 +402,6 @@ retry: } else la->la_hold = m; la->la_numheld++; - if (renew == 0 && (flags & LLE_EXCLUSIVE)) { - flags &= ~LLE_EXCLUSIVE; - LLE_DOWNGRADE(la); - } - } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It @@ -462,15 +428,88 @@ retry: arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (error); } -done: - if (flags & LLE_EXCLUSIVE) - LLE_WUNLOCK(la); - else - LLE_RUNLOCK(la); + + LLE_WUNLOCK(la); return (error); } /* + * Resolve an IP address into an ethernet address. + * On input: + * ifp is the interface we use + * is_gw != 0 if @dst represents a gateway to some destination + * m is the mbuf. May be NULL if we don't have a packet. + * dst is the next hop, + * desten is the storage to put the LL address. + * flags returns lle entry flags. + * + * On success, desten and flags are filled in and the function returns 0; + * If the packet must be held pending resolution, we return EWOULDBLOCK + * On other errors, we return the corresponding error code. + * Note that m_freem() handles NULL.
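+ *
+ * A hypothetical caller sketch (illustration only; "dst" and "edst" are
+ * placeholder names, not part of this change):
+ *
+ *	error = arpresolve(ifp, 0, m, (struct sockaddr *)&dst, edst,
+ *	    &pflags);
+ *	if (error == EWOULDBLOCK)
+ *		return (0);	- m is now queued on the lle, do not free it
+ *	if (error != 0)
+ *		return (error);	- m has already been freed for us
+ *	- otherwise edst holds the link-layer address and m can be sent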
+ */ +int +arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, + const struct sockaddr *dst, u_char *desten, uint32_t *pflags) +{ + struct llentry *la = NULL; + int renew; + + if (pflags != NULL) + *pflags = 0; + + if (m != NULL) { + if (m->m_flags & M_BCAST) { + /* broadcast */ + (void)memcpy(desten, + ifp->if_broadcastaddr, ifp->if_addrlen); + return (0); + } + if (m->m_flags & M_MCAST) { + /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return (0); + } + } + + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), 0, dst); + IF_AFDATA_RUNLOCK(ifp); + + if (la == NULL) + return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags)); + + if ((la->la_flags & LLE_VALID) && + ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { + bcopy(&la->ll_addr, desten, ifp->if_addrlen); + renew = 0; + /* + * If entry has an expiry time and it is approaching, + * see if we need to send an ARP request within this + * arpt_down interval. + */ + if (!(la->la_flags & LLE_STATIC) && + time_uptime + la->la_preempt > la->la_expire) { + renew = 1; + la->la_preempt--; + } + + if (pflags != NULL) + *pflags = la->la_flags; + + LLE_RUNLOCK(la); + + if (renew == 1) + arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); + + return (0); + } + LLE_RUNLOCK(la); + + return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags)); +} + +/* * Common length and type checks are done here, * then the protocol-specific routine is called. */ @@ -576,10 +615,10 @@ in_arpinput(struct mbuf *m) struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; - int op, flags; + int op; int req_len; int bridged = 0, is_bridge = 0; - int carped, create; + int carped; struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; @@ -708,6 +747,16 @@ match: "%s!\n", inet_ntoa(isaddr)); goto drop; } + + if (ifp->if_addrlen != ah->ar_hln) { + LLE_WUNLOCK(la); + ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " + "i/f %d (ignored)\n", ifp->if_addrlen, + (u_char *) ar_sha(ah), ":", ah->ar_hln, + ifp->if_addrlen); + goto drop; + } + /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which @@ -730,100 +779,22 @@ match: sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; - create = (itaddr.s_addr == myaddr.s_addr) ?
1 : 0; - flags = LLE_EXCLUSIVE; - IF_AFDATA_LOCK(ifp); - if (create != 0) - la = lla_create(LLTABLE(ifp), 0, (struct sockaddr *)&sin); - else - la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin); - IF_AFDATA_UNLOCK(ifp); - if (la != NULL) { - /* the following is not an error when doing bridging */ - if (!bridged && la->lle_tbl->llt_ifp != ifp) { - if (log_arp_wrong_iface) - ARP_LOG(LOG_WARNING, "%s is on %s " - "but got reply from %*D on %s\n", - inet_ntoa(isaddr), - la->lle_tbl->llt_ifp->if_xname, - ifp->if_addrlen, (u_char *)ar_sha(ah), ":", - ifp->if_xname); - LLE_WUNLOCK(la); - goto reply; - } - if ((la->la_flags & LLE_VALID) && - bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { - if (la->la_flags & LLE_STATIC) { - LLE_WUNLOCK(la); - if (log_arp_permanent_modify) - ARP_LOG(LOG_ERR, - "%*D attempts to modify " - "permanent entry for %s on %s\n", - ifp->if_addrlen, - (u_char *)ar_sha(ah), ":", - inet_ntoa(isaddr), ifp->if_xname); - goto reply; - } - if (log_arp_movements) { - ARP_LOG(LOG_INFO, "%s moved from %*D " - "to %*D on %s\n", - inet_ntoa(isaddr), - ifp->if_addrlen, - (u_char *)&la->ll_addr, ":", - ifp->if_addrlen, (u_char *)ar_sha(ah), ":", - ifp->if_xname); - } - } - - if (ifp->if_addrlen != ah->ar_hln) { - LLE_WUNLOCK(la); - ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " - "i/f %d (ignored)\n", ifp->if_addrlen, - (u_char *) ar_sha(ah), ":", ah->ar_hln, - ifp->if_addrlen); - goto drop; - } - (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); - la->la_flags |= LLE_VALID; - - EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); - - if (!(la->la_flags & LLE_STATIC)) { - int canceled; - - LLE_ADDREF(la); - la->la_expire = time_uptime + V_arpt_keep; - canceled = callout_reset(&la->lle_timer, - hz * V_arpt_keep, arptimer, la); - if (canceled) - LLE_REMREF(la); - } - la->la_asked = 0; - la->la_preempt = V_arp_maxtries; + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, (struct sockaddr *)&sin); + IF_AFDATA_RUNLOCK(ifp); + if (la != NULL) + arp_check_update_lle(ah, isaddr, ifp, bridged, la); + else if (itaddr.s_addr == myaddr.s_addr) { /* - * The packets are all freed within the call to the output - * routine. - * - * NB: The lock MUST be released before the call to the - * output routine. + * Reply to our address, but no lle exists yet. + * do we really have to create an entry? */ - if (la->la_hold != NULL) { - struct mbuf *m_hold, *m_hold_next; - - m_hold = la->la_hold; - la->la_hold = NULL; - la->la_numheld = 0; - lltable_fill_sa_entry(la, (struct sockaddr *)&sa); - LLE_WUNLOCK(la); - for (; m_hold != NULL; m_hold = m_hold_next) { - m_hold_next = m_hold->m_nextpkt; - m_hold->m_nextpkt = NULL; - /* Avoid confusing lower layers. */ - m_clrprotoflags(m_hold); - (*ifp->if_output)(ifp, m_hold, &sa, NULL); - } - } else - LLE_WUNLOCK(la); + IF_AFDATA_WLOCK(ifp); + la = lla_create(LLTABLE(ifp), 0, (struct sockaddr *)&sin); + arp_update_lle(ah, ifp, la); + IF_AFDATA_WUNLOCK(ifp); + arp_mark_lle_reachable(la); + LLE_WUNLOCK(la); } reply: if (op != ARPOP_REQUEST) @@ -934,6 +905,140 @@ drop: } #endif +/* + * Checks received arp data against existing @la. + * Updates lle state/performs notification if necessary. 
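+ * Called with @la write-locked; the lock is dropped on every return
+ * path before this function returns to the caller.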
+ */ +static void +arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, + int bridged, struct llentry *la) +{ + struct sockaddr sa; + struct mbuf *m_hold, *m_hold_next; + + LLE_WLOCK_ASSERT(la); + + /* the following is not an error when doing bridging */ + if (!bridged && la->lle_tbl->llt_ifp != ifp) { + if (log_arp_wrong_iface) + ARP_LOG(LOG_WARNING, "%s is on %s " + "but got reply from %*D on %s\n", + inet_ntoa(isaddr), + la->lle_tbl->llt_ifp->if_xname, + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + ifp->if_xname); + LLE_WUNLOCK(la); + return; + } + if ((la->la_flags & LLE_VALID) && + bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { + if (la->la_flags & LLE_STATIC) { + LLE_WUNLOCK(la); + if (log_arp_permanent_modify) + ARP_LOG(LOG_ERR, + "%*D attempts to modify " + "permanent entry for %s on %s\n", + ifp->if_addrlen, + (u_char *)ar_sha(ah), ":", + inet_ntoa(isaddr), ifp->if_xname); + return; + } + if (log_arp_movements) { + ARP_LOG(LOG_INFO, "%s moved from %*D " + "to %*D on %s\n", + inet_ntoa(isaddr), + ifp->if_addrlen, + (u_char *)&la->ll_addr, ":", + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + ifp->if_xname); + } + } + + /* Check if something has changed */ + if (memcmp(&la->ll_addr, ar_sha(ah), ifp->if_addrlen) != 0 || + (la->la_flags & LLE_VALID) == 0) { + /* Perform real LLE update */ + /* use afdata WLOCK to update fields */ + LLE_ADDREF(la); + LLE_WUNLOCK(la); + IF_AFDATA_WLOCK(ifp); + LLE_WLOCK(la); + + /* + * Since we dropped the LLE lock, another thread might have + * deleted this lle. Check and return. + */ + if ((la->la_flags & LLE_DELETED) != 0) { + IF_AFDATA_WUNLOCK(ifp); + LLE_FREE_LOCKED(la); + return; + } + + /* Update data */ + arp_update_lle(ah, ifp, la); + + IF_AFDATA_WUNLOCK(ifp); + LLE_REMREF(la); + } + + arp_mark_lle_reachable(la); + + /* + * The packets are all freed within the call to the output + * routine. + * + * NB: The lock MUST be released before the call to the + * output routine. + */ + if (la->la_hold != NULL) { + m_hold = la->la_hold; + la->la_hold = NULL; + la->la_numheld = 0; + lltable_fill_sa_entry(la, &sa); + LLE_WUNLOCK(la); + for (; m_hold != NULL; m_hold = m_hold_next) { + m_hold_next = m_hold->m_nextpkt; + m_hold->m_nextpkt = NULL; + /* Avoid confusing lower layers. */ + m_clrprotoflags(m_hold); + (*ifp->if_output)(ifp, m_hold, &sa, NULL); + } + } else + LLE_WUNLOCK(la); +} + +/* + * Updates @la fields used by fast path code.
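+ *
+ * Both callers invoke this with the interface afdata WLOCK held and
+ * @la write-locked, per the "use afdata WLOCK to update fields"
+ * convention noted above.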
+ */ +static void +arp_update_lle(struct arphdr *ah, struct ifnet *ifp, struct llentry *la) +{ + + memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); + la->la_flags |= LLE_VALID; +} + +static void +arp_mark_lle_reachable(struct llentry *la) +{ + int canceled; + + LLE_WLOCK_ASSERT(la); + + EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); + + if (!(la->la_flags & LLE_STATIC)) { + LLE_ADDREF(la); + la->la_expire = time_uptime + V_arpt_keep; + canceled = callout_reset(&la->lle_timer, + hz * V_arpt_keep, arptimer, la); + if (canceled) + LLE_REMREF(la); + } + la->la_asked = 0; + la->la_preempt = V_arp_maxtries; +} + void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c index 6c8589e..3e72585 100644 --- a/sys/netinet/sctp_timer.c +++ b/sys/netinet/sctp_timer.c @@ -1492,6 +1492,8 @@ sctp_pathmtu_timer(struct sctp_inpcb *inp, #endif if (mtu > next_mtu) { net->mtu = next_mtu; + } else { + net->mtu = mtu; } } } diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c index f1d26cc..7ee525a 100644 --- a/sys/ofed/drivers/infiniband/core/cma.c +++ b/sys/ofed/drivers/infiniband/core/cma.c @@ -72,6 +72,11 @@ static int def_prec2sl = 3; module_param_named(def_prec2sl, def_prec2sl, int, 0644); MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7"); +static int unify_tcp_port_space = 1; +module_param(unify_tcp_port_space, int, 0644); +MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " + "space allocation (default=1)"); + static int debug_level = 0; #define cma_pr(level, priv, format, arg...) \ printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) @@ -957,6 +962,8 @@ static void cma_release_port(struct rdma_id_private *id_priv) kfree(bind_list); } mutex_unlock(&lock); + if (id_priv->sock) + sock_release(id_priv->sock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) @@ -2449,6 +2456,42 @@ static int cma_bind_listen(struct rdma_id_private *id_priv) return ret; } +static int cma_get_tcp_port(struct rdma_id_private *id_priv) +{ + int ret; + int size; + struct socket *sock; + + ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret) + return ret; +#ifdef __linux__ + ret = sock->ops->bind(sock, + (struct sockaddr *) &id_priv->id.route.addr.src_addr, + ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); +#else + ret = -sobind(sock, + (struct sockaddr *)&id_priv->id.route.addr.src_addr, + curthread); +#endif + if (ret) { + sock_release(sock); + return ret; + } + + size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); + ret = sock_getname(sock, + (struct sockaddr *) &id_priv->id.route.addr.src_addr, + &size, 0); + if (ret) { + sock_release(sock); + return ret; + } + + id_priv->sock = sock; + return 0; +} + static int cma_get_port(struct rdma_id_private *id_priv) { struct idr *ps; @@ -2460,6 +2503,11 @@ static int cma_get_port(struct rdma_id_private *id_priv) break; case RDMA_PS_TCP: ps = &tcp_ps; + if (unify_tcp_port_space) { + ret = cma_get_tcp_port(id_priv); + if (ret) + goto out; + } break; case RDMA_PS_UDP: ps = &udp_ps; @@ -2480,7 +2528,7 @@ static int cma_get_port(struct rdma_id_private *id_priv) else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); - +out: return ret; } diff --git a/sys/powerpc/powerpc/trap.c b/sys/powerpc/powerpc/trap.c index 57008e9..d2e5eaa 100644 --- a/sys/powerpc/powerpc/trap.c +++ 
b/sys/powerpc/powerpc/trap.c @@ -413,8 +413,8 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) case EXC_DTMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar); #ifdef AIM - printf(" dsisr = 0x%" PRIxPTR "\n", - frame->cpu.aim.dsisr); + printf(" dsisr = 0x%lx\n", + (u_long)frame->cpu.aim.dsisr); #endif break; case EXC_ISE: @@ -438,7 +438,7 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) frame->cpu.booke.esr); #endif printf(" srr0 = 0x%" PRIxPTR "\n", frame->srr0); - printf(" srr1 = 0x%" PRIxPTR "\n", frame->srr1); + printf(" srr1 = 0x%lx\n", (u_long)frame->srr1); printf(" lr = 0x%" PRIxPTR "\n", frame->lr); printf(" curthread = %p\n", curthread); if (curthread != NULL) diff --git a/sys/sys/ata.h b/sys/sys/ata.h index 863f0e8..272b46a 100644 --- a/sys/sys/ata.h +++ b/sys/sys/ata.h @@ -399,6 +399,7 @@ struct ata_params { #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ diff --git a/sys/sys/nv.h b/sys/sys/nv.h index fa5d138..a985b6d 100644 --- a/sys/sys/nv.h +++ b/sys/sys/nv.h @@ -1,5 +1,6 @@ /*- * Copyright (c) 2009-2013 The FreeBSD Foundation + * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org> * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from @@ -59,6 +60,11 @@ typedef struct nvlist nvlist_t; #define NV_TYPE_NVLIST 5 #define NV_TYPE_DESCRIPTOR 6 #define NV_TYPE_BINARY 7 +#define NV_TYPE_BOOL_ARRAY 8 +#define NV_TYPE_NUMBER_ARRAY 9 +#define NV_TYPE_STRING_ARRAY 10 +#define NV_TYPE_NVLIST_ARRAY 11 +#define NV_TYPE_DESCRIPTOR_ARRAY 12 /* * Perform case-insensitive lookups of provided names. @@ -101,6 +107,11 @@ const char *nvlist_next(const nvlist_t *nvl, int *typep, void **cookiep); const nvlist_t *nvlist_get_parent(const nvlist_t *nvl, void **cookiep); +const nvlist_t *nvlist_get_array_next(const nvlist_t *nvl); +bool nvlist_in_array(const nvlist_t *nvl); + +const nvlist_t *nvlist_get_pararr(const nvlist_t *nvl, void **cookiep); + /* * The nvlist_exists functions check if the given name (optionally of the given * type) exists on nvlist. @@ -114,10 +125,15 @@ bool nvlist_exists_bool(const nvlist_t *nvl, const char *name); bool nvlist_exists_number(const nvlist_t *nvl, const char *name); bool nvlist_exists_string(const nvlist_t *nvl, const char *name); bool nvlist_exists_nvlist(const nvlist_t *nvl, const char *name); +bool nvlist_exists_binary(const nvlist_t *nvl, const char *name); +bool nvlist_exists_bool_array(const nvlist_t *nvl, const char *name); +bool nvlist_exists_number_array(const nvlist_t *nvl, const char *name); +bool nvlist_exists_string_array(const nvlist_t *nvl, const char *name); +bool nvlist_exists_nvlist_array(const nvlist_t *nvl, const char *name); #ifndef _KERNEL bool nvlist_exists_descriptor(const nvlist_t *nvl, const char *name); +bool nvlist_exists_descriptor_array(const nvlist_t *nvl, const char *name); #endif -bool nvlist_exists_binary(const nvlist_t *nvl, const char *name); /* * The nvlist_add functions add the given name/value pair. @@ -134,10 +150,15 @@ void nvlist_add_stringf(nvlist_t *nvl, const char *name, const char *valuefmt, . 
void nvlist_add_stringv(nvlist_t *nvl, const char *name, const char *valuefmt, va_list valueap) __printflike(3, 0); #endif void nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *value); +void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_t size); +void nvlist_add_bool_array(nvlist_t *nvl, const char *name, const bool *value, size_t nitems); +void nvlist_add_number_array(nvlist_t *nvl, const char *name, const uint64_t *value, size_t nitems); +void nvlist_add_string_array(nvlist_t *nvl, const char *name, const char * const *value, size_t nitems); +void nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, const nvlist_t * const *value, size_t nitems); #ifndef _KERNEL void nvlist_add_descriptor(nvlist_t *nvl, const char *name, int value); +void nvlist_add_descriptor_array(nvlist_t *nvl, const char *name, const int *value, size_t nitems); #endif -void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_t size); /* * The nvlist_move functions add the given name/value pair. @@ -146,10 +167,15 @@ void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_ void nvlist_move_string(nvlist_t *nvl, const char *name, char *value); void nvlist_move_nvlist(nvlist_t *nvl, const char *name, nvlist_t *value); +void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size); +void nvlist_move_bool_array(nvlist_t *nvl, const char *name, bool *value, size_t nitems); +void nvlist_move_string_array(nvlist_t *nvl, const char *name, char **value, size_t nitems); +void nvlist_move_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **value, size_t nitems); +void nvlist_move_number_array(nvlist_t *nvl, const char *name, uint64_t *value, size_t nitems); #ifndef _KERNEL void nvlist_move_descriptor(nvlist_t *nvl, const char *name, int value); +void nvlist_move_descriptor_array(nvlist_t *nvl, const char *name, int *value, size_t nitems); #endif -void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size); /* * The nvlist_get functions returns value associated with the given name. @@ -157,14 +183,19 @@ void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t siz * not be freed by the caller. 
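 *
 * A short usage sketch for the new array accessors (illustrative only,
 * not part of this header):
 *
 *	uint64_t in[3] = { 1, 2, 3 };
 *	const uint64_t *out;
 *	size_t n;
 *	nvlist_t *nvl;
 *
 *	nvl = nvlist_create(0);
 *	nvlist_add_number_array(nvl, "xs", in, 3);
 *	out = nvlist_get_number_array(nvl, "xs", &n);
 *	- here n == 3 and out remains owned by nvl -
 *	nvlist_destroy(nvl);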
*/ -bool nvlist_get_bool(const nvlist_t *nvl, const char *name); -uint64_t nvlist_get_number(const nvlist_t *nvl, const char *name); -const char *nvlist_get_string(const nvlist_t *nvl, const char *name); -const nvlist_t *nvlist_get_nvlist(const nvlist_t *nvl, const char *name); +bool nvlist_get_bool(const nvlist_t *nvl, const char *name); +uint64_t nvlist_get_number(const nvlist_t *nvl, const char *name); +const char *nvlist_get_string(const nvlist_t *nvl, const char *name); +const nvlist_t *nvlist_get_nvlist(const nvlist_t *nvl, const char *name); +const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep); +const bool *nvlist_get_bool_array(const nvlist_t *nvl, const char *name, size_t *nitemsp); +const uint64_t *nvlist_get_number_array(const nvlist_t *nvl, const char *name, size_t *nitemsp); +const char * const *nvlist_get_string_array(const nvlist_t *nvl, const char *name, size_t *nitemsp); +const nvlist_t * const *nvlist_get_nvlist_array(const nvlist_t *nvl, const char *name, size_t *nitemsp); #ifndef _KERNEL -int nvlist_get_descriptor(const nvlist_t *nvl, const char *name); +int nvlist_get_descriptor(const nvlist_t *nvl, const char *name); +const int *nvlist_get_descriptor_array(const nvlist_t *nvl, const char *name, size_t *nitemsp); #endif -const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep); /* * The nvlist_take functions returns value associated with the given name and @@ -172,14 +203,19 @@ const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *siz * The caller is responsible for freeing received data. */ -bool nvlist_take_bool(nvlist_t *nvl, const char *name); -uint64_t nvlist_take_number(nvlist_t *nvl, const char *name); -char *nvlist_take_string(nvlist_t *nvl, const char *name); -nvlist_t *nvlist_take_nvlist(nvlist_t *nvl, const char *name); +bool nvlist_take_bool(nvlist_t *nvl, const char *name); +uint64_t nvlist_take_number(nvlist_t *nvl, const char *name); +char *nvlist_take_string(nvlist_t *nvl, const char *name); +nvlist_t *nvlist_take_nvlist(nvlist_t *nvl, const char *name); +void *nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep); +bool *nvlist_take_bool_array(nvlist_t *nvl, const char *name, size_t *nitemsp); +uint64_t *nvlist_take_number_array(nvlist_t *nvl, const char *name, size_t *nitemsp); +char **nvlist_take_string_array(nvlist_t *nvl, const char *name, size_t *nitemsp); +nvlist_t **nvlist_take_nvlist_array(nvlist_t *nvl, const char *name, size_t *nitemsp); #ifndef _KERNEL int nvlist_take_descriptor(nvlist_t *nvl, const char *name); +int *nvlist_take_descriptor_array(nvlist_t *nvl, const char *name, size_t *nitemsp); #endif -void *nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep); /* * The nvlist_free functions removes the given name/value pair from the nvlist @@ -194,10 +230,16 @@ void nvlist_free_bool(nvlist_t *nvl, const char *name); void nvlist_free_number(nvlist_t *nvl, const char *name); void nvlist_free_string(nvlist_t *nvl, const char *name); void nvlist_free_nvlist(nvlist_t *nvl, const char *name); +void nvlist_free_binary(nvlist_t *nvl, const char *name); +void nvlist_free_bool_array(nvlist_t *nvl, const char *name); +void nvlist_free_number_array(nvlist_t *nvl, const char *name); +void nvlist_free_string_array(nvlist_t *nvl, const char *name); +void nvlist_free_nvlist_array(nvlist_t *nvl, const char *name); +void nvlist_free_binary_array(nvlist_t *nvl, const char *name); #ifndef _KERNEL void nvlist_free_descriptor(nvlist_t *nvl, const char 
*name); +void nvlist_free_descriptor_array(nvlist_t *nvl, const char *name); #endif -void nvlist_free_binary(nvlist_t *nvl, const char *name); __END_DECLS diff --git a/sys/sys/random.h b/sys/sys/random.h index 78a9955..92eb80f 100644 --- a/sys/sys/random.h +++ b/sys/sys/random.h @@ -33,10 +33,29 @@ #include <sys/types.h> +#include "opt_random.h" + +#if defined(RANDOM_LOADABLE) && defined(RANDOM_YARROW) +#error "Cannot define both RANDOM_LOADABLE and RANDOM_YARROW" +#endif + struct uio; +#if defined(DEV_RANDOM) u_int read_random(void *, u_int); int read_random_uio(struct uio *, bool); +#else +static __inline int +read_random_uio(struct uio *a __unused, bool b __unused) +{ + return (0); +} +static __inline u_int +read_random(void *a __unused, u_int b __unused) +{ + return (0); +} +#endif /* * Note: if you add or remove members of random_entropy_source, remember to also update the @@ -76,15 +95,15 @@ enum random_entropy_source { #define RANDOM_HARVEST_EVERYTHING_MASK ((1 << (RANDOM_ENVIRONMENTAL_END + 1)) - 1) -#if defined(RANDOM_DUMMY) -#define random_harvest_queue(a, b, c, d) do {} while (0) -#define random_harvest_fast(a, b, c, d) do {} while (0) -#define random_harvest_direct(a, b, c, d) do {} while (0) -#else /* !defined(RANDOM_DUMMY) */ +#if defined(DEV_RANDOM) void random_harvest_queue(const void *, u_int, u_int, enum random_entropy_source); void random_harvest_fast(const void *, u_int, u_int, enum random_entropy_source); void random_harvest_direct(const void *, u_int, u_int, enum random_entropy_source); -#endif /* defined(RANDOM_DUMMY) */ +#else +#define random_harvest_queue(a, b, c, d) do {} while (0) +#define random_harvest_fast(a, b, c, d) do {} while (0) +#define random_harvest_direct(a, b, c, d) do {} while (0) +#endif #endif /* _KERNEL */ diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 112bb2c..26cf9a6 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -78,7 +78,7 @@ struct socket { short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ - struct vnet *so_vnet; /* network stack instance */ + struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. diff --git a/sys/sys/timeet.h b/sys/sys/timeet.h index 728578b..3d50e51 100644 --- a/sys/sys/timeet.h +++ b/sys/sys/timeet.h @@ -53,7 +53,7 @@ typedef int et_deregister_cb_t(struct eventtimer *et, void *arg); struct eventtimer { SLIST_ENTRY(eventtimer) et_all; /* Pointer to the next event timer. */ - char *et_name; + const char *et_name; /* Name of the event timer. */ int et_flags; /* Set of capabilities flags: */ diff --git a/sys/sys/timetc.h b/sys/sys/timetc.h index e68e327..8f00e22 100644 --- a/sys/sys/timetc.h +++ b/sys/sys/timetc.h @@ -49,7 +49,7 @@ struct timecounter { /* This mask should mask off any unimplemented bits. */ uint64_t tc_frequency; /* Frequency of the counter in Hz. */ - char *tc_name; + const char *tc_name; /* Name of the timecounter.
*/ int tc_quality; /* diff --git a/sys/teken/demo/teken_demo.c b/sys/teken/demo/teken_demo.c index 08323dc..42747ce 100644 --- a/sys/teken/demo/teken_demo.c +++ b/sys/teken/demo/teken_demo.c @@ -72,7 +72,7 @@ struct pixel { #define NCOLS 80 #define NROWS 24 -struct pixel buffer[NCOLS][NROWS]; +static struct pixel buffer[NCOLS][NROWS]; static int ptfd; diff --git a/sys/teken/teken.c b/sys/teken/teken.c index 3002a88..8834390 100644 --- a/sys/teken/teken.c +++ b/sys/teken/teken.c @@ -29,12 +29,14 @@ #include <sys/cdefs.h> #if defined(__FreeBSD__) && defined(_KERNEL) #include <sys/param.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/systm.h> #define teken_assert(x) MPASS(x) #else /* !(__FreeBSD__ && _KERNEL) */ #include <sys/types.h> #include <assert.h> +#include <limits.h> #include <stdint.h> #include <stdio.h> #include <string.h> @@ -405,18 +407,24 @@ teken_state_numbers(teken_t *t, teken_char_t c) teken_assert(t->t_curnum < T_NUMSIZE); if (c >= '0' && c <= '9') { - /* - * Don't do math with the default value of 1 when a - * custom number is inserted. - */ if (t->t_stateflags & TS_FIRSTDIGIT) { + /* First digit. */ t->t_stateflags &= ~TS_FIRSTDIGIT; - t->t_nums[t->t_curnum] = 0; - } else { - t->t_nums[t->t_curnum] *= 10; + t->t_nums[t->t_curnum] = c - '0'; + } else if (t->t_nums[t->t_curnum] < UINT_MAX / 100) { + /* + * There is no need to continue parsing input + * once the value exceeds the size of the + * terminal. It would only allow for integer + * overflows when performing arithmetic on the + * cursor position. + * + * Ignore any further digits if the value is + * already UINT_MAX / 100. + */ + t->t_nums[t->t_curnum] = + t->t_nums[t->t_curnum] * 10 + c - '0'; } - - t->t_nums[t->t_curnum] += c - '0'; return (1); } else if (c == ';') { if (t->t_stateflags & TS_FIRSTDIGIT) diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 91affa0..13916c0 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -566,11 +566,6 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); - if (vm_page_count_severe()) { - vm_page_lock(mt); - vm_page_try_to_cache(mt); - vm_page_unlock(mt); - } } } if (prunlen != NULL) diff --git a/sys/x86/iommu/intel_idpgtbl.c b/sys/x86/iommu/intel_idpgtbl.c index 405976b..d52e8d4 100644 --- a/sys/x86/iommu/intel_idpgtbl.c +++ b/sys/x86/iommu/intel_idpgtbl.c @@ -374,8 +374,9 @@ retry: KASSERT(lvl > 0, ("lost root page table page %p", domain)); /* - * Page table page does not exists, allocate - * it and create pte in the up level. + * Page table page does not exist, allocate + * it and create a pte in the preceding page level + * to reference the allocated page table page.
*/ m = dmar_pgalloc(domain->pgtbl_obj, idx, flags | DMAR_PGF_ZERO); diff --git a/sys/x86/x86/busdma_bounce.c b/sys/x86/x86/busdma_bounce.c index dcdeafa..48c500f 100644 --- a/sys/x86/x86/busdma_bounce.c +++ b/sys/x86/x86/busdma_bounce.c @@ -79,7 +79,8 @@ struct bounce_page { vm_offset_t vaddr; /* kva of bounce buffer */ bus_addr_t busaddr; /* Physical address */ vm_offset_t datavaddr; /* kva of client data */ - bus_addr_t dataaddr; /* client physical address */ + vm_page_t datapage; /* physical page of client data */ + vm_offset_t dataoffs; /* page offset of client data */ bus_size_t datacount; /* client data count */ STAILQ_ENTRY(bounce_page) links; }; @@ -658,7 +659,7 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf, { bus_size_t sgsize, max_sgsize; bus_addr_t curaddr; - vm_offset_t vaddr; + vm_offset_t kvaddr, vaddr; int error; if (map == NULL) @@ -681,10 +682,13 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf, /* * Get the physical address for this segment. */ - if (pmap == kernel_pmap) + if (pmap == kernel_pmap) { curaddr = pmap_kextract(vaddr); - else + kvaddr = vaddr; + } else { curaddr = pmap_extract(pmap, vaddr); + kvaddr = 0; + } /* * Compute the segment size, and adjust counts. @@ -696,7 +700,7 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf, bus_dma_run_filter(&dmat->common, curaddr)) { sgsize = roundup2(sgsize, dmat->common.alignment); sgsize = MIN(sgsize, max_sgsize); - curaddr = add_bounce_page(dmat, map, vaddr, curaddr, + curaddr = add_bounce_page(dmat, map, kvaddr, curaddr, sgsize); } else { sgsize = MIN(sgsize, max_sgsize); @@ -757,48 +761,56 @@ bounce_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dmasync_op_t op) { struct bounce_page *bpage; + vm_offset_t datavaddr, tempvaddr; - if ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) { - /* - * Handle data bouncing. We might also - * want to add support for invalidating - * the caches on broken hardware - */ - CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x " - "performing bounce", __func__, dmat, - dmat->common.flags, op); - - if ((op & BUS_DMASYNC_PREWRITE) != 0) { - while (bpage != NULL) { - if (bpage->datavaddr != 0) { - bcopy((void *)bpage->datavaddr, - (void *)bpage->vaddr, - bpage->datacount); - } else { - physcopyout(bpage->dataaddr, - (void *)bpage->vaddr, - bpage->datacount); - } - bpage = STAILQ_NEXT(bpage, links); + if ((bpage = STAILQ_FIRST(&map->bpages)) == NULL) + return; + + /* + * Handle data bouncing. We might also want to add support for + * invalidating the caches on broken hardware. 
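+ *
+ * Note that the loops below no longer assume the client buffer has a
+ * KVA: when datavaddr is 0 the data page is mapped just long enough
+ * for the copy via pmap_quick_enter_page() and then torn down with
+ * pmap_quick_remove_page(), so unmapped buffers can bounce as well.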
+ */ + CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x " + "performing bounce", __func__, dmat, dmat->common.flags, op); + + if ((op & BUS_DMASYNC_PREWRITE) != 0) { + while (bpage != NULL) { + tempvaddr = 0; + datavaddr = bpage->datavaddr; + if (datavaddr == 0) { + tempvaddr = + pmap_quick_enter_page(bpage->datapage); + datavaddr = tempvaddr | bpage->dataoffs; } - dmat->bounce_zone->total_bounced++; + + bcopy((void *)datavaddr, + (void *)bpage->vaddr, bpage->datacount); + + if (tempvaddr != 0) + pmap_quick_remove_page(tempvaddr); + bpage = STAILQ_NEXT(bpage, links); } + dmat->bounce_zone->total_bounced++; + } - if ((op & BUS_DMASYNC_POSTREAD) != 0) { - while (bpage != NULL) { - if (bpage->datavaddr != 0) { - bcopy((void *)bpage->vaddr, - (void *)bpage->datavaddr, - bpage->datacount); - } else { - physcopyin((void *)bpage->vaddr, - bpage->dataaddr, - bpage->datacount); - } - bpage = STAILQ_NEXT(bpage, links); + if ((op & BUS_DMASYNC_POSTREAD) != 0) { + while (bpage != NULL) { + tempvaddr = 0; + datavaddr = bpage->datavaddr; + if (datavaddr == 0) { + tempvaddr = + pmap_quick_enter_page(bpage->datapage); + datavaddr = tempvaddr | bpage->dataoffs; } - dmat->bounce_zone->total_bounced++; + + bcopy((void *)bpage->vaddr, + (void *)datavaddr, bpage->datacount); + + if (tempvaddr != 0) + pmap_quick_remove_page(tempvaddr); + bpage = STAILQ_NEXT(bpage, links); } + dmat->bounce_zone->total_bounced++; } } @@ -993,7 +1005,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr, bpage->busaddr |= addr & PAGE_MASK; } bpage->datavaddr = vaddr; - bpage->dataaddr = addr; + bpage->datapage = PHYS_TO_VM_PAGE(addr & ~PAGE_MASK); + bpage->dataoffs = addr & PAGE_MASK; bpage->datacount = size; STAILQ_INSERT_TAIL(&(map->bpages), bpage, links); return (bpage->busaddr); diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h index d0a44ae..9e82124 100644 --- a/sys/xen/gnttab.h +++ b/sys/xen/gnttab.h @@ -126,10 +126,8 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, vm_paddr_t addr, { if (flags & GNTMAP_contains_pte) map->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) - map->host_addr = vtophys(addr); else - map->host_addr = addr; + map->host_addr = vtophys(addr); map->flags = flags; map->ref = ref; @@ -142,10 +140,8 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, vm_paddr_t addr, { if (flags & GNTMAP_contains_pte) unmap->host_addr = addr; - else if (xen_feature(XENFEAT_auto_translated_physmap)) - unmap->host_addr = vtophys(addr); else - unmap->host_addr = addr; + unmap->host_addr = vtophys(addr); unmap->handle = handle; unmap->dev_bus_addr = 0; @@ -155,13 +151,8 @@ static inline void gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr, vm_paddr_t new_addr, grant_handle_t handle) { - if (xen_feature(XENFEAT_auto_translated_physmap)) { - unmap->host_addr = vtophys(addr); - unmap->new_addr = vtophys(new_addr); - } else { - unmap->host_addr = addr; - unmap->new_addr = new_addr; - } + unmap->host_addr = vtophys(addr); + unmap->new_addr = vtophys(new_addr); unmap->handle = handle; }
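For context, a grant-mapping caller would use the simplified helpers above roughly as follows. This is a hedged sketch, not code from this change: "kva", "ref", and "domid" are placeholder inputs, and it assumes the conventional HYPERVISOR_grant_table_op() entry point:

	struct gnttab_map_grant_ref op;

	/* kva is a page-aligned kernel VA; ref/domid come from the frontend. */
	gnttab_set_map_op(&op, kva, GNTMAP_host_map, ref, domid);
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) != 0 ||
	    op.status != GNTST_okay)
		return (ENXIO);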