Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/amd64/pmap.c  26
-rw-r--r--  sys/arm/arm/cpufunc.c  1
-rw-r--r--  sys/arm/arm/identcpu.c  2
-rw-r--r--  sys/arm/arm/pmap-v6-new.c  26
-rw-r--r--  sys/arm/arm/stdatomic.c  10
-rw-r--r--  sys/arm/broadcom/bcm2835/bcm2835_systimer.c  6
-rw-r--r--  sys/arm/conf/BEAGLEBONE  3
-rw-r--r--  sys/arm/include/armreg.h  1
-rw-r--r--  sys/arm/ti/am335x/am335x_dmtpps.c  549
-rw-r--r--  sys/arm/ti/am335x/files.am335x  1
-rw-r--r--  sys/arm/versatile/sp804.c  6
-rw-r--r--  sys/arm64/arm64/bus_machdep.c  32
-rw-r--r--  sys/arm64/arm64/bus_space_asm.S  164
-rw-r--r--  sys/arm64/arm64/exception.S  2
-rw-r--r--  sys/arm64/arm64/trap.c  39
-rw-r--r--  sys/boot/kshim/bsd_kernel.h  3
-rw-r--r--  sys/boot/uboot/fdt/uboot_fdt.c  7
-rw-r--r--  sys/cam/ctl/README.ctl.txt  10
-rw-r--r--  sys/cam/ctl/ctl.c  536
-rw-r--r--  sys/cam/ctl/ctl.h  2
-rw-r--r--  sys/cam/ctl/ctl_backend.c  1
-rw-r--r--  sys/cam/ctl/ctl_backend_block.c  68
-rw-r--r--  sys/cam/ctl/ctl_backend_ramdisk.c  1
-rw-r--r--  sys/cam/ctl/ctl_cmd_table.c  1
-rw-r--r--  sys/cam/ctl/ctl_error.c  1
-rw-r--r--  sys/cam/ctl/ctl_frontend.c  1
-rw-r--r--  sys/cam/ctl/ctl_frontend_cam_sim.c  1
-rw-r--r--  sys/cam/ctl/ctl_frontend_internal.c  1612
-rw-r--r--  sys/cam/ctl/ctl_frontend_internal.h  154
-rw-r--r--  sys/cam/ctl/ctl_frontend_ioctl.c  470
-rw-r--r--  sys/cam/ctl/ctl_frontend_iscsi.c  1
-rw-r--r--  sys/cam/ctl/ctl_ioctl.h  22
-rw-r--r--  sys/cam/ctl/ctl_private.h  25
-rw-r--r--  sys/cam/ctl/ctl_tpc.c  1
-rw-r--r--  sys/cam/ctl/ctl_tpc_local.c  1
-rw-r--r--  sys/cddl/compat/opensolaris/sys/nvpair.h  187
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c  54
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h  18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/Makefile.files  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  2125
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c  111
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  266
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c  44
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c  822
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c  28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c  18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c  216
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c  24
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c  9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c  366
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c  4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h  8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h  54
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h  22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h  106
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  23
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h  2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h  41
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c  13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c  15
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c  1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c  4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c  141
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c  6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c  2
-rw-r--r--  sys/compat/cloudabi/cloudabi_proc.c  5
-rw-r--r--  sys/conf/Makefile.arm  4
-rw-r--r--  sys/conf/NOTES  7
-rw-r--r--  sys/conf/files  24
-rw-r--r--  sys/conf/files.amd64  2
-rw-r--r--  sys/conf/kern.post.mk  28
-rw-r--r--  sys/conf/kern.pre.mk  29
-rw-r--r--  sys/conf/options  12
-rw-r--r--  sys/contrib/libnv/nv_impl.h  38
-rw-r--r--  sys/contrib/libnv/nvlist.c  643
-rw-r--r--  sys/contrib/libnv/nvlist_impl.h  1
-rw-r--r--  sys/contrib/libnv/nvpair.c  972
-rw-r--r--  sys/contrib/libnv/nvpair_impl.h  20
-rw-r--r--  sys/dev/ata/ata-all.c  30
-rw-r--r--  sys/dev/ata/ata-all.h  9
-rw-r--r--  sys/dev/ath/if_ath.c  59
-rw-r--r--  sys/dev/ath/if_ath_keycache.c  6
-rw-r--r--  sys/dev/ath/if_ath_rx.c  2
-rw-r--r--  sys/dev/ath/if_ath_tdma.c  2
-rw-r--r--  sys/dev/ath/if_ath_tx.c  12
-rw-r--r--  sys/dev/bxe/ecore_hsi.h  6
-rw-r--r--  sys/dev/e1000/e1000_80003es2lan.c  2
-rw-r--r--  sys/dev/e1000/e1000_80003es2lan.h  2
-rw-r--r--  sys/dev/e1000/e1000_82540.c  2
-rw-r--r--  sys/dev/e1000/e1000_82541.c  2
-rw-r--r--  sys/dev/e1000/e1000_82541.h  2
-rw-r--r--  sys/dev/e1000/e1000_82542.c  2
-rw-r--r--  sys/dev/e1000/e1000_82543.c  2
-rw-r--r--  sys/dev/e1000/e1000_82543.h  2
-rw-r--r--  sys/dev/e1000/e1000_82571.c  2
-rw-r--r--  sys/dev/e1000/e1000_82571.h  2
-rw-r--r--  sys/dev/e1000/e1000_82575.c  2
-rw-r--r--  sys/dev/e1000/e1000_82575.h  2
-rw-r--r--  sys/dev/e1000/e1000_api.c  2
-rw-r--r--  sys/dev/e1000/e1000_api.h  2
-rw-r--r--  sys/dev/e1000/e1000_defines.h  2
-rw-r--r--  sys/dev/e1000/e1000_hw.h  2
-rw-r--r--  sys/dev/e1000/e1000_i210.c  2
-rw-r--r--  sys/dev/e1000/e1000_i210.h  2
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.c  2
-rw-r--r--  sys/dev/e1000/e1000_ich8lan.h  2
-rw-r--r--  sys/dev/e1000/e1000_mac.c  2
-rw-r--r--  sys/dev/e1000/e1000_mac.h  2
-rw-r--r--  sys/dev/e1000/e1000_manage.c  2
-rw-r--r--  sys/dev/e1000/e1000_manage.h  2
-rw-r--r--  sys/dev/e1000/e1000_mbx.c  2
-rw-r--r--  sys/dev/e1000/e1000_mbx.h  2
-rw-r--r--  sys/dev/e1000/e1000_nvm.c  2
-rw-r--r--  sys/dev/e1000/e1000_nvm.h  2
-rw-r--r--  sys/dev/e1000/e1000_osdep.c  2
-rw-r--r--  sys/dev/e1000/e1000_osdep.h  2
-rw-r--r--  sys/dev/e1000/e1000_phy.c  2
-rw-r--r--  sys/dev/e1000/e1000_phy.h  2
-rw-r--r--  sys/dev/e1000/e1000_regs.h  2
-rw-r--r--  sys/dev/e1000/e1000_vf.c  2
-rw-r--r--  sys/dev/e1000/e1000_vf.h  2
-rw-r--r--  sys/dev/e1000/if_em.c  151
-rw-r--r--  sys/dev/e1000/if_em.h  2
-rw-r--r--  sys/dev/e1000/if_igb.c  2
-rw-r--r--  sys/dev/e1000/if_igb.h  2
-rw-r--r--  sys/dev/e1000/if_lem.c  8
-rw-r--r--  sys/dev/e1000/if_lem.h  5
-rw-r--r--  sys/dev/gpio/gpiobus.c  18
-rw-r--r--  sys/dev/gpio/gpioled.c  1
-rw-r--r--  sys/dev/md/md.c  36
-rw-r--r--  sys/dev/random/fortuna.c  45
-rw-r--r--  sys/dev/random/other_algorithm.c  209
-rw-r--r--  sys/dev/random/other_algorithm.h  62
-rw-r--r--  sys/dev/random/random_harvestq.c  73
-rw-r--r--  sys/dev/random/random_harvestq.h  2
-rw-r--r--  sys/dev/random/random_infra.c  128
-rw-r--r--  sys/dev/random/randomdev.c  181
-rw-r--r--  sys/dev/random/randomdev.h  38
-rw-r--r--  sys/dev/random/randomdev_none.c  72
-rw-r--r--  sys/dev/random/unit_test.c  32
-rw-r--r--  sys/dev/random/yarrow.c  63
-rw-r--r--  sys/dev/usb/controller/dwc_otg.c  492
-rw-r--r--  sys/dev/usb/controller/dwc_otg.h  5
-rw-r--r--  sys/dev/usb/controller/usb_controller.c  14
-rw-r--r--  sys/dev/usb/usb_bus.h  19
-rw-r--r--  sys/dev/usb/usb_device.c  2
-rw-r--r--  sys/dev/usb/usb_hub.c  4
-rw-r--r--  sys/dev/usb/usb_pf.c  6
-rw-r--r--  sys/dev/usb/usb_process.h  1
-rw-r--r--  sys/dev/usb/usb_transfer.c  63
-rw-r--r--  sys/dev/usb/usbdi.h  2
-rw-r--r--  sys/dev/vt/hw/efifb/efifb.c  26
-rw-r--r--  sys/dev/vt/hw/vga/vt_vga.c  10
-rw-r--r--  sys/dev/vt/hw/vga/vt_vga_reg.h  2
-rw-r--r--  sys/dev/vt/vt_core.c  5
-rw-r--r--  sys/dev/xen/netfront/netfront.c  60
-rw-r--r--  sys/fs/nfsserver/nfs_nfsdstate.c  7
-rw-r--r--  sys/kern/genassym.sh  2
-rw-r--r--  sys/kern/kern_exit.c  4
-rw-r--r--  sys/kern/kern_tc.c  18
-rw-r--r--  sys/modules/Makefile  6
-rw-r--r--  sys/modules/am335x_dmtpps/Makefile  8
-rw-r--r--  sys/modules/ctl/Makefile  2
-rw-r--r--  sys/modules/gpio/gpiobus/Makefile  5
-rw-r--r--  sys/modules/random_fortuna/Makefile  11
-rw-r--r--  sys/modules/random_other/Makefile  11
-rw-r--r--  sys/modules/random_yarrow/Makefile  11
-rw-r--r--  sys/net/ieee8023ad_lacp.c  2
-rw-r--r--  sys/net/ieee8023ad_lacp.h  1
-rw-r--r--  sys/net/if_lagg.c  19
-rw-r--r--  sys/net/if_lagg.h  1
-rw-r--r--  sys/netinet/if_ether.c  415
-rw-r--r--  sys/netinet/sctp_timer.c  2
-rw-r--r--  sys/ofed/drivers/infiniband/core/cma.c  50
-rw-r--r--  sys/powerpc/powerpc/trap.c  6
-rw-r--r--  sys/sys/ata.h  1
-rw-r--r--  sys/sys/nv.h  72
-rw-r--r--  sys/sys/random.h  31
-rw-r--r--  sys/sys/socketvar.h  2
-rw-r--r--  sys/sys/timeet.h  2
-rw-r--r--  sys/sys/timetc.h  2
-rw-r--r--  sys/teken/demo/teken_demo.c  2
-rw-r--r--  sys/teken/teken.c  26
-rw-r--r--  sys/vm/vm_pageout.c  5
-rw-r--r--  sys/x86/iommu/intel_idpgtbl.c  5
-rw-r--r--  sys/x86/x86/busdma_bounce.c  97
-rw-r--r--  sys/xen/gnttab.h  17
199 files changed, 8063 insertions, 5176 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 1e64fc8..41dea8b 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -390,6 +390,8 @@ static struct md_page *pv_table;
*/
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
+static vm_offset_t qframe = 0;
+static struct mtx qframe_mtx;
static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
@@ -1031,7 +1033,7 @@ pmap_init(void)
struct pmap_preinit_mapping *ppim;
vm_page_t mpte;
vm_size_t s;
- int i, pv_npg;
+ int error, i, pv_npg;
/*
* Initialize the vm page array entries for the kernel pmap's
@@ -1112,6 +1114,12 @@ pmap_init(void)
printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
ppim->pa, ppim->va, ppim->sz, ppim->mode);
}
+
+ mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
+ error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
+ (vmem_addr_t *)&qframe);
+ if (error != 0)
+ panic("qframe allocation failed");
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
@@ -7019,13 +7027,27 @@ pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
+ vm_paddr_t paddr;
- return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
+ paddr = VM_PAGE_TO_PHYS(m);
+ if (paddr < dmaplimit)
+ return (PHYS_TO_DMAP(paddr));
+ mtx_lock_spin(&qframe_mtx);
+ KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
+ pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
+ X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
+ return (qframe);
}
void
pmap_quick_remove_page(vm_offset_t addr)
{
+
+ if (addr != qframe)
+ return;
+ pte_store(vtopte(qframe), 0);
+ invlpg(qframe);
+ mtx_unlock_spin(&qframe_mtx);
}
#include "opt_ddb.h"
diff --git a/sys/arm/arm/cpufunc.c b/sys/arm/arm/cpufunc.c
index 0b589ed..fea0581 100644
--- a/sys/arm/arm/cpufunc.c
+++ b/sys/arm/arm/cpufunc.c
@@ -904,6 +904,7 @@ set_cpufuncs()
cputype == CPU_ID_CORTEXA9R1 ||
cputype == CPU_ID_CORTEXA9R2 ||
cputype == CPU_ID_CORTEXA9R3 ||
+ cputype == CPU_ID_CORTEXA9R4 ||
cputype == CPU_ID_CORTEXA12R0 ||
cputype == CPU_ID_CORTEXA15R0 ||
cputype == CPU_ID_CORTEXA15R1 ||
diff --git a/sys/arm/arm/identcpu.c b/sys/arm/arm/identcpu.c
index 75bf08c..be1393b1 100644
--- a/sys/arm/arm/identcpu.c
+++ b/sys/arm/arm/identcpu.c
@@ -185,6 +185,8 @@ const struct cpuidtab cpuids[] = {
generic_steppings },
{ CPU_ID_CORTEXA9R3, CPU_CLASS_CORTEXA, "Cortex A9-r3",
generic_steppings },
+ { CPU_ID_CORTEXA9R4, CPU_CLASS_CORTEXA, "Cortex A9-r4",
+ generic_steppings },
{ CPU_ID_CORTEXA12R0, CPU_CLASS_CORTEXA, "Cortex A12-r0",
generic_steppings },
{ CPU_ID_CORTEXA15R0, CPU_CLASS_CORTEXA, "Cortex A15-r0",
diff --git a/sys/arm/arm/pmap-v6-new.c b/sys/arm/arm/pmap-v6-new.c
index b18648f..864e05c 100644
--- a/sys/arm/arm/pmap-v6-new.c
+++ b/sys/arm/arm/pmap-v6-new.c
@@ -1166,10 +1166,9 @@ pmap_init_qpages(void)
pc = pcpu_find(i);
pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
if (pc->pc_qmap_addr == 0)
- panic("pmap_init_qpages: unable to allocate KVA");
+ panic("%s: unable to allocate KVA", __func__);
}
}
-
SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL);
/*
@@ -5728,18 +5727,17 @@ pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{
- pt2_entry_t *pte;
- vm_offset_t qmap_addr;
+ pt2_entry_t *pte2p;
+ vm_offset_t qmap_addr;
critical_enter();
-
qmap_addr = PCPU_GET(qmap_addr);
- pte = pt2map_entry(qmap_addr);
+ pte2p = pt2map_entry(qmap_addr);
- KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
+ KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__));
- pte2_store(pte, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m),
- PTE2_AP_KRW, pmap_page_get_memattr(m)));
+ pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
+ pmap_page_get_memattr(m)));
tlb_flush_local(qmap_addr);
return (qmap_addr);
@@ -5748,16 +5746,16 @@ pmap_quick_enter_page(vm_page_t m)
void
pmap_quick_remove_page(vm_offset_t addr)
{
- pt2_entry_t *pte;
+ pt2_entry_t *pte2p;
vm_offset_t qmap_addr;
qmap_addr = PCPU_GET(qmap_addr);
- pte = pt2map_entry(qmap_addr);
+ pte2p = pt2map_entry(qmap_addr);
- KASSERT(addr == qmap_addr, ("pmap_quick_remove_page: invalid address"));
- KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
+ KASSERT(addr == qmap_addr, ("%s: invalid address", __func__));
+ KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__));
- pte2_clear(pte);
+ pte2_clear(pte2p);
critical_exit();
}
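
One caveat worth noting: both implementations back this KPI with a single mapping slot (a global qframe on amd64 for non-DMAP pages, a per-CPU qmap page here on ARM), so the calls cannot nest; a second enter would trip the busy-PTE KASSERTs or self-deadlock on the qframe spin mutex. Copying between two arbitrary pages therefore has to stage through a buffer rather than hold two quick mappings at once. A sketch under that assumption, with copy_page_staged() purely illustrative:

static void
copy_page_staged(vm_page_t src, vm_page_t dst, void *stage)
{
	vm_offset_t va;

	/* 'stage' must be a PAGE_SIZE buffer supplied by the caller. */
	va = pmap_quick_enter_page(src);
	memcpy(stage, (void *)va, PAGE_SIZE);
	pmap_quick_remove_page(va);

	va = pmap_quick_enter_page(dst);
	memcpy((void *)va, stage, PAGE_SIZE);
	pmap_quick_remove_page(va);
}
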
diff --git a/sys/arm/arm/stdatomic.c b/sys/arm/arm/stdatomic.c
index 211f26a..3c0b997 100644
--- a/sys/arm/arm/stdatomic.c
+++ b/sys/arm/arm/stdatomic.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/acle-compat.h>
+#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/sysarch.h>
@@ -67,19 +68,12 @@ do_sync(void)
__asm volatile ("" : : : "memory");
}
-#elif __ARM_ARCH >= 7
-static inline void
-do_sync(void)
-{
-
- __asm volatile ("dmb" : : : "memory");
-}
#elif __ARM_ARCH >= 6
static inline void
do_sync(void)
{
- __asm volatile ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
+ dmb();
}
#endif
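
The ARMv7 branch can be folded into the ARMv6 one because dmb() from <machine/atomic.h> already selects the right barrier for the architecture level: the dmb instruction on ARMv7, and the CP15 c7/c10/5 write (the same instruction the deleted inline asm issued by hand) on ARMv6. Roughly what the header provides, paraphrased here as an assumption rather than copied from the tree:

#if __ARM_ARCH >= 7
#define	dmb()	__asm __volatile("dmb" : : : "memory")
#elif __ARM_ARCH >= 6
#define	dmb()	__asm __volatile("mcr p15, 0, %0, c7, c10, 5" \
		    : : "r" (0) : "memory")
#endif
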
diff --git a/sys/arm/broadcom/bcm2835/bcm2835_systimer.c b/sys/arm/broadcom/bcm2835/bcm2835_systimer.c
index 93bf676..731c7d0 100644
--- a/sys/arm/broadcom/bcm2835/bcm2835_systimer.c
+++ b/sys/arm/broadcom/bcm2835/bcm2835_systimer.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#define BCM2835_NUM_TIMERS 4
#define DEFAULT_TIMER 3
+#define DEFAULT_TIMER_NAME "BCM2835-3"
#define DEFAULT_FREQUENCY 1000000
#define MIN_PERIOD 5LLU
@@ -101,7 +102,7 @@ static struct bcm_systimer_softc *bcm_systimer_sc = NULL;
static unsigned bcm_systimer_tc_get_timecount(struct timecounter *);
static struct timecounter bcm_systimer_tc = {
- .tc_name = "BCM2835 Timecounter",
+ .tc_name = DEFAULT_TIMER_NAME,
.tc_get_timecount = bcm_systimer_tc_get_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
@@ -238,8 +239,7 @@ bcm_systimer_attach(device_t dev)
sc->st[DEFAULT_TIMER].index = DEFAULT_TIMER;
sc->st[DEFAULT_TIMER].enabled = 0;
- sc->st[DEFAULT_TIMER].et.et_name = malloc(64, M_DEVBUF, M_NOWAIT | M_ZERO);
- sprintf(sc->st[DEFAULT_TIMER].et.et_name, "BCM2835 Event Timer %d", DEFAULT_TIMER);
+ sc->st[DEFAULT_TIMER].et.et_name = DEFAULT_TIMER_NAME;
sc->st[DEFAULT_TIMER].et.et_flags = ET_FLAGS_ONESHOT;
sc->st[DEFAULT_TIMER].et.et_quality = 1000;
sc->st[DEFAULT_TIMER].et.et_frequency = sc->sysclk_freq;
diff --git a/sys/arm/conf/BEAGLEBONE b/sys/arm/conf/BEAGLEBONE
index a0ca1b6..12b8290 100644
--- a/sys/arm/conf/BEAGLEBONE
+++ b/sys/arm/conf/BEAGLEBONE
@@ -26,7 +26,7 @@ ident BEAGLEBONE
include "std.armv6"
include "../ti/am335x/std.am335x"
-makeoptions MODULES_EXTRA="dtb/am335x"
+makeoptions MODULES_EXTRA="dtb/am335x am335x_dmtpps"
# DTrace support
options KDTRACE_HOOKS # Kernel DTrace hooks
@@ -77,6 +77,7 @@ device ti_i2c
device am335x_pmic # AM335x Power Management IC (TPC65217)
device am335x_rtc # RTC support (power management only)
+#device am335x_dmtpps		# Pulse Per Second capture driver
# Console and misc
device uart
diff --git a/sys/arm/include/armreg.h b/sys/arm/include/armreg.h
index 9358703..a300ddf 100644
--- a/sys/arm/include/armreg.h
+++ b/sys/arm/include/armreg.h
@@ -133,6 +133,7 @@
#define CPU_ID_CORTEXA9R1 0x411fc090
#define CPU_ID_CORTEXA9R2 0x412fc090
#define CPU_ID_CORTEXA9R3 0x413fc090
+#define CPU_ID_CORTEXA9R4 0x414fc090
#define CPU_ID_CORTEXA12R0 0x410fc0d0
#define CPU_ID_CORTEXA15R0 0x410fc0f0
#define CPU_ID_CORTEXA15R1 0x411fc0f0
diff --git a/sys/arm/ti/am335x/am335x_dmtpps.c b/sys/arm/ti/am335x/am335x_dmtpps.c
new file mode 100644
index 0000000..08b4104
--- /dev/null
+++ b/sys/arm/ti/am335x/am335x_dmtpps.c
@@ -0,0 +1,549 @@
+/*-
+ * Copyright (c) 2015 Ian Lepore <ian@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * AM335x PPS driver using DMTimer capture.
+ *
+ * Note that this PPS driver does not use an interrupt. Instead it uses the
+ * hardware's ability to latch the timer's count register in response to a
+ * signal on an IO pin. Each of timers 4-7 have an associated pin, and this
+ * code allows any one of those to be used.
+ *
+ * The timecounter routines in kern_tc.c call the pps poll routine periodically
+ * to see if a new counter value has been latched. When a new value has been
+ * latched, the only processing done in the poll routine is to capture the
+ * current set of timecounter timehands (done with pps_capture()) and the
+ * latched value from the timer. The remaining work (done by pps_event() while
+ * holding a mutex) is scheduled to be done later in a non-interrupt context.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/timepps.h>
+#include <sys/timetc.h>
+#include <machine/bus.h>
+
+#include <dev/ofw/openfirm.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+#include <arm/ti/ti_prcm.h>
+#include <arm/ti/ti_hwmods.h>
+#include <arm/ti/ti_pinmux.h>
+#include <arm/ti/am335x/am335x_scm_padconf.h>
+
+#include "am335x_dmtreg.h"
+
+#define PPS_CDEV_NAME "dmtpps"
+
+struct dmtpps_softc {
+ device_t dev;
+ int mem_rid;
+ struct resource * mem_res;
+ int tmr_num; /* N from hwmod str "timerN" */
+ char tmr_name[12]; /* "DMTimerN" */
+ uint32_t tclr; /* Cached TCLR register. */
+ struct timecounter tc;
+ int pps_curmode; /* Edge mode now set in hw. */
+ struct task pps_task; /* For pps_event handling. */
+ struct cdev * pps_cdev;
+ struct pps_state pps_state;
+ struct mtx pps_mtx;
+};
+
+static int dmtpps_tmr_num; /* Set by probe() */
+
+/* List of compatible strings for FDT tree */
+static struct ofw_compat_data compat_data[] = {
+ {"ti,am335x-timer", 1},
+ {"ti,am335x-timer-1ms", 1},
+ {NULL, 0},
+};
+
+/*
+ * A table relating pad names to the hardware timer number they can be mux'd to.
+ */
+struct padinfo {
+ char * ballname;
+ int tmr_num;
+};
+static struct padinfo dmtpps_padinfo[] = {
+ {"GPMC_ADVn_ALE", 4},
+ {"I2C0_SDA", 4},
+ {"MII1_TX_EN", 4},
+ {"XDMA_EVENT_INTR0", 4},
+ {"GPMC_BEn0_CLE", 5},
+ {"MDC", 5},
+ {"MMC0_DAT3", 5},
+ {"UART1_RTSn", 5},
+ {"GPMC_WEn", 6},
+ {"MDIO", 6},
+ {"MMC0_DAT2", 6},
+ {"UART1_CTSn", 6},
+ {"GPMC_OEn_REn", 7},
+ {"I2C0_SCL", 7},
+ {"UART0_CTSn", 7},
+ {"XDMA_EVENT_INTR1", 7},
+ {NULL, 0}
+};
+
+/*
+ * This is either brilliantly user-friendly, or utterly lame...
+ *
+ * The am335x chip is used on the popular Beaglebone boards. Those boards have
+ * pins for all four capture-capable timers available on the P8 header. Allow
+ * users to configure the input pin by giving the name of the header pin.
+ */
+struct nicknames {
+ const char * nick;
+ const char * name;
+};
+static struct nicknames dmtpps_pin_nicks[] = {
+ {"P8-7", "GPMC_ADVn_ALE"},
+ {"P8-9", "GPMC_BEn0_CLE"},
+ {"P8-10", "GPMC_WEn"},
+	{"P8-8",  "GPMC_OEn_REn"},
+ {NULL, NULL}
+};
+
+#define DMTIMER_READ4(sc, reg) bus_read_4((sc)->mem_res, (reg))
+#define DMTIMER_WRITE4(sc, reg, val) bus_write_4((sc)->mem_res, (reg), (val))
+
+/*
+ * Translate a short friendly case-insensitive name to its canonical name.
+ */
+static const char *
+dmtpps_translate_nickname(const char *nick)
+{
+ struct nicknames *nn;
+
+ for (nn = dmtpps_pin_nicks; nn->nick != NULL; nn++)
+ if (strcasecmp(nick, nn->nick) == 0)
+			return (nn->name);
+ return (nick);
+}
+
+/*
+ * See if our tunable is set to the name of an input pin.  If not, that's
+ * not an error; return 0.  If so, try to configure that pin as a timer
+ * capture input.  If that works, we have our timer unit number; if it
+ * fails, that IS an error, so return -1.
+ */
+static int
+dmtpps_find_tmr_num_by_tunable(void)
+{
+ struct padinfo *pi;
+ char iname[20];
+ char muxmode[12];
+ const char * ballname;
+ int err;
+
+ if (!TUNABLE_STR_FETCH("hw.am335x_dmtpps.input", iname, sizeof(iname)))
+ return (0);
+ ballname = dmtpps_translate_nickname(iname);
+ for (pi = dmtpps_padinfo; pi->ballname != NULL; pi++) {
+ if (strcmp(ballname, pi->ballname) != 0)
+ continue;
+ snprintf(muxmode, sizeof(muxmode), "timer%d", pi->tmr_num);
+ err = ti_pinmux_padconf_set(pi->ballname, muxmode,
+ PADCONF_INPUT);
+ if (err != 0) {
+ printf("am335x_dmtpps: unable to configure capture pin "
+ "for %s to input mode\n", muxmode);
+ return (-1);
+ } else if (bootverbose) {
+ printf("am335x_dmtpps: configured pin %s as input "
+ "for %s\n", iname, muxmode);
+ }
+ return (pi->tmr_num);
+ }
+
+	/* An invalid name in the tunable is an error. */
+ printf("am335x_dmtpps: unknown pin name '%s'\n", iname);
+ return (-1);
+}
+
+/*
+ * Ask the pinmux driver whether any pin has been configured as a TIMER4..TIMER7
+ * input pin. If so, return the timer number; if not, return 0.
+ */
+static int
+dmtpps_find_tmr_num_by_padconf(void)
+{
+ int err;
+ unsigned int padstate;
+ const char * padmux;
+ struct padinfo *pi;
+ char muxmode[12];
+
+ for (pi = dmtpps_padinfo; pi->ballname != NULL; pi++) {
+ err = ti_pinmux_padconf_get(pi->ballname, &padmux, &padstate);
+ snprintf(muxmode, sizeof(muxmode), "timer%d", pi->tmr_num);
+ if (err == 0 && (padstate & RXACTIVE) != 0 &&
+ strcmp(muxmode, padmux) == 0)
+ return (pi->tmr_num);
+ }
+ /* Nothing found, not an error. */
+ return (0);
+}
+
+/*
+ * Figure out which hardware timer number to use based on input pin
+ * configuration. This is done just once, the first time probe() runs.
+ */
+static int
+dmtpps_find_tmr_num(void)
+{
+ int tmr_num;
+
+ if ((tmr_num = dmtpps_find_tmr_num_by_tunable()) == 0)
+ tmr_num = dmtpps_find_tmr_num_by_padconf();
+
+ if (tmr_num <= 0) {
+ printf("am335x_dmtpps: PPS driver not enabled: unable to find "
+ "or configure a capture input pin\n");
+ tmr_num = -1; /* Must return non-zero to prevent re-probing. */
+ }
+ return (tmr_num);
+}
+
+static void
+dmtpps_set_hw_capture(struct dmtpps_softc *sc, bool force_off)
+{
+ int newmode;
+
+ if (force_off)
+ newmode = 0;
+ else
+ newmode = sc->pps_state.ppsparam.mode & PPS_CAPTUREASSERT;
+
+ if (newmode == sc->pps_curmode)
+ return;
+ sc->pps_curmode = newmode;
+
+ if (newmode == PPS_CAPTUREASSERT)
+ sc->tclr |= DMT_TCLR_CAPTRAN_LOHI;
+ else
+ sc->tclr &= ~DMT_TCLR_CAPTRAN_MASK;
+ DMTIMER_WRITE4(sc, DMT_TCLR, sc->tclr);
+}
+
+static unsigned
+dmtpps_get_timecount(struct timecounter *tc)
+{
+ struct dmtpps_softc *sc;
+
+ sc = tc->tc_priv;
+
+ return (DMTIMER_READ4(sc, DMT_TCRR));
+}
+
+static void
+dmtpps_poll(struct timecounter *tc)
+{
+ struct dmtpps_softc *sc;
+
+ sc = tc->tc_priv;
+
+ /*
+	 * If a new value has been latched, we've got a PPS event.  Capture the
+ * timecounter data, then override the capcount field (pps_capture()
+ * populates it from the current DMT_TCRR register) with the latched
+ * value from the TCAR1 register.
+ *
+ * There is no locking here, by design. pps_capture() writes into an
+ * area of struct pps_state which is read only by pps_event(). The
+ * synchronization of access to that area is temporal rather than
+ * interlock based... we write in this routine and trigger the task that
+ * will read the data, so no simultaneous access can occur.
+ *
+ * Note that we don't have the TCAR interrupt enabled, but the hardware
+ * still provides the status bits in the "RAW" status register even when
+ * they're masked from generating an irq. However, when clearing the
+ * TCAR status to re-arm the capture for the next second, we have to
+ * write to the IRQ status register, not the RAW register. Quirky.
+ */
+ if (DMTIMER_READ4(sc, DMT_IRQSTATUS_RAW) & DMT_IRQ_TCAR) {
+ pps_capture(&sc->pps_state);
+ sc->pps_state.capcount = DMTIMER_READ4(sc, DMT_TCAR1);
+ DMTIMER_WRITE4(sc, DMT_IRQSTATUS, DMT_IRQ_TCAR);
+ taskqueue_enqueue_fast(taskqueue_fast, &sc->pps_task);
+ }
+}
+
+static void
+dmtpps_event(void *arg, int pending)
+{
+ struct dmtpps_softc *sc;
+
+ sc = arg;
+
+	/*
+	 * This is the task function that gets enqueued by dmtpps_poll().  Once
+	 * the time has been captured by the timecounter polling code, which
+	 * runs in
+ * primary interrupt context, the remaining (more expensive) work to
+ * process the event is done later in a threaded context.
+ *
+ * Here there is an interlock that protects the event data in struct
+ * pps_state. That data can be accessed at any time from userland via
+ * ioctl() calls so we must ensure that there is no read access to
+ * partially updated data while pps_event() does its work.
+ */
+ mtx_lock(&sc->pps_mtx);
+ pps_event(&sc->pps_state, PPS_CAPTUREASSERT);
+ mtx_unlock(&sc->pps_mtx);
+}
+
+static int
+dmtpps_open(struct cdev *dev, int flags, int fmt,
+ struct thread *td)
+{
+ struct dmtpps_softc *sc;
+
+ sc = dev->si_drv1;
+
+ /*
+ * Begin polling for pps and enable capture in the hardware whenever the
+ * device is open. Doing this stuff again is harmless if this isn't the
+ * first open.
+ */
+ sc->tc.tc_poll_pps = dmtpps_poll;
+ dmtpps_set_hw_capture(sc, false);
+
+	return (0);
+}
+
+static int
+dmtpps_close(struct cdev *dev, int flags, int fmt,
+ struct thread *td)
+{
+ struct dmtpps_softc *sc;
+
+ sc = dev->si_drv1;
+
+ /*
+ * Stop polling and disable capture on last close. Use the force-off
+ * flag to override the configured mode and turn off the hardware.
+ */
+ sc->tc.tc_poll_pps = NULL;
+ dmtpps_set_hw_capture(sc, true);
+
+	return (0);
+}
+
+static int
+dmtpps_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
+ int flags, struct thread *td)
+{
+ struct dmtpps_softc *sc;
+ int err;
+
+ sc = dev->si_drv1;
+
+ /* Let the kernel do the heavy lifting for ioctl. */
+ mtx_lock(&sc->pps_mtx);
+ err = pps_ioctl(cmd, data, &sc->pps_state);
+ mtx_unlock(&sc->pps_mtx);
+ if (err != 0)
+ return (err);
+
+ /*
+ * The capture mode could have changed, set the hardware to whatever
+ * mode is now current. Effectively a no-op if nothing changed.
+ */
+ dmtpps_set_hw_capture(sc, false);
+
+ return (err);
+}
+
+static struct cdevsw dmtpps_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = dmtpps_open,
+ .d_close = dmtpps_close,
+ .d_ioctl = dmtpps_ioctl,
+ .d_name = PPS_CDEV_NAME,
+};
+
+static int
+dmtpps_probe(device_t dev)
+{
+ char strbuf[64];
+ int tmr_num;
+
+ if (!ofw_bus_status_okay(dev))
+ return (ENXIO);
+
+ if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
+ return (ENXIO);
+
+ /*
+ * If we haven't chosen which hardware timer to use yet, go do that now.
+ * We need to know that to decide whether to return success for this
+ * hardware timer instance or not.
+ */
+ if (dmtpps_tmr_num == 0)
+ dmtpps_tmr_num = dmtpps_find_tmr_num();
+
+ /*
+ * Figure out which hardware timer is being probed and see if it matches
+ * the configured timer number determined earlier.
+ */
+ tmr_num = ti_hwmods_get_unit(dev, "timer");
+ if (dmtpps_tmr_num != tmr_num)
+ return (ENXIO);
+
+ snprintf(strbuf, sizeof(strbuf), "AM335x PPS-Capture DMTimer%d",
+ tmr_num);
+ device_set_desc_copy(dev, strbuf);
+
+	return (BUS_PROBE_DEFAULT);
+}
+
+static int
+dmtpps_attach(device_t dev)
+{
+ struct dmtpps_softc *sc;
+ clk_ident_t timer_id;
+ int err, sysclk_freq;
+
+ sc = device_get_softc(dev);
+ sc->dev = dev;
+
+ /* Get the base clock frequency. */
+ err = ti_prcm_clk_get_source_freq(SYS_CLK, &sysclk_freq);
+
+ /* Enable clocks and power on the device. */
+ if ((timer_id = ti_hwmods_get_clock(dev)) == INVALID_CLK_IDENT)
+ return (ENXIO);
+ if ((err = ti_prcm_clk_set_source(timer_id, SYSCLK_CLK)) != 0)
+ return (err);
+ if ((err = ti_prcm_clk_enable(timer_id)) != 0)
+ return (err);
+
+ /* Request the memory resources. */
+ sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
+ &sc->mem_rid, RF_ACTIVE);
+ if (sc->mem_res == NULL) {
+ return (ENXIO);
+ }
+
+ /* Figure out which hardware timer this is and set the name string. */
+ sc->tmr_num = ti_hwmods_get_unit(dev, "timer");
+ snprintf(sc->tmr_name, sizeof(sc->tmr_name), "DMTimer%d", sc->tmr_num);
+
+ /* Set up timecounter hardware, start it. */
+ DMTIMER_WRITE4(sc, DMT_TSICR, DMT_TSICR_RESET);
+ while (DMTIMER_READ4(sc, DMT_TIOCP_CFG) & DMT_TIOCP_RESET)
+ continue;
+
+ sc->tclr |= DMT_TCLR_START | DMT_TCLR_AUTOLOAD;
+ DMTIMER_WRITE4(sc, DMT_TLDR, 0);
+ DMTIMER_WRITE4(sc, DMT_TCRR, 0);
+ DMTIMER_WRITE4(sc, DMT_TCLR, sc->tclr);
+
+ /* Register the timecounter. */
+ sc->tc.tc_name = sc->tmr_name;
+ sc->tc.tc_get_timecount = dmtpps_get_timecount;
+ sc->tc.tc_counter_mask = ~0u;
+ sc->tc.tc_frequency = sysclk_freq;
+ sc->tc.tc_quality = 1000;
+ sc->tc.tc_priv = sc;
+
+ tc_init(&sc->tc);
+
+ /*
+ * Indicate our PPS capabilities. Have the kernel init its part of the
+ * pps_state struct and add its capabilities.
+ *
+ * While the hardware has a mode to capture each edge, it's not clear we
+ * can use it that way, because there's only a single interrupt/status
+ * bit to say something was captured, but not which edge it was. For
+ * now, just say we can only capture assert events (the positive-going
+ * edge of the pulse).
+ */
+ mtx_init(&sc->pps_mtx, "dmtpps", NULL, MTX_DEF);
+ sc->pps_state.ppscap = PPS_CAPTUREASSERT;
+ sc->pps_state.driver_abi = PPS_ABI_VERSION;
+ sc->pps_state.driver_mtx = &sc->pps_mtx;
+ pps_init_abi(&sc->pps_state);
+
+ /*
+ * Init the task that does deferred pps_event() processing after
+ * the polling routine has captured a pps pulse time.
+ */
+ TASK_INIT(&sc->pps_task, 0, dmtpps_event, sc);
+
+ /* Create the PPS cdev. */
+ sc->pps_cdev = make_dev(&dmtpps_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ PPS_CDEV_NAME);
+ sc->pps_cdev->si_drv1 = sc;
+
+ if (bootverbose)
+ device_printf(sc->dev, "Using %s for PPS device /dev/%s\n",
+ sc->tmr_name, PPS_CDEV_NAME);
+
+ return (0);
+}
+
+static int
+dmtpps_detach(device_t dev)
+{
+
+ /*
+ * There is no way to remove a timecounter once it has been registered,
+ * even if it's not in use, so we can never detach. If we were
+ * dynamically loaded as a module this will prevent unloading.
+ */
+ return (EBUSY);
+}
+
+static device_method_t dmtpps_methods[] = {
+ DEVMETHOD(device_probe, dmtpps_probe),
+ DEVMETHOD(device_attach, dmtpps_attach),
+ DEVMETHOD(device_detach, dmtpps_detach),
+ { 0, 0 }
+};
+
+static driver_t dmtpps_driver = {
+ "am335x_dmtpps",
+ dmtpps_methods,
+ sizeof(struct dmtpps_softc),
+};
+
+static devclass_t dmtpps_devclass;
+
+DRIVER_MODULE(am335x_dmtpps, simplebus, dmtpps_driver, dmtpps_devclass, 0, 0);
+MODULE_DEPEND(am335x_dmtpps, am335x_prcm, 1, 1, 1);
+
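
Once the module is loaded (and, if necessary, a capture pin chosen via the hw.am335x_dmtpps.input tunable, e.g. a Beaglebone header pin name such as "P8-7"), the driver appears as /dev/dmtpps and speaks the standard RFC 2783 PPS API. A hypothetical userland consumer, not part of this commit:

#include <sys/timepps.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	pps_handle_t h;
	pps_info_t info;
	pps_params_t params;
	struct timespec timeout = { 3, 0 };
	int fd;

	if ((fd = open("/dev/dmtpps", O_RDWR)) == -1)
		err(1, "open");
	if (time_pps_create(fd, &h) == -1)
		err(1, "time_pps_create");
	if (time_pps_getparams(h, &params) == -1)
		err(1, "time_pps_getparams");
	params.mode |= PPS_CAPTUREASSERT;	/* all this driver offers */
	if (time_pps_setparams(h, &params) == -1)
		err(1, "time_pps_setparams");
	for (;;) {
		if (time_pps_fetch(h, PPS_TSFMT_TSPEC, &info, &timeout) == -1)
			err(1, "time_pps_fetch");
		printf("assert %lu at %jd.%09ld\n",
		    (unsigned long)info.assert_sequence,
		    (intmax_t)info.assert_timestamp.tv_sec,
		    info.assert_timestamp.tv_nsec);
	}
}
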
diff --git a/sys/arm/ti/am335x/files.am335x b/sys/arm/ti/am335x/files.am335x
index 7293fd0..d0193e8 100644
--- a/sys/arm/ti/am335x/files.am335x
+++ b/sys/arm/ti/am335x/files.am335x
@@ -3,6 +3,7 @@
arm/ti/aintc.c standard
arm/ti/am335x/am335x_dmtimer.c standard
+arm/ti/am335x/am335x_dmtpps.c optional am335x_dmtpps
arm/ti/am335x/am335x_gpio.c optional gpio
arm/ti/am335x/am335x_lcd.c optional sc | vt
arm/ti/am335x/am335x_lcd_syscons.c optional sc
diff --git a/sys/arm/versatile/sp804.c b/sys/arm/versatile/sp804.c
index a69c018..de05700 100644
--- a/sys/arm/versatile/sp804.c
+++ b/sys/arm/versatile/sp804.c
@@ -244,7 +244,7 @@ sp804_timer_attach(device_t dev)
* Timer 1, timecounter
*/
sc->tc.tc_frequency = sc->sysclk_freq;
- sc->tc.tc_name = "SP804 Time Counter";
+ sc->tc.tc_name = "SP804-1";
sc->tc.tc_get_timecount = sp804_timer_tc_get_timecount;
sc->tc.tc_poll_pps = NULL;
sc->tc.tc_counter_mask = ~0u;
@@ -263,9 +263,7 @@ sp804_timer_attach(device_t dev)
* Timer 2, event timer
*/
sc->et_enabled = 0;
- sc->et.et_name = malloc(64, M_DEVBUF, M_NOWAIT | M_ZERO);
- sprintf(sc->et.et_name, "SP804 Event Timer %d",
- device_get_unit(dev));
+ sc->et.et_name = "SP804-2";
sc->et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT;
sc->et.et_quality = 1000;
sc->et.et_frequency = sc->sysclk_freq / DEFAULT_DIVISOR;
diff --git a/sys/arm64/arm64/bus_machdep.c b/sys/arm64/arm64/bus_machdep.c
index 25a675e..f6df4a1 100644
--- a/sys/arm64/arm64/bus_machdep.c
+++ b/sys/arm64/arm64/bus_machdep.c
@@ -49,6 +49,15 @@ void generic_bs_rm_4(void *, bus_space_handle_t, bus_size_t, uint32_t *,
void generic_bs_rm_8(void *, bus_space_handle_t, bus_size_t, uint64_t *,
bus_size_t);
+void generic_bs_rr_1(void *, bus_space_handle_t, bus_size_t, uint8_t *,
+ bus_size_t);
+void generic_bs_rr_2(void *, bus_space_handle_t, bus_size_t, uint16_t *,
+ bus_size_t);
+void generic_bs_rr_4(void *, bus_space_handle_t, bus_size_t, uint32_t *,
+ bus_size_t);
+void generic_bs_rr_8(void *, bus_space_handle_t, bus_size_t, uint64_t *,
+ bus_size_t);
+
void generic_bs_w_1(void *, bus_space_handle_t, bus_size_t, uint8_t);
void generic_bs_w_2(void *, bus_space_handle_t, bus_size_t, uint16_t);
void generic_bs_w_4(void *, bus_space_handle_t, bus_size_t, uint32_t);
@@ -63,6 +72,15 @@ void generic_bs_wm_4(void *, bus_space_handle_t, bus_size_t, const uint32_t *,
void generic_bs_wm_8(void *, bus_space_handle_t, bus_size_t, const uint64_t *,
bus_size_t);
+void generic_bs_wr_1(void *, bus_space_handle_t, bus_size_t, const uint8_t *,
+ bus_size_t);
+void generic_bs_wr_2(void *, bus_space_handle_t, bus_size_t, const uint16_t *,
+ bus_size_t);
+void generic_bs_wr_4(void *, bus_space_handle_t, bus_size_t, const uint32_t *,
+ bus_size_t);
+void generic_bs_wr_8(void *, bus_space_handle_t, bus_size_t, const uint64_t *,
+ bus_size_t);
+
static int
generic_bs_map(void *t, bus_addr_t bpa, bus_size_t size, int flags,
bus_space_handle_t *bshp)
@@ -126,6 +144,12 @@ struct bus_space memmap_bus = {
.bs_rm_4 = generic_bs_rm_4,
.bs_rm_8 = generic_bs_rm_8,
+ /* read region */
+ .bs_rr_1 = generic_bs_rr_1,
+ .bs_rr_2 = generic_bs_rr_2,
+ .bs_rr_4 = generic_bs_rr_4,
+ .bs_rr_8 = generic_bs_rr_8,
+
/* write single */
.bs_w_1 = generic_bs_w_1,
.bs_w_2 = generic_bs_w_2,
@@ -139,10 +163,10 @@ struct bus_space memmap_bus = {
.bs_wm_8 = generic_bs_wm_8,
/* write region */
- .bs_wr_1 = NULL,
- .bs_wr_2 = NULL,
- .bs_wr_4 = NULL,
- .bs_wr_8 = NULL,
+ .bs_wr_1 = generic_bs_wr_1,
+ .bs_wr_2 = generic_bs_wr_2,
+ .bs_wr_4 = generic_bs_wr_4,
+ .bs_wr_8 = generic_bs_wr_8,
/* set multiple */
.bs_sm_1 = NULL,
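
With these handlers wired up, arm64 drivers can use the standard bus_space region KPI to move a block of adjacent device registers in a single call; each routine in the assembly below post-increments both the device offset and the kernel buffer. A hedged sketch of a caller, where the softc layout and the REG_WINDOW offset are assumptions for illustration:

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/bus.h>

#define	REG_WINDOW	0x100	/* hypothetical device register block */

struct foo_softc {
	struct resource	*res;	/* mapped SYS_RES_MEMORY resource */
};

static void
foo_read_window(struct foo_softc *sc, uint32_t *buf, bus_size_t count)
{
	/* Reads 'count' 32-bit words from successive device offsets. */
	bus_space_read_region_4(rman_get_bustag(sc->res),
	    rman_get_bushandle(sc->res), REG_WINDOW, buf, count);
}
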
diff --git a/sys/arm64/arm64/bus_space_asm.S b/sys/arm64/arm64/bus_space_asm.S
index 20d4128..d919bd5 100644
--- a/sys/arm64/arm64/bus_space_asm.S
+++ b/sys/arm64/arm64/bus_space_asm.S
@@ -133,6 +133,90 @@ ENTRY(generic_bs_rm_8)
2: ret
END(generic_bs_rm_8)
+ENTRY(generic_bs_rr_1)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldrb w1, [x0], #1
+ strb w1, [x3], #1
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_1)
+
+ENTRY(generic_bs_rr_2)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldrh w1, [x0], #2
+ strh w1, [x3], #2
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_2)
+
+ENTRY(generic_bs_rr_4)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldr w1, [x0], #4
+ str w1, [x3], #4
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_4)
+
+ENTRY(generic_bs_rr_8)
+	/* Is there anything to read? */
+ cbz x4, 2f
+
+ /* Calculate the device address. */
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Read the data. */
+1: ldr x1, [x0], #8
+ str x1, [x3], #8
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_rr_8)
+
ENTRY(generic_bs_w_1)
strb w3, [x1, x2]
@@ -233,3 +317,83 @@ ENTRY(generic_bs_wm_8)
2: ret
END(generic_bs_wm_8)
+
+ENTRY(generic_bs_wr_1)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldrb w1, [x3], #1
+ strb w1, [x0], #1
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_1)
+
+ENTRY(generic_bs_wr_2)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldrh w1, [x3], #2
+ strh w1, [x0], #2
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_2)
+
+ENTRY(generic_bs_wr_4)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldr w1, [x3], #4
+ str w1, [x0], #4
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_4)
+
+ENTRY(generic_bs_wr_8)
+	/* Is there anything to write? */
+ cbz x4, 2f
+
+ add x0, x1, x2
+ /*
+ * x0 = The device address.
+ * x3 = The kernel address.
+ * x4 = Count
+ */
+
+ /* Write the data */
+1: ldr x1, [x3], #8
+ str x1, [x0], #8
+ subs x4, x4, #1
+ b.ne 1b
+
+2: ret
+END(generic_bs_wr_8)
diff --git a/sys/arm64/arm64/exception.S b/sys/arm64/arm64/exception.S
index 4f457da..b05941f 100644
--- a/sys/arm64/arm64/exception.S
+++ b/sys/arm64/arm64/exception.S
@@ -104,7 +104,7 @@ __FBSDID("$FreeBSD$");
/* Read the current thread flags */
1: ldr x1, [x18, #PC_CURTHREAD] /* Load curthread */
- ldr x2, [x1, #TD_FLAGS]! /* TODO: No need for the ! but clang fails without it */
+ ldr x2, [x1, #TD_FLAGS]
/* Check if we have either bits set */
mov x3, #((TDF_ASTPENDING|TDF_NEEDRESCHED) >> 8)
diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c
index 41e92a6..fa9aaa8 100644
--- a/sys/arm64/arm64/trap.c
+++ b/sys/arm64/arm64/trap.c
@@ -229,6 +229,21 @@ data_abort(struct trapframe *frame, uint64_t esr, int lower)
userret(td, frame);
}
+static void
+print_registers(struct trapframe *frame)
+{
+ u_int reg;
+
+ for (reg = 0; reg < 31; reg++) {
+ printf(" %sx%d: %16lx\n", (reg < 10) ? " " : "", reg,
+ frame->tf_x[reg]);
+ }
+ printf(" sp: %16lx\n", frame->tf_sp);
+ printf(" lr: %16lx\n", frame->tf_lr);
+ printf(" elr: %16lx\n", frame->tf_elr);
+ printf("spsr: %16lx\n", frame->tf_spsr);
+}
+
void
do_el1h_sync(struct trapframe *frame)
{
@@ -265,6 +280,7 @@ do_el1h_sync(struct trapframe *frame)
switch(exception) {
case EXCP_FP_SIMD:
case EXCP_TRAP_FP:
+ print_registers(frame);
panic("VFP exception in the kernel");
case EXCP_DATA_ABORT:
data_abort(frame, esr, 0);
@@ -286,11 +302,30 @@ do_el1h_sync(struct trapframe *frame)
#endif
break;
default:
+ print_registers(frame);
panic("Unknown kernel exception %x esr_el1 %lx\n", exception,
esr);
}
}
+/*
+ * We get EXCP_UNKNOWN from QEMU when executing zeroed memory. For now turn
+ * this into a SIGILL.
+ */
+static void
+el0_excp_unknown(struct trapframe *frame)
+{
+ struct thread *td;
+ uint64_t far;
+
+ td = curthread;
+ far = READ_SPECIALREG(far_el1);
+ printf("el0 EXCP_UNKNOWN exception\n");
+ print_registers(frame);
+ call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)far);
+ userret(td, frame);
+}
+
void
do_el0_sync(struct trapframe *frame)
{
@@ -332,7 +367,11 @@ do_el0_sync(struct trapframe *frame)
case EXCP_DATA_ABORT:
data_abort(frame, esr, 1);
break;
+ case EXCP_UNKNOWN:
+ el0_excp_unknown(frame);
+ break;
default:
+ print_registers(frame);
panic("Unknown userland exception %x esr_el1 %lx\n", exception,
esr);
}
diff --git a/sys/boot/kshim/bsd_kernel.h b/sys/boot/kshim/bsd_kernel.h
index 0e40fb0..aba8131 100644
--- a/sys/boot/kshim/bsd_kernel.h
+++ b/sys/boot/kshim/bsd_kernel.h
@@ -43,7 +43,8 @@
#define M_USBDEV 0
#define USB_PROC_MAX 3
#define USB_BUS_GIANT_PROC(bus) (usb_process + 2)
-#define USB_BUS_NON_GIANT_PROC(bus) (usb_process + 2)
+#define USB_BUS_NON_GIANT_BULK_PROC(bus) (usb_process + 2)
+#define USB_BUS_NON_GIANT_ISOC_PROC(bus) (usb_process + 2)
#define USB_BUS_EXPLORE_PROC(bus) (usb_process + 0)
#define USB_BUS_CONTROL_XFER_PROC(bus) (usb_process + 1)
#define SYSCTL_DECL(...)
diff --git a/sys/boot/uboot/fdt/uboot_fdt.c b/sys/boot/uboot/fdt/uboot_fdt.c
index 86f46e9..6b646f6 100644
--- a/sys/boot/uboot/fdt/uboot_fdt.c
+++ b/sys/boot/uboot/fdt/uboot_fdt.c
@@ -69,10 +69,11 @@ fdt_platform_load_dtb(void)
}
/*
- * If the U-boot environment contains a variable giving the name of a
- * file, use it if we can load and validate it.
+	 * Try to get the FDT filename from the loader environment first, then
+	 * fall back to the U-Boot environment.
*/
- s = ub_env_get("fdtfile");
+ s = getenv("fdt_file");
+ if (s == NULL)
+ s = ub_env_get("fdtfile");
if (s == NULL)
s = ub_env_get("fdt_file");
if (s != NULL && *s != '\0') {
diff --git a/sys/cam/ctl/README.ctl.txt b/sys/cam/ctl/README.ctl.txt
index a6de201..d4dc938 100644
--- a/sys/cam/ctl/README.ctl.txt
+++ b/sys/cam/ctl/README.ctl.txt
@@ -366,16 +366,6 @@ This is a CTL frontend port that is also a CAM SIM. The idea is that this
frontend allows for using CTL without any target-capable hardware. So any
LUNs you create in CTL are visible via this port.
-
-ctl_frontend_internal.c
-ctl_frontend_internal.h:
------------------------
-
-This is a frontend port written for Copan to do some system-specific tasks
-that required sending commands into CTL from inside the kernel. This isn't
-entirely relevant to FreeBSD in general, but can perhaps be repurposed or
-removed later.
-
ctl_ha.h:
--------
diff --git a/sys/cam/ctl/ctl.c b/sys/cam/ctl/ctl.c
index bdf5e6a..9141fc8 100644
--- a/sys/cam/ctl/ctl.c
+++ b/sys/cam/ctl/ctl.c
@@ -72,7 +72,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
@@ -383,18 +382,7 @@ static int ctl_init(void);
void ctl_shutdown(void);
static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td);
static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td);
-static void ctl_ioctl_online(void *arg);
-static void ctl_ioctl_offline(void *arg);
-static int ctl_ioctl_lun_enable(void *arg, int lun_id);
-static int ctl_ioctl_lun_disable(void *arg, int lun_id);
-static int ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio);
static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio);
-static int ctl_ioctl_submit_wait(union ctl_io *io);
-static void ctl_ioctl_datamove(union ctl_io *io);
-static void ctl_ioctl_done(union ctl_io *io);
-static void ctl_ioctl_hard_startstop_callback(void *arg,
- struct cfi_metatask *metatask);
-static void ctl_ioctl_bbrread_callback(void *arg,struct cfi_metatask *metatask);
static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num,
struct ctl_ooa *ooa_hdr,
struct ctl_ooa_entry *kern_entries);
@@ -529,11 +517,6 @@ static moduledata_t ctl_moduledata = {
DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD);
MODULE_VERSION(ctl, 1);
-static struct ctl_frontend ioctl_frontend =
-{
- .name = "ioctl",
-};
-
#ifdef notyet
static void
ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc,
@@ -1064,7 +1047,6 @@ ctl_init(void)
{
struct ctl_softc *softc;
void *other_pool;
- struct ctl_port *port;
int i, error, retval;
//int isc_retval;
@@ -1189,32 +1171,6 @@ ctl_init(void)
return (error);
}
- /*
- * Initialize the ioctl front end.
- */
- ctl_frontend_register(&ioctl_frontend);
- port = &softc->ioctl_info.port;
- port->frontend = &ioctl_frontend;
- sprintf(softc->ioctl_info.port_name, "ioctl");
- port->port_type = CTL_PORT_IOCTL;
- port->num_requested_ctl_io = 100;
- port->port_name = softc->ioctl_info.port_name;
- port->port_online = ctl_ioctl_online;
- port->port_offline = ctl_ioctl_offline;
- port->onoff_arg = &softc->ioctl_info;
- port->lun_enable = ctl_ioctl_lun_enable;
- port->lun_disable = ctl_ioctl_lun_disable;
- port->targ_lun_arg = &softc->ioctl_info;
- port->fe_datamove = ctl_ioctl_datamove;
- port->fe_done = ctl_ioctl_done;
- port->max_targets = 15;
- port->max_target_id = 15;
-
- if (ctl_port_register(&softc->ioctl_info.port) != 0) {
- printf("ctl: ioctl front end registration failed, will "
- "continue anyway\n");
- }
-
SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "ha_state", CTLTYPE_INT | CTLFLAG_RWTUN,
softc, 0, ctl_ha_state_sysctl, "I", "HA state for this head");
@@ -1238,9 +1194,6 @@ ctl_shutdown(void)
softc = (struct ctl_softc *)control_softc;
- if (ctl_port_deregister(&softc->ioctl_info.port) != 0)
- printf("ctl: ioctl front end deregistration failed\n");
-
mtx_lock(&softc->ctl_lock);
/*
@@ -1253,8 +1206,6 @@ ctl_shutdown(void)
mtx_unlock(&softc->ctl_lock);
- ctl_frontend_deregister(&ioctl_frontend);
-
#if 0
ctl_shutdown_thread(softc->work_thread);
mtx_destroy(&softc->queue_lock);
@@ -1426,26 +1377,6 @@ ctl_port_list(struct ctl_port_entry *entries, int num_entries_alloced,
return (retval);
}
-static void
-ctl_ioctl_online(void *arg)
-{
- struct ctl_ioctl_info *ioctl_info;
-
- ioctl_info = (struct ctl_ioctl_info *)arg;
-
- ioctl_info->flags |= CTL_IOCTL_FLAG_ENABLED;
-}
-
-static void
-ctl_ioctl_offline(void *arg)
-{
- struct ctl_ioctl_info *ioctl_info;
-
- ioctl_info = (struct ctl_ioctl_info *)arg;
-
- ioctl_info->flags &= ~CTL_IOCTL_FLAG_ENABLED;
-}
-
/*
* Remove an initiator by port number and initiator ID.
* Returns 0 for success, -1 for failure.
@@ -1641,181 +1572,6 @@ ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf)
}
}
-static int
-ctl_ioctl_lun_enable(void *arg, int lun_id)
-{
- return (0);
-}
-
-static int
-ctl_ioctl_lun_disable(void *arg, int lun_id)
-{
- return (0);
-}
-
-/*
- * Data movement routine for the CTL ioctl frontend port.
- */
-static int
-ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio)
-{
- struct ctl_sg_entry *ext_sglist, *kern_sglist;
- struct ctl_sg_entry ext_entry, kern_entry;
- int ext_sglen, ext_sg_entries, kern_sg_entries;
- int ext_sg_start, ext_offset;
- int len_to_copy, len_copied;
- int kern_watermark, ext_watermark;
- int ext_sglist_malloced;
- int i, j;
-
- ext_sglist_malloced = 0;
- ext_sg_start = 0;
- ext_offset = 0;
-
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n"));
-
- /*
- * If this flag is set, fake the data transfer.
- */
- if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) {
- ctsio->ext_data_filled = ctsio->ext_data_len;
- goto bailout;
- }
-
- /*
- * To simplify things here, if we have a single buffer, stick it in
- * a S/G entry and just make it a single entry S/G list.
- */
- if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
- int len_seen;
-
- ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
-
- ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL,
- M_WAITOK);
- ext_sglist_malloced = 1;
- if (copyin(ctsio->ext_data_ptr, ext_sglist,
- ext_sglen) != 0) {
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/ 0);
- goto bailout;
- }
- ext_sg_entries = ctsio->ext_sg_entries;
- len_seen = 0;
- for (i = 0; i < ext_sg_entries; i++) {
- if ((len_seen + ext_sglist[i].len) >=
- ctsio->ext_data_filled) {
- ext_sg_start = i;
- ext_offset = ctsio->ext_data_filled - len_seen;
- break;
- }
- len_seen += ext_sglist[i].len;
- }
- } else {
- ext_sglist = &ext_entry;
- ext_sglist->addr = ctsio->ext_data_ptr;
- ext_sglist->len = ctsio->ext_data_len;
- ext_sg_entries = 1;
- ext_sg_start = 0;
- ext_offset = ctsio->ext_data_filled;
- }
-
- if (ctsio->kern_sg_entries > 0) {
- kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
- kern_sg_entries = ctsio->kern_sg_entries;
- } else {
- kern_sglist = &kern_entry;
- kern_sglist->addr = ctsio->kern_data_ptr;
- kern_sglist->len = ctsio->kern_data_len;
- kern_sg_entries = 1;
- }
-
-
- kern_watermark = 0;
- ext_watermark = ext_offset;
- len_copied = 0;
- for (i = ext_sg_start, j = 0;
- i < ext_sg_entries && j < kern_sg_entries;) {
- uint8_t *ext_ptr, *kern_ptr;
-
- len_to_copy = MIN(ext_sglist[i].len - ext_watermark,
- kern_sglist[j].len - kern_watermark);
-
- ext_ptr = (uint8_t *)ext_sglist[i].addr;
- ext_ptr = ext_ptr + ext_watermark;
- if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
- /*
- * XXX KDM fix this!
- */
- panic("need to implement bus address support");
-#if 0
- kern_ptr = bus_to_virt(kern_sglist[j].addr);
-#endif
- } else
- kern_ptr = (uint8_t *)kern_sglist[j].addr;
- kern_ptr = kern_ptr + kern_watermark;
-
- kern_watermark += len_to_copy;
- ext_watermark += len_to_copy;
-
- if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
- CTL_FLAG_DATA_IN) {
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
- "bytes to user\n", len_to_copy));
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
- "to %p\n", kern_ptr, ext_ptr));
- if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) {
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/ 0);
- goto bailout;
- }
- } else {
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
- "bytes from user\n", len_to_copy));
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
- "to %p\n", ext_ptr, kern_ptr));
- if (copyin(ext_ptr, kern_ptr, len_to_copy)!= 0){
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/0);
- goto bailout;
- }
- }
-
- len_copied += len_to_copy;
-
- if (ext_sglist[i].len == ext_watermark) {
- i++;
- ext_watermark = 0;
- }
-
- if (kern_sglist[j].len == kern_watermark) {
- j++;
- kern_watermark = 0;
- }
- }
-
- ctsio->ext_data_filled += len_copied;
-
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, "
- "kern_sg_entries: %d\n", ext_sg_entries,
- kern_sg_entries));
- CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, "
- "kern_data_len = %d\n", ctsio->ext_data_len,
- ctsio->kern_data_len));
-
-
- /* XXX KDM set residual?? */
-bailout:
-
- if (ext_sglist_malloced != 0)
- free(ext_sglist, M_CTL);
-
- return (CTL_RETVAL_COMPLETE);
-}
-
/*
* Serialize a command that went down the "wrong" side, and so was sent to
* this controller for execution. The logic is a little different than the
@@ -1982,149 +1738,6 @@ ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio)
return (retval);
}
-static int
-ctl_ioctl_submit_wait(union ctl_io *io)
-{
- struct ctl_fe_ioctl_params params;
- ctl_fe_ioctl_state last_state;
- int done, retval;
-
- retval = 0;
-
- bzero(&params, sizeof(params));
-
- mtx_init(&params.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF);
- cv_init(&params.sem, "ctlioccv");
- params.state = CTL_IOCTL_INPROG;
- last_state = params.state;
-
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = &params;
-
- CTL_DEBUG_PRINT(("ctl_ioctl_submit_wait\n"));
-
- /* This shouldn't happen */
- if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE)
- return (retval);
-
- done = 0;
-
- do {
- mtx_lock(&params.ioctl_mtx);
- /*
- * Check the state here, and don't sleep if the state has
- * already changed (i.e. wakeup has already occured, but we
- * weren't waiting yet).
- */
- if (params.state == last_state) {
- /* XXX KDM cv_wait_sig instead? */
- cv_wait(&params.sem, &params.ioctl_mtx);
- }
- last_state = params.state;
-
- switch (params.state) {
- case CTL_IOCTL_INPROG:
- /* Why did we wake up? */
- /* XXX KDM error here? */
- mtx_unlock(&params.ioctl_mtx);
- break;
- case CTL_IOCTL_DATAMOVE:
- CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n"));
-
- /*
- * change last_state back to INPROG to avoid
- * deadlock on subsequent data moves.
- */
- params.state = last_state = CTL_IOCTL_INPROG;
-
- mtx_unlock(&params.ioctl_mtx);
- ctl_ioctl_do_datamove(&io->scsiio);
- /*
- * Note that in some cases, most notably writes,
- * this will queue the I/O and call us back later.
- * In other cases, generally reads, this routine
- * will immediately call back and wake us up,
- * probably using our own context.
- */
- io->scsiio.be_move_done(io);
- break;
- case CTL_IOCTL_DONE:
- mtx_unlock(&params.ioctl_mtx);
- CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n"));
- done = 1;
- break;
- default:
- mtx_unlock(&params.ioctl_mtx);
- /* XXX KDM error here? */
- break;
- }
- } while (done == 0);
-
- mtx_destroy(&params.ioctl_mtx);
- cv_destroy(&params.sem);
-
- return (CTL_RETVAL_COMPLETE);
-}
-
-static void
-ctl_ioctl_datamove(union ctl_io *io)
-{
- struct ctl_fe_ioctl_params *params;
-
- params = (struct ctl_fe_ioctl_params *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- mtx_lock(&params->ioctl_mtx);
- params->state = CTL_IOCTL_DATAMOVE;
- cv_broadcast(&params->sem);
- mtx_unlock(&params->ioctl_mtx);
-}
-
-static void
-ctl_ioctl_done(union ctl_io *io)
-{
- struct ctl_fe_ioctl_params *params;
-
- params = (struct ctl_fe_ioctl_params *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- mtx_lock(&params->ioctl_mtx);
- params->state = CTL_IOCTL_DONE;
- cv_broadcast(&params->sem);
- mtx_unlock(&params->ioctl_mtx);
-}
-
-static void
-ctl_ioctl_hard_startstop_callback(void *arg, struct cfi_metatask *metatask)
-{
- struct ctl_fe_ioctl_startstop_info *sd_info;
-
- sd_info = (struct ctl_fe_ioctl_startstop_info *)arg;
-
- sd_info->hs_info.status = metatask->status;
- sd_info->hs_info.total_luns = metatask->taskinfo.startstop.total_luns;
- sd_info->hs_info.luns_complete =
- metatask->taskinfo.startstop.luns_complete;
- sd_info->hs_info.luns_failed = metatask->taskinfo.startstop.luns_failed;
-
- cv_broadcast(&sd_info->sem);
-}
-
-static void
-ctl_ioctl_bbrread_callback(void *arg, struct cfi_metatask *metatask)
-{
- struct ctl_fe_ioctl_bbrread_info *fe_bbr_info;
-
- fe_bbr_info = (struct ctl_fe_ioctl_bbrread_info *)arg;
-
- mtx_lock(fe_bbr_info->lock);
- fe_bbr_info->bbr_info->status = metatask->status;
- fe_bbr_info->bbr_info->bbr_status = metatask->taskinfo.bbrread.status;
- fe_bbr_info->wakeup_done = 1;
- mtx_unlock(fe_bbr_info->lock);
-
- cv_broadcast(&fe_bbr_info->sem);
-}
-
/*
* Returns 0 for success, errno for failure.
*/
@@ -2367,57 +1980,9 @@ ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
retval = 0;
switch (cmd) {
- case CTL_IO: {
- union ctl_io *io;
- void *pool_tmp;
-
- /*
- * If we haven't been "enabled", don't allow any SCSI I/O
- * to this FETD.
- */
- if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0) {
- retval = EPERM;
- break;
- }
-
- io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref);
-
- /*
- * Need to save the pool reference so it doesn't get
- * spammed by the user's ctl_io.
- */
- pool_tmp = io->io_hdr.pool;
- memcpy(io, (void *)addr, sizeof(*io));
- io->io_hdr.pool = pool_tmp;
-
- /*
- * No status yet, so make sure the status is set properly.
- */
- io->io_hdr.status = CTL_STATUS_NONE;
-
- /*
- * The user sets the initiator ID, target and LUN IDs.
- */
- io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port;
- io->io_hdr.flags |= CTL_FLAG_USER_REQ;
- if ((io->io_hdr.io_type == CTL_IO_SCSI)
- && (io->scsiio.tag_type != CTL_TAG_UNTAGGED))
- io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++;
-
- retval = ctl_ioctl_submit_wait(io);
-
- if (retval != 0) {
- ctl_free_io(io);
- break;
- }
-
- memcpy((void *)addr, io, sizeof(*io));
-
- /* return this to our pool */
- ctl_free_io(io);
-
+ case CTL_IO:
+ retval = ctl_ioctl_io(dev, cmd, addr, flag, td);
break;
- }
case CTL_ENABLE_PORT:
case CTL_DISABLE_PORT:
case CTL_SET_PORT_WWNS: {
@@ -2724,103 +2289,6 @@ ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
break;
}
- case CTL_HARD_START:
- case CTL_HARD_STOP: {
- struct ctl_fe_ioctl_startstop_info ss_info;
- struct cfi_metatask *metatask;
- struct mtx hs_mtx;
-
- mtx_init(&hs_mtx, "HS Mutex", NULL, MTX_DEF);
-
- cv_init(&ss_info.sem, "hard start/stop cv" );
-
- metatask = cfi_alloc_metatask(/*can_wait*/ 1);
- if (metatask == NULL) {
- retval = ENOMEM;
- mtx_destroy(&hs_mtx);
- break;
- }
-
- if (cmd == CTL_HARD_START)
- metatask->tasktype = CFI_TASK_STARTUP;
- else
- metatask->tasktype = CFI_TASK_SHUTDOWN;
-
- metatask->callback = ctl_ioctl_hard_startstop_callback;
- metatask->callback_arg = &ss_info;
-
- cfi_action(metatask);
-
- /* Wait for the callback */
- mtx_lock(&hs_mtx);
- cv_wait_sig(&ss_info.sem, &hs_mtx);
- mtx_unlock(&hs_mtx);
-
- /*
- * All information has been copied from the metatask by the
- * time cv_broadcast() is called, so we free the metatask here.
- */
- cfi_free_metatask(metatask);
-
- memcpy((void *)addr, &ss_info.hs_info, sizeof(ss_info.hs_info));
-
- mtx_destroy(&hs_mtx);
- break;
- }
- case CTL_BBRREAD: {
- struct ctl_bbrread_info *bbr_info;
- struct ctl_fe_ioctl_bbrread_info fe_bbr_info;
- struct mtx bbr_mtx;
- struct cfi_metatask *metatask;
-
- bbr_info = (struct ctl_bbrread_info *)addr;
-
- bzero(&fe_bbr_info, sizeof(fe_bbr_info));
-
- bzero(&bbr_mtx, sizeof(bbr_mtx));
- mtx_init(&bbr_mtx, "BBR Mutex", NULL, MTX_DEF);
-
- fe_bbr_info.bbr_info = bbr_info;
- fe_bbr_info.lock = &bbr_mtx;
-
- cv_init(&fe_bbr_info.sem, "BBR read cv");
- metatask = cfi_alloc_metatask(/*can_wait*/ 1);
-
- if (metatask == NULL) {
- mtx_destroy(&bbr_mtx);
- cv_destroy(&fe_bbr_info.sem);
- retval = ENOMEM;
- break;
- }
- metatask->tasktype = CFI_TASK_BBRREAD;
- metatask->callback = ctl_ioctl_bbrread_callback;
- metatask->callback_arg = &fe_bbr_info;
- metatask->taskinfo.bbrread.lun_num = bbr_info->lun_num;
- metatask->taskinfo.bbrread.lba = bbr_info->lba;
- metatask->taskinfo.bbrread.len = bbr_info->len;
-
- cfi_action(metatask);
-
- mtx_lock(&bbr_mtx);
- while (fe_bbr_info.wakeup_done == 0)
- cv_wait_sig(&fe_bbr_info.sem, &bbr_mtx);
- mtx_unlock(&bbr_mtx);
-
- bbr_info->status = metatask->status;
- bbr_info->bbr_status = metatask->taskinfo.bbrread.status;
- bbr_info->scsi_status = metatask->taskinfo.bbrread.scsi_status;
- memcpy(&bbr_info->sense_data,
- &metatask->taskinfo.bbrread.sense_data,
- MIN(sizeof(bbr_info->sense_data),
- sizeof(metatask->taskinfo.bbrread.sense_data)));
-
- cfi_free_metatask(metatask);
-
- mtx_destroy(&bbr_mtx);
- cv_destroy(&fe_bbr_info.sem);
-
- break;
- }
case CTL_DELAY_IO: {
struct ctl_io_delay_info *delay_info;
#ifdef CTL_IO_DELAY
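The CTL_IO handler deleted from ctl_ioctl() above moves into ctl_ioctl_io() in the new ctl_frontend_ioctl.c. Condensed from the removed lines, its core obligations are: refuse I/O until the FETD is enabled, preserve the io pool pointer across the user's copy of the ctl_io, clear any stale status, and stamp the nexus and tag before submitting. A sketch reconstructed from the removed handler (the helper name and condensed shape are illustrative, not the new file's exact text):

    static int
    ctl_ioctl_io_sketch(struct ctl_softc *softc, caddr_t addr)
    {
    	union ctl_io *io;
    	void *pool_tmp;
    	int retval;

    	/* No SCSI I/O until this FETD has been enabled. */
    	if ((softc->ioctl_info.flags & CTL_IOCTL_FLAG_ENABLED) == 0)
    		return (EPERM);

    	io = ctl_alloc_io(softc->ioctl_info.port.ctl_pool_ref);

    	/*
    	 * Keep the pool reference: copying the user's ctl_io over the
    	 * whole union would otherwise clobber it.
    	 */
    	pool_tmp = io->io_hdr.pool;
    	memcpy(io, (void *)addr, sizeof(*io));
    	io->io_hdr.pool = pool_tmp;

    	/* The kernel, not the user, owns status, port, and tag. */
    	io->io_hdr.status = CTL_STATUS_NONE;
    	io->io_hdr.nexus.targ_port = softc->ioctl_info.port.targ_port;
    	io->io_hdr.flags |= CTL_FLAG_USER_REQ;
    	if (io->io_hdr.io_type == CTL_IO_SCSI &&
    	    io->scsiio.tag_type != CTL_TAG_UNTAGGED)
    		io->scsiio.tag_num = softc->ioctl_info.cur_tag_num++;

    	retval = ctl_ioctl_submit_wait(io);
    	if (retval == 0)
    		memcpy((void *)addr, io, sizeof(*io));
    	ctl_free_io(io);
    	return (retval);
    }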
diff --git a/sys/cam/ctl/ctl.h b/sys/cam/ctl/ctl.h
index b1d9118..2826742 100644
--- a/sys/cam/ctl/ctl.h
+++ b/sys/cam/ctl/ctl.h
@@ -194,6 +194,8 @@ void ctl_portDB_changed(int portnum);
#ifdef notyet
void ctl_init_isc_msg(void);
#endif
+int ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
+ struct thread *td);
/*
* KPI to manipulate LUN/port options
diff --git a/sys/cam/ctl/ctl_backend.c b/sys/cam/ctl/ctl_backend.c
index cabecb7..ae5034b 100644
--- a/sys/cam/ctl/ctl_backend.c
+++ b/sys/cam/ctl/ctl_backend.c
@@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_private.h>
diff --git a/sys/cam/ctl/ctl_backend_block.c b/sys/cam/ctl/ctl_backend_block.c
index 5bb3121..65d0491 100644
--- a/sys/cam/ctl/ctl_backend_block.c
+++ b/sys/cam/ctl/ctl_backend_block.c
@@ -84,7 +84,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_scsi_all.h>
#include <cam/ctl/ctl_error.h>
@@ -170,7 +169,6 @@ struct ctl_be_block_lun {
uint64_t size_blocks;
uint64_t size_bytes;
uint32_t blocksize;
- int blocksize_shift;
uint16_t pblockexp;
uint16_t pblockoff;
uint16_t ublockexp;
@@ -773,7 +771,7 @@ ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
DPRINTF("entered\n");
- off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
+ off = roff = ((off_t)lbalen->lba) * be_lun->blocksize;
vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
0, curthread->td_ucred, curthread);
@@ -791,10 +789,9 @@ ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
}
VOP_UNLOCK(be_lun->vn, 0);
- off >>= be_lun->blocksize_shift;
data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
scsi_u64to8b(lbalen->lba, data->descr[0].addr);
- scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
+ scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->blocksize - lbalen->lba),
data->descr[0].length);
data->descr[0].status = status;
@@ -816,14 +813,14 @@ ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
if (strcmp(attrname, "blocksused") == 0) {
error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
if (error == 0)
- val = vattr.va_bytes >> be_lun->blocksize_shift;
+ val = vattr.va_bytes / be_lun->blocksize;
}
if (strcmp(attrname, "blocksavail") == 0 &&
(be_lun->vn->v_iflag & VI_DOOMED) == 0) {
error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
if (error == 0)
- val = (statfs.f_bavail * statfs.f_bsize) >>
- be_lun->blocksize_shift;
+ val = statfs.f_bavail * statfs.f_bsize /
+ be_lun->blocksize;
}
VOP_UNLOCK(be_lun->vn, 0);
return (val);
@@ -934,7 +931,7 @@ ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
DPRINTF("entered\n");
- off = roff = ((off_t)lbalen->lba) << be_lun->blocksize_shift;
+ off = roff = ((off_t)lbalen->lba) * be_lun->blocksize;
error = (*dev_data->csw->d_ioctl)(dev_data->cdev, FIOSEEKHOLE,
(caddr_t)&off, FREAD, curthread);
if (error == 0 && off > roff)
@@ -950,10 +947,9 @@ ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
}
}
- off >>= be_lun->blocksize_shift;
data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
scsi_u64to8b(lbalen->lba, data->descr[0].addr);
- scsi_ulto4b(MIN(UINT32_MAX, off - lbalen->lba),
+ scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->blocksize - lbalen->lba),
data->descr[0].length);
data->descr[0].status = status;
@@ -1866,7 +1862,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
struct cdevsw *devsw;
char *value;
int error, atomic, maxio, unmap;
- off_t ps, pss, po, pos, us, uss, uo, uos;
+ off_t ps, pss, po, pos, us, uss, uo, uos, tmp;
params = &be_lun->params;
@@ -1909,8 +1905,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
return (ENODEV);
}
- error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
- (caddr_t)&be_lun->blocksize, FREAD,
+ error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
curthread);
if (error) {
snprintf(req->error_str, sizeof(req->error_str),
@@ -1925,15 +1920,9 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
* the user is asking for is an even multiple of the underlying
* device's blocksize.
*/
- if ((params->blocksize_bytes != 0)
- && (params->blocksize_bytes > be_lun->blocksize)) {
- uint32_t bs_multiple, tmp_blocksize;
-
- bs_multiple = params->blocksize_bytes / be_lun->blocksize;
-
- tmp_blocksize = bs_multiple * be_lun->blocksize;
-
- if (tmp_blocksize == params->blocksize_bytes) {
+ if ((params->blocksize_bytes != 0) &&
+ (params->blocksize_bytes >= tmp)) {
+ if (params->blocksize_bytes % tmp == 0) {
be_lun->blocksize = params->blocksize_bytes;
} else {
snprintf(req->error_str, sizeof(req->error_str),
@@ -1944,17 +1933,16 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
return (EINVAL);
}
- } else if ((params->blocksize_bytes != 0)
- && (params->blocksize_bytes != be_lun->blocksize)) {
+ } else if (params->blocksize_bytes != 0) {
snprintf(req->error_str, sizeof(req->error_str),
"requested blocksize %u < backing device "
"blocksize %u", params->blocksize_bytes,
be_lun->blocksize);
return (EINVAL);
- }
+ } else
+ be_lun->blocksize = tmp;
- error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
- (caddr_t)&be_lun->size_bytes, FREAD,
+ error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&tmp, FREAD,
curthread);
if (error) {
snprintf(req->error_str, sizeof(req->error_str),
@@ -1965,7 +1953,7 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
}
if (params->lun_size_bytes != 0) {
- if (params->lun_size_bytes > be_lun->size_bytes) {
+ if (params->lun_size_bytes > tmp) {
snprintf(req->error_str, sizeof(req->error_str),
"requested LUN size %ju > backing device "
"size %ju",
@@ -1975,7 +1963,8 @@ ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
}
be_lun->size_bytes = params->lun_size_bytes;
- }
+ } else
+ be_lun->size_bytes = tmp;
error = devsw->d_ioctl(dev, DIOCGSTRIPESIZE,
(caddr_t)&ps, FREAD, curthread);
@@ -2160,14 +2149,8 @@ ctl_be_block_open(struct ctl_be_block_softc *softc,
}
VOP_UNLOCK(be_lun->vn, 0);
- if (error != 0) {
+ if (error != 0)
ctl_be_block_close(be_lun);
- return (error);
- }
-
- be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
- be_lun->size_blocks = be_lun->size_bytes >> be_lun->blocksize_shift;
-
return (0);
}
@@ -2224,10 +2207,14 @@ ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
goto bailout_error;
}
be_lun->dev_path = strdup(value, M_CTLBLK);
- be_lun->blocksize = 512;
- be_lun->blocksize_shift = fls(be_lun->blocksize) - 1;
+ be_lun->size_bytes = params->lun_size_bytes;
+ if (params->blocksize_bytes != 0)
+ be_lun->blocksize = params->blocksize_bytes;
+ else
+ be_lun->blocksize = 512;
retval = ctl_be_block_open(softc, be_lun, req);
+ be_lun->size_blocks = be_lun->size_bytes / be_lun->blocksize;
if (retval != 0) {
retval = 0;
req->status = CTL_LUN_WARNING;
@@ -2652,10 +2639,9 @@ ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
error = ctl_be_block_modify_file(be_lun, req);
else
error = EINVAL;
+ be_lun->size_blocks = be_lun->size_bytes / be_lun->blocksize;
if (error == 0 && be_lun->size_bytes != oldsize) {
- be_lun->size_blocks = be_lun->size_bytes >>
- be_lun->blocksize_shift;
/*
* The maximum LBA is the size - 1.
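The pattern throughout this file: lba << blocksize_shift and bytes >> blocksize_shift become multiplication and division by blocksize. The shift form only works when the blocksize is a power of two (blocksize_shift was derived as fls(blocksize) - 1); plain division gives the same answer for 512- and 4096-byte sectors and also handles sizes such as 520-byte protection-information sectors. A quick userland check of the equivalence, using fls() from FreeBSD's strings.h as in the removed code:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>	/* fls() */

    int
    main(void)
    {
    	uint64_t bytes = 1048576;
    	uint32_t bs = 4096;
    	int shift = fls(bs) - 1;	/* 12 for a 4096-byte sector */

    	/* Power-of-two blocksize: shift and division agree. */
    	assert((bytes >> shift) == (bytes / bs));	/* 256 blocks */

    	/* 520-byte sectors: only the division form is meaningful. */
    	bs = 520;
    	printf("%ju bytes = %ju blocks of %u\n",
    	    (uintmax_t)bytes, (uintmax_t)(bytes / bs), bs);
    	return (0);
    }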
diff --git a/sys/cam/ctl/ctl_backend_ramdisk.c b/sys/cam/ctl/ctl_backend_ramdisk.c
index ad90241..211738b 100644
--- a/sys/cam/ctl/ctl_backend_ramdisk.c
+++ b/sys/cam/ctl/ctl_backend_ramdisk.c
@@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_error.h>
diff --git a/sys/cam/ctl/ctl_cmd_table.c b/sys/cam/ctl/ctl_cmd_table.c
index 08ff88a..9a7d70e 100644
--- a/sys/cam/ctl/ctl_cmd_table.c
+++ b/sys/cam/ctl/ctl_cmd_table.c
@@ -52,7 +52,6 @@
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_private.h>
diff --git a/sys/cam/ctl/ctl_error.c b/sys/cam/ctl/ctl_error.c
index d4d7f79..4b41331 100644
--- a/sys/cam/ctl/ctl_error.c
+++ b/sys/cam/ctl/ctl_error.c
@@ -57,7 +57,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
#include <cam/ctl/ctl_error.h>
diff --git a/sys/cam/ctl/ctl_frontend.c b/sys/cam/ctl/ctl_frontend.c
index e22b9d4..34baf44 100644
--- a/sys/cam/ctl/ctl_frontend.c
+++ b/sys/cam/ctl/ctl_frontend.c
@@ -55,7 +55,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_backend.h>
/* XXX KDM move defines from ctl_ioctl.h to somewhere else */
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cam/ctl/ctl_frontend_cam_sim.c b/sys/cam/ctl/ctl_frontend_cam_sim.c
index 3abc572..97b361a 100644
--- a/sys/cam/ctl/ctl_frontend_cam_sim.c
+++ b/sys/cam/ctl/ctl_frontend_cam_sim.c
@@ -64,7 +64,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_debug.h>
#define io_ptr spriv_ptr1
diff --git a/sys/cam/ctl/ctl_frontend_internal.c b/sys/cam/ctl/ctl_frontend_internal.c
deleted file mode 100644
index 4768292..0000000
--- a/sys/cam/ctl/ctl_frontend_internal.c
+++ /dev/null
@@ -1,1612 +0,0 @@
-/*-
- * Copyright (c) 2004, 2005 Silicon Graphics International Corp.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * substantially similar to the "NO WARRANTY" disclaimer below
- * ("Disclaimer") and any redistribution must be conditioned upon
- * including a substantially similar Disclaimer requirement for further
- * binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGES.
- *
- * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_frontend_internal.c#5 $
- */
-/*
- * CTL kernel internal frontend target driver. This allows kernel-level
- * clients to send commands into CTL.
- *
- * This has elements of a FETD (e.g. it has to set tag numbers, initiator,
- * port, target, and LUN) and elements of an initiator (LUN discovery and
- * probing, error recovery, command initiation). Even though this has some
- * initiator type elements, this is not intended to be a full fledged
- * initiator layer. It is only intended to send a limited number of
- * commands to a well known target layer.
- *
- * To be able to fulfill the role of a full initiator layer, it would need
- * a whole lot more functionality.
- *
- * Author: Ken Merry <ken@FreeBSD.org>
- *
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/types.h>
-#include <sys/malloc.h>
-#include <sys/module.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/queue.h>
-#include <sys/sbuf.h>
-#include <sys/sysctl.h>
-#include <vm/uma.h>
-#include <cam/scsi/scsi_all.h>
-#include <cam/scsi/scsi_da.h>
-#include <cam/ctl/ctl_io.h>
-#include <cam/ctl/ctl.h>
-#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
-#include <cam/ctl/ctl_backend.h>
-#include <cam/ctl/ctl_ioctl.h>
-#include <cam/ctl/ctl_util.h>
-#include <cam/ctl/ctl_ha.h>
-#include <cam/ctl/ctl_private.h>
-#include <cam/ctl/ctl_debug.h>
-#include <cam/ctl/ctl_scsi_all.h>
-#include <cam/ctl/ctl_error.h>
-
-/*
- * Task structure:
- * - overall metatask, different potential metatask types (e.g. forced
- * shutdown, gentle shutdown)
- * - forced shutdown metatask:
- * - states: report luns, pending, done?
- * - list of luns pending, with the relevant I/O for that lun attached.
- * This would allow moving ahead on LUNs with no errors, and going
- * into error recovery on LUNs with problems. Per-LUN states might
- * include inquiry, stop/offline, done.
- *
- * Use LUN enable for LUN list instead of getting it manually? We'd still
- * need inquiry data for each LUN.
- *
- * How to handle processor LUN w.r.t. found/stopped counts?
- */
-#ifdef oldapi
-typedef enum {
- CFI_TASK_NONE,
- CFI_TASK_SHUTDOWN,
- CFI_TASK_STARTUP
-} cfi_tasktype;
-
-struct cfi_task_startstop {
- int total_luns;
- int luns_complete;
- int luns_failed;
- cfi_cb_t callback;
- void *callback_arg;
- /* XXX KDM add more fields here */
-};
-
-union cfi_taskinfo {
- struct cfi_task_startstop startstop;
-};
-
-struct cfi_metatask {
- cfi_tasktype tasktype;
- cfi_mt_status status;
- union cfi_taskinfo taskinfo;
- void *cfi_context;
- STAILQ_ENTRY(cfi_metatask) links;
-};
-#endif
-
-typedef enum {
- CFI_ERR_RETRY = 0x000,
- CFI_ERR_FAIL = 0x001,
- CFI_ERR_LUN_RESET = 0x002,
- CFI_ERR_MASK = 0x0ff,
- CFI_ERR_NO_DECREMENT = 0x100
-} cfi_error_action;
-
-typedef enum {
- CFI_ERR_SOFT,
- CFI_ERR_HARD
-} cfi_error_policy;
-
-typedef enum {
- CFI_LUN_INQUIRY,
- CFI_LUN_READCAPACITY,
- CFI_LUN_READCAPACITY_16,
- CFI_LUN_READY
-} cfi_lun_state;
-
-struct cfi_lun {
- int lun_id;
- struct scsi_inquiry_data inq_data;
- uint64_t num_blocks;
- uint32_t blocksize;
- int blocksize_powerof2;
- uint32_t cur_tag_num;
- cfi_lun_state state;
- struct cfi_softc *softc;
- STAILQ_HEAD(, cfi_lun_io) io_list;
- STAILQ_ENTRY(cfi_lun) links;
-};
-
-struct cfi_lun_io {
- struct cfi_lun *lun;
- struct cfi_metatask *metatask;
- cfi_error_policy policy;
- void (*done_function)(union ctl_io *io);
- union ctl_io *ctl_io;
- struct cfi_lun_io *orig_lun_io;
- STAILQ_ENTRY(cfi_lun_io) links;
-};
-
-typedef enum {
- CFI_NONE = 0x00,
- CFI_ONLINE = 0x01,
-} cfi_flags;
-
-struct cfi_softc {
- struct ctl_port port;
- char fe_name[40];
- struct mtx lock;
- cfi_flags flags;
- STAILQ_HEAD(, cfi_lun) lun_list;
- STAILQ_HEAD(, cfi_metatask) metatask_list;
-};
-
-MALLOC_DEFINE(M_CTL_CFI, "ctlcfi", "CTL CFI");
-
-static uma_zone_t cfi_lun_zone;
-static uma_zone_t cfi_metatask_zone;
-
-static struct cfi_softc fetd_internal_softc;
-
-int cfi_init(void);
-void cfi_shutdown(void) __unused;
-static void cfi_online(void *arg);
-static void cfi_offline(void *arg);
-static int cfi_lun_enable(void *arg, int lun_id);
-static int cfi_lun_disable(void *arg, int lun_id);
-static void cfi_datamove(union ctl_io *io);
-static cfi_error_action cfi_checkcond_parse(union ctl_io *io,
- struct cfi_lun_io *lun_io);
-static cfi_error_action cfi_error_parse(union ctl_io *io,
- struct cfi_lun_io *lun_io);
-static void cfi_init_io(union ctl_io *io, struct cfi_lun *lun,
- struct cfi_metatask *metatask, cfi_error_policy policy,
- int retries, struct cfi_lun_io *orig_lun_io,
- void (*done_function)(union ctl_io *io));
-static void cfi_done(union ctl_io *io);
-static void cfi_lun_probe_done(union ctl_io *io);
-static void cfi_lun_probe(struct cfi_lun *lun, int have_lock);
-static void cfi_metatask_done(struct cfi_softc *softc,
- struct cfi_metatask *metatask);
-static void cfi_metatask_bbr_errorparse(struct cfi_metatask *metatask,
- union ctl_io *io);
-static void cfi_metatask_io_done(union ctl_io *io);
-static void cfi_err_recovery_done(union ctl_io *io);
-static void cfi_lun_io_done(union ctl_io *io);
-
-static struct ctl_frontend cfi_frontend =
-{
- .name = "kernel",
- .init = cfi_init,
- .shutdown = cfi_shutdown,
-};
-CTL_FRONTEND_DECLARE(ctlcfi, cfi_frontend);
-
-int
-cfi_init(void)
-{
- struct cfi_softc *softc;
- struct ctl_port *port;
- int retval;
-
- softc = &fetd_internal_softc;
-
- port = &softc->port;
-
- retval = 0;
-
- if (sizeof(struct cfi_lun_io) > CTL_PORT_PRIV_SIZE) {
- printf("%s: size of struct cfi_lun_io %zd > "
- "CTL_PORT_PRIV_SIZE %d\n", __func__,
- sizeof(struct cfi_lun_io),
- CTL_PORT_PRIV_SIZE);
- }
- memset(softc, 0, sizeof(*softc));
-
- mtx_init(&softc->lock, "CTL frontend mutex", NULL, MTX_DEF);
- STAILQ_INIT(&softc->lun_list);
- STAILQ_INIT(&softc->metatask_list);
- sprintf(softc->fe_name, "kernel");
- port->frontend = &cfi_frontend;
- port->port_type = CTL_PORT_INTERNAL;
- port->num_requested_ctl_io = 100;
- port->port_name = softc->fe_name;
- port->port_online = cfi_online;
- port->port_offline = cfi_offline;
- port->onoff_arg = softc;
- port->lun_enable = cfi_lun_enable;
- port->lun_disable = cfi_lun_disable;
- port->targ_lun_arg = softc;
- port->fe_datamove = cfi_datamove;
- port->fe_done = cfi_done;
- port->max_targets = 15;
- port->max_target_id = 15;
-
- if (ctl_port_register(port) != 0)
- {
- printf("%s: internal frontend registration failed\n", __func__);
- return (0);
- }
-
- cfi_lun_zone = uma_zcreate("cfi_lun", sizeof(struct cfi_lun),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- cfi_metatask_zone = uma_zcreate("cfi_metatask", sizeof(struct cfi_metatask),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-
- return (0);
-}
-
-void
-cfi_shutdown(void)
-{
- struct cfi_softc *softc;
-
- softc = &fetd_internal_softc;
-
- /*
- * XXX KDM need to clear out any I/O pending on each LUN.
- */
- if (ctl_port_deregister(&softc->port) != 0)
- printf("%s: ctl_frontend_deregister() failed\n", __func__);
-
- uma_zdestroy(cfi_lun_zone);
- uma_zdestroy(cfi_metatask_zone);
-}
-
-static void
-cfi_online(void *arg)
-{
- struct cfi_softc *softc;
- struct cfi_lun *lun;
-
- softc = (struct cfi_softc *)arg;
-
- softc->flags |= CFI_ONLINE;
-
- /*
- * Go through and kick off the probe for each lun. Should we check
- * the LUN flags here to determine whether or not to probe it?
- */
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links)
- cfi_lun_probe(lun, /*have_lock*/ 1);
- mtx_unlock(&softc->lock);
-}
-
-static void
-cfi_offline(void *arg)
-{
- struct cfi_softc *softc;
-
- softc = (struct cfi_softc *)arg;
-
- softc->flags &= ~CFI_ONLINE;
-}
-
-static int
-cfi_lun_enable(void *arg, int lun_id)
-{
- struct cfi_softc *softc;
- struct cfi_lun *lun;
- int found;
-
- softc = (struct cfi_softc *)arg;
-
- found = 0;
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->lun_id == lun_id) {
- found = 1;
- break;
- }
- }
- mtx_unlock(&softc->lock);
-
- /*
- * If we already have this target/LUN, there is no reason to add
- * it to our lists again.
- */
- if (found != 0)
- return (0);
-
- lun = uma_zalloc(cfi_lun_zone, M_NOWAIT | M_ZERO);
- if (lun == NULL) {
- printf("%s: unable to allocate LUN structure\n", __func__);
- return (1);
- }
-
- lun->lun_id = lun_id;
- lun->cur_tag_num = 0;
- lun->state = CFI_LUN_INQUIRY;
- lun->softc = softc;
- STAILQ_INIT(&lun->io_list);
-
- mtx_lock(&softc->lock);
- STAILQ_INSERT_TAIL(&softc->lun_list, lun, links);
- mtx_unlock(&softc->lock);
-
- cfi_lun_probe(lun, /*have_lock*/ 0);
-
- return (0);
-}
-
-static int
-cfi_lun_disable(void *arg, int lun_id)
-{
- struct cfi_softc *softc;
- struct cfi_lun *lun;
- int found;
-
- softc = (struct cfi_softc *)arg;
-
- found = 0;
-
- /*
- * XXX KDM need to do an invalidate and then a free when any
- * pending I/O has completed. Or do we? CTL won't free a LUN
- * while any I/O is pending. So we won't get this notification
- * unless any I/O we have pending on a LUN has completed.
- */
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->lun_id == lun_id) {
- found = 1;
- break;
- }
- }
- if (found != 0)
- STAILQ_REMOVE(&softc->lun_list, lun, cfi_lun, links);
-
- mtx_unlock(&softc->lock);
-
- if (found == 0) {
- printf("%s: can't find lun %d\n", __func__, lun_id);
- return (1);
- }
-
- uma_zfree(cfi_lun_zone, lun);
-
- return (0);
-}
-
-static void
-cfi_datamove(union ctl_io *io)
-{
- struct ctl_sg_entry *ext_sglist, *kern_sglist;
- struct ctl_sg_entry ext_entry, kern_entry;
- int ext_sglen, ext_sg_entries, kern_sg_entries;
- int ext_sg_start, ext_offset;
- int len_to_copy, len_copied;
- int kern_watermark, ext_watermark;
- int ext_sglist_malloced;
- struct ctl_scsiio *ctsio;
- int i, j;
-
- ext_sglist_malloced = 0;
- ext_sg_start = 0;
- ext_offset = 0;
- ext_sglist = NULL;
-
- CTL_DEBUG_PRINT(("%s\n", __func__));
-
- ctsio = &io->scsiio;
-
- /*
- * If this is the case, we're probably doing a BBR read and don't
- * actually need to transfer the data. This will effectively
- * bit-bucket the data.
- */
- if (ctsio->ext_data_ptr == NULL)
- goto bailout;
-
- /*
- * To simplify things here, if we have a single buffer, stick it in
- * a S/G entry and just make it a single entry S/G list.
- */
- if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
- int len_seen;
-
- ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
-
- ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL_CFI,
- M_WAITOK);
- ext_sglist_malloced = 1;
- if (memcpy(ext_sglist, ctsio->ext_data_ptr, ext_sglen) != 0) {
- ctl_set_internal_failure(ctsio,
- /*sks_valid*/ 0,
- /*retry_count*/ 0);
- goto bailout;
- }
- ext_sg_entries = ctsio->ext_sg_entries;
- len_seen = 0;
- for (i = 0; i < ext_sg_entries; i++) {
- if ((len_seen + ext_sglist[i].len) >=
- ctsio->ext_data_filled) {
- ext_sg_start = i;
- ext_offset = ctsio->ext_data_filled - len_seen;
- break;
- }
- len_seen += ext_sglist[i].len;
- }
- } else {
- ext_sglist = &ext_entry;
- ext_sglist->addr = ctsio->ext_data_ptr;
- ext_sglist->len = ctsio->ext_data_len;
- ext_sg_entries = 1;
- ext_sg_start = 0;
- ext_offset = ctsio->ext_data_filled;
- }
-
- if (ctsio->kern_sg_entries > 0) {
- kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
- kern_sg_entries = ctsio->kern_sg_entries;
- } else {
- kern_sglist = &kern_entry;
- kern_sglist->addr = ctsio->kern_data_ptr;
- kern_sglist->len = ctsio->kern_data_len;
- kern_sg_entries = 1;
- }
-
-
- kern_watermark = 0;
- ext_watermark = ext_offset;
- len_copied = 0;
- for (i = ext_sg_start, j = 0;
- i < ext_sg_entries && j < kern_sg_entries;) {
- uint8_t *ext_ptr, *kern_ptr;
-
- len_to_copy = MIN(ext_sglist[i].len - ext_watermark,
- kern_sglist[j].len - kern_watermark);
-
- ext_ptr = (uint8_t *)ext_sglist[i].addr;
- ext_ptr = ext_ptr + ext_watermark;
- if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
- /*
- * XXX KDM fix this!
- */
- panic("need to implement bus address support");
-#if 0
- kern_ptr = bus_to_virt(kern_sglist[j].addr);
-#endif
- } else
- kern_ptr = (uint8_t *)kern_sglist[j].addr;
- kern_ptr = kern_ptr + kern_watermark;
-
- kern_watermark += len_to_copy;
- ext_watermark += len_to_copy;
-
- if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
- CTL_FLAG_DATA_IN) {
- CTL_DEBUG_PRINT(("%s: copying %d bytes to user\n",
- __func__, len_to_copy));
- CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__,
- kern_ptr, ext_ptr));
- memcpy(ext_ptr, kern_ptr, len_to_copy);
- } else {
- CTL_DEBUG_PRINT(("%s: copying %d bytes from user\n",
- __func__, len_to_copy));
- CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__,
- ext_ptr, kern_ptr));
- memcpy(kern_ptr, ext_ptr, len_to_copy);
- }
-
- len_copied += len_to_copy;
-
- if (ext_sglist[i].len == ext_watermark) {
- i++;
- ext_watermark = 0;
- }
-
- if (kern_sglist[j].len == kern_watermark) {
- j++;
- kern_watermark = 0;
- }
- }
-
- ctsio->ext_data_filled += len_copied;
-
- CTL_DEBUG_PRINT(("%s: ext_sg_entries: %d, kern_sg_entries: %d\n",
- __func__, ext_sg_entries, kern_sg_entries));
- CTL_DEBUG_PRINT(("%s: ext_data_len = %d, kern_data_len = %d\n",
- __func__, ctsio->ext_data_len, ctsio->kern_data_len));
-
-
- /* XXX KDM set residual?? */
-bailout:
-
- if (ext_sglist_malloced != 0)
- free(ext_sglist, M_CTL_CFI);
-
- io->scsiio.be_move_done(io);
-
- return;
-}
-
-/*
- * For any sort of check condition, busy, etc., we just retry. We do not
- * decrement the retry count for unit attention type errors. These are
- * normal, and we want to save the retry count for "real" errors. Otherwise,
- * we could end up with situations where a command will succeed in some
- * situations and fail in others, depending on whether a unit attention is
- * pending. Also, some of our error recovery actions, most notably the
- * LUN reset action, will cause a unit attention.
- *
- * We can add more detail here later if necessary.
- */
-static cfi_error_action
-cfi_checkcond_parse(union ctl_io *io, struct cfi_lun_io *lun_io)
-{
- cfi_error_action error_action;
- int error_code, sense_key, asc, ascq;
-
- /*
- * Default to retrying the command.
- */
- error_action = CFI_ERR_RETRY;
-
- scsi_extract_sense_len(&io->scsiio.sense_data,
- io->scsiio.sense_len,
- &error_code,
- &sense_key,
- &asc,
- &ascq,
- /*show_errors*/ 1);
-
- switch (error_code) {
- case SSD_DEFERRED_ERROR:
- case SSD_DESC_DEFERRED_ERROR:
- error_action |= CFI_ERR_NO_DECREMENT;
- break;
- case SSD_CURRENT_ERROR:
- case SSD_DESC_CURRENT_ERROR:
- default: {
- switch (sense_key) {
- case SSD_KEY_UNIT_ATTENTION:
- error_action |= CFI_ERR_NO_DECREMENT;
- break;
- case SSD_KEY_HARDWARE_ERROR:
- /*
- * This is our generic "something bad happened"
- * error code. It often isn't recoverable.
- */
- if ((asc == 0x44) && (ascq == 0x00))
- error_action = CFI_ERR_FAIL;
- break;
- case SSD_KEY_NOT_READY:
- /*
- * If the LUN is powered down, there likely isn't
- * much point in retrying right now.
- */
- if ((asc == 0x04) && (ascq == 0x02))
- error_action = CFI_ERR_FAIL;
- /*
- * If the LUN is offline, there probably isn't much
- * point in retrying, either.
- */
- if ((asc == 0x04) && (ascq == 0x03))
- error_action = CFI_ERR_FAIL;
- break;
- }
- }
- }
-
- return (error_action);
-}
-
-static cfi_error_action
-cfi_error_parse(union ctl_io *io, struct cfi_lun_io *lun_io)
-{
- cfi_error_action error_action;
-
- error_action = CFI_ERR_RETRY;
-
- switch (io->io_hdr.io_type) {
- case CTL_IO_SCSI:
- switch (io->io_hdr.status & CTL_STATUS_MASK) {
- case CTL_SCSI_ERROR:
- switch (io->scsiio.scsi_status) {
- case SCSI_STATUS_RESERV_CONFLICT:
- /*
- * For a reservation conflict, we'll usually
- * want the hard error recovery policy, so
- * we'll reset the LUN.
- */
- if (lun_io->policy == CFI_ERR_HARD)
- error_action =
- CFI_ERR_LUN_RESET;
- else
- error_action =
- CFI_ERR_RETRY;
- break;
- case SCSI_STATUS_CHECK_COND:
- default:
- error_action = cfi_checkcond_parse(io, lun_io);
- break;
- }
- break;
- default:
- error_action = CFI_ERR_RETRY;
- break;
- }
- break;
- case CTL_IO_TASK:
- /*
- * In theory task management commands shouldn't fail...
- */
- error_action = CFI_ERR_RETRY;
- break;
- default:
- printf("%s: invalid ctl_io type %d\n", __func__,
- io->io_hdr.io_type);
- panic("%s: invalid ctl_io type %d\n", __func__,
- io->io_hdr.io_type);
- break;
- }
-
- return (error_action);
-}
-
-static void
-cfi_init_io(union ctl_io *io, struct cfi_lun *lun,
- struct cfi_metatask *metatask, cfi_error_policy policy, int retries,
- struct cfi_lun_io *orig_lun_io,
- void (*done_function)(union ctl_io *io))
-{
- struct cfi_lun_io *lun_io;
-
- io->io_hdr.nexus.initid.id = 7;
- io->io_hdr.nexus.targ_port = lun->softc->port.targ_port;
- io->io_hdr.nexus.targ_target.id = 0;
- io->io_hdr.nexus.targ_lun = lun->lun_id;
- io->io_hdr.retries = retries;
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = lun_io;
- lun_io->lun = lun;
- lun_io->metatask = metatask;
- lun_io->ctl_io = io;
- lun_io->policy = policy;
- lun_io->orig_lun_io = orig_lun_io;
- lun_io->done_function = done_function;
- /*
- * We only set the tag number for SCSI I/Os. For task management
- * commands, the tag number is only really needed for aborts, so
- * the caller can set it if necessary.
- */
- switch (io->io_hdr.io_type) {
- case CTL_IO_SCSI:
- io->scsiio.tag_num = lun->cur_tag_num++;
- break;
- case CTL_IO_TASK:
- default:
- break;
- }
-}
-
-static void
-cfi_done(union ctl_io *io)
-{
- struct cfi_lun_io *lun_io;
- struct cfi_softc *softc;
- struct cfi_lun *lun;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- lun = lun_io->lun;
- softc = lun->softc;
-
- /*
- * Very minimal retry logic. We basically retry if we got an error
- * back, and the retry count is greater than 0. If we ever want
- * more sophisticated initiator type behavior, the CAM error
- * recovery code in ../common might be helpful.
- */
- if (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)
- && (io->io_hdr.retries > 0)) {
- ctl_io_status old_status;
- cfi_error_action error_action;
-
- error_action = cfi_error_parse(io, lun_io);
-
- switch (error_action & CFI_ERR_MASK) {
- case CFI_ERR_FAIL:
- goto done;
- break; /* NOTREACHED */
- case CFI_ERR_LUN_RESET: {
- union ctl_io *new_io;
- struct cfi_lun_io *new_lun_io;
-
- new_io = ctl_alloc_io(softc->port.ctl_pool_ref);
- ctl_zero_io(new_io);
-
- new_io->io_hdr.io_type = CTL_IO_TASK;
- new_io->taskio.task_action = CTL_TASK_LUN_RESET;
-
- cfi_init_io(new_io,
- /*lun*/ lun_io->lun,
- /*metatask*/ NULL,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 0,
- /*orig_lun_io*/lun_io,
- /*done_function*/ cfi_err_recovery_done);
-
-
- new_lun_io = (struct cfi_lun_io *)
- new_io->io_hdr.port_priv;
-
- mtx_lock(&lun->softc->lock);
- STAILQ_INSERT_TAIL(&lun->io_list, new_lun_io, links);
- mtx_unlock(&lun->softc->lock);
-
- io = new_io;
- break;
- }
- case CFI_ERR_RETRY:
- default:
- if ((error_action & CFI_ERR_NO_DECREMENT) == 0)
- io->io_hdr.retries--;
- break;
- }
-
- old_status = io->io_hdr.status;
- io->io_hdr.status = CTL_STATUS_NONE;
-#if 0
- io->io_hdr.flags &= ~CTL_FLAG_ALREADY_DONE;
-#endif
- io->io_hdr.flags &= ~CTL_FLAG_ABORT;
- io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC;
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- io->io_hdr.status = old_status;
- } else
- return;
- }
-done:
- lun_io->done_function(io);
-}
-
-static void
-cfi_lun_probe_done(union ctl_io *io)
-{
- struct cfi_lun *lun;
- struct cfi_lun_io *lun_io;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
- lun = lun_io->lun;
-
- switch (lun->state) {
- case CFI_LUN_INQUIRY: {
- if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) {
- /* print out something here?? */
- printf("%s: LUN %d probe failed because inquiry "
- "failed\n", __func__, lun->lun_id);
- ctl_io_error_print(io, NULL);
- } else {
-
- if (SID_TYPE(&lun->inq_data) != T_DIRECT) {
- char path_str[40];
-
- lun->state = CFI_LUN_READY;
- ctl_scsi_path_string(io, path_str,
- sizeof(path_str));
- printf("%s", path_str);
- scsi_print_inquiry(&lun->inq_data);
- } else {
- lun->state = CFI_LUN_READCAPACITY;
- cfi_lun_probe(lun, /*have_lock*/ 0);
- }
- }
- mtx_lock(&lun->softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- ctl_free_io(io);
- break;
- }
- case CFI_LUN_READCAPACITY:
- case CFI_LUN_READCAPACITY_16: {
- uint64_t maxlba;
- uint32_t blocksize;
-
- maxlba = 0;
- blocksize = 0;
-
- if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) {
- printf("%s: LUN %d probe failed because READ CAPACITY "
- "failed\n", __func__, lun->lun_id);
- ctl_io_error_print(io, NULL);
- } else {
-
- if (lun->state == CFI_LUN_READCAPACITY) {
- struct scsi_read_capacity_data *rdcap;
-
- rdcap = (struct scsi_read_capacity_data *)
- io->scsiio.ext_data_ptr;
-
- maxlba = scsi_4btoul(rdcap->addr);
- blocksize = scsi_4btoul(rdcap->length);
- if (blocksize == 0) {
- printf("%s: LUN %d has invalid "
- "blocksize 0, probe aborted\n",
- __func__, lun->lun_id);
- } else if (maxlba == 0xffffffff) {
- lun->state = CFI_LUN_READCAPACITY_16;
- cfi_lun_probe(lun, /*have_lock*/ 0);
- } else
- lun->state = CFI_LUN_READY;
- } else {
- struct scsi_read_capacity_data_long *rdcap_long;
-
- rdcap_long = (struct
- scsi_read_capacity_data_long *)
- io->scsiio.ext_data_ptr;
- maxlba = scsi_8btou64(rdcap_long->addr);
- blocksize = scsi_4btoul(rdcap_long->length);
-
- if (blocksize == 0) {
- printf("%s: LUN %d has invalid "
- "blocksize 0, probe aborted\n",
- __func__, lun->lun_id);
- } else
- lun->state = CFI_LUN_READY;
- }
- }
-
- if (lun->state == CFI_LUN_READY) {
- char path_str[40];
-
- lun->num_blocks = maxlba + 1;
- lun->blocksize = blocksize;
-
- /*
- * If this is true, the blocksize is a power of 2.
- * We already checked for 0 above.
- */
- if (((blocksize - 1) & blocksize) == 0) {
- int i;
-
- for (i = 0; i < 32; i++) {
- if ((blocksize & (1 << i)) != 0) {
- lun->blocksize_powerof2 = i;
- break;
- }
- }
- }
- ctl_scsi_path_string(io, path_str,sizeof(path_str));
- printf("%s", path_str);
- scsi_print_inquiry(&lun->inq_data);
- printf("%s %ju blocks, blocksize %d\n", path_str,
- (uintmax_t)maxlba + 1, blocksize);
- }
- mtx_lock(&lun->softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- free(io->scsiio.ext_data_ptr, M_CTL_CFI);
- ctl_free_io(io);
- break;
- }
- case CFI_LUN_READY:
- default:
- mtx_lock(&lun->softc->lock);
- /* How did we get here?? */
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- ctl_free_io(io);
- break;
- }
-}
-
-static void
-cfi_lun_probe(struct cfi_lun *lun, int have_lock)
-{
-
- if (have_lock == 0)
- mtx_lock(&lun->softc->lock);
- if ((lun->softc->flags & CFI_ONLINE) == 0) {
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
- return;
- }
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
-
- switch (lun->state) {
- case CFI_LUN_INQUIRY: {
- struct cfi_lun_io *lun_io;
- union ctl_io *io;
-
- io = ctl_alloc_io(lun->softc->port.ctl_pool_ref);
- ctl_scsi_inquiry(io,
- /*data_ptr*/(uint8_t *)&lun->inq_data,
- /*data_len*/ sizeof(lun->inq_data),
- /*byte2*/ 0,
- /*page_code*/ 0,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
-
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ NULL,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 5,
- /*orig_lun_io*/ NULL,
- /*done_function*/
- cfi_lun_probe_done);
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
-
- if (have_lock == 0)
- mtx_lock(&lun->softc->lock);
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io,
- cfi_lun_io, links);
- ctl_free_io(io);
- }
- break;
- }
- case CFI_LUN_READCAPACITY:
- case CFI_LUN_READCAPACITY_16: {
- struct cfi_lun_io *lun_io;
- uint8_t *dataptr;
- union ctl_io *io;
-
- io = ctl_alloc_io(lun->softc->port.ctl_pool_ref);
-
- dataptr = malloc(sizeof(struct scsi_read_capacity_data_long),
- M_CTL_CFI, M_NOWAIT);
- if (dataptr == NULL) {
- printf("%s: unable to allocate SCSI read capacity "
- "buffer for lun %d\n", __func__, lun->lun_id);
- return;
- }
- if (lun->state == CFI_LUN_READCAPACITY) {
- ctl_scsi_read_capacity(io,
- /*data_ptr*/ dataptr,
- /*data_len*/
- sizeof(struct scsi_read_capacity_data_long),
- /*addr*/ 0,
- /*reladr*/ 0,
- /*pmi*/ 0,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
- } else {
- ctl_scsi_read_capacity_16(io,
- /*data_ptr*/ dataptr,
- /*data_len*/
- sizeof(struct scsi_read_capacity_data_long),
- /*addr*/ 0,
- /*reladr*/ 0,
- /*pmi*/ 0,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
- }
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ NULL,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 7,
- /*orig_lun_io*/ NULL,
- /*done_function*/ cfi_lun_probe_done);
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
-
- if (have_lock == 0)
- mtx_lock(&lun->softc->lock);
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
- if (have_lock == 0)
- mtx_unlock(&lun->softc->lock);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io,
- cfi_lun_io, links);
- free(dataptr, M_CTL_CFI);
- ctl_free_io(io);
- }
- break;
- }
- case CFI_LUN_READY:
- default:
- /* Why were we called? */
- break;
- }
-}
-
-static void
-cfi_metatask_done(struct cfi_softc *softc, struct cfi_metatask *metatask)
-{
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&softc->metatask_list, metatask, cfi_metatask, links);
- mtx_unlock(&softc->lock);
-
- /*
- * Return status to the caller. Caller allocated storage, and is
- * responsible for calling cfi_free_metatask to release it once
- * they've seen the status.
- */
- metatask->callback(metatask->callback_arg, metatask);
-}
-
-static void
-cfi_metatask_bbr_errorparse(struct cfi_metatask *metatask, union ctl_io *io)
-{
- int error_code, sense_key, asc, ascq;
-
- if (metatask->tasktype != CFI_TASK_BBRREAD)
- return;
-
- if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) {
- metatask->status = CFI_MT_SUCCESS;
- metatask->taskinfo.bbrread.status = CFI_BBR_SUCCESS;
- return;
- }
-
- if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SCSI_ERROR) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_ERROR;
- return;
- }
-
- metatask->taskinfo.bbrread.scsi_status = io->scsiio.scsi_status;
- memcpy(&metatask->taskinfo.bbrread.sense_data, &io->scsiio.sense_data,
- MIN(sizeof(metatask->taskinfo.bbrread.sense_data),
- sizeof(io->scsiio.sense_data)));
-
- if (io->scsiio.scsi_status == SCSI_STATUS_RESERV_CONFLICT) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_RESERV_CONFLICT;
- return;
- }
-
- if (io->scsiio.scsi_status != SCSI_STATUS_CHECK_COND) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR;
- return;
- }
-
- scsi_extract_sense_len(&io->scsiio.sense_data,
- io->scsiio.sense_len,
- &error_code,
- &sense_key,
- &asc,
- &ascq,
- /*show_errors*/ 1);
-
- switch (error_code) {
- case SSD_DEFERRED_ERROR:
- case SSD_DESC_DEFERRED_ERROR:
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR;
- break;
- case SSD_CURRENT_ERROR:
- case SSD_DESC_CURRENT_ERROR:
- default: {
- struct scsi_sense_data *sense;
-
- sense = &io->scsiio.sense_data;
-
- if ((asc == 0x04) && (ascq == 0x02)) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_LUN_STOPPED;
- } else if ((asc == 0x04) && (ascq == 0x03)) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_LUN_OFFLINE_CTL;
- } else if ((asc == 0x44) && (ascq == 0x00)) {
-#ifdef NEEDTOPORT
- if (sense->sense_key_spec[0] & SSD_SCS_VALID) {
- uint16_t retry_count;
-
- retry_count = sense->sense_key_spec[1] << 8 |
- sense->sense_key_spec[2];
- if (((retry_count & 0xf000) == CSC_RAIDCORE)
- && ((retry_count & 0x0f00) == CSC_SHELF_SW)
- && ((retry_count & 0xff) ==
- RC_STS_DEVICE_OFFLINE)) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_LUN_OFFLINE_RC;
- } else {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_SCSI_ERROR;
- }
- } else {
-#endif /* NEEDTOPORT */
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_SCSI_ERROR;
-#ifdef NEEDTOPORT
- }
-#endif
- } else {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_SCSI_ERROR;
- }
- break;
- }
- }
-}
-
-static void
-cfi_metatask_io_done(union ctl_io *io)
-{
- struct cfi_lun_io *lun_io;
- struct cfi_metatask *metatask;
- struct cfi_softc *softc;
- struct cfi_lun *lun;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
-
- lun = lun_io->lun;
- softc = lun->softc;
-
- metatask = lun_io->metatask;
-
- switch (metatask->tasktype) {
- case CFI_TASK_STARTUP:
- case CFI_TASK_SHUTDOWN: {
- int failed, done, is_start;
-
- failed = 0;
- done = 0;
- if (metatask->tasktype == CFI_TASK_STARTUP)
- is_start = 1;
- else
- is_start = 0;
-
- mtx_lock(&softc->lock);
- if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)
- metatask->taskinfo.startstop.luns_complete++;
- else {
- metatask->taskinfo.startstop.luns_failed++;
- failed = 1;
- }
- if ((metatask->taskinfo.startstop.luns_complete +
- metatask->taskinfo.startstop.luns_failed) >=
- metatask->taskinfo.startstop.total_luns)
- done = 1;
-
- mtx_unlock(&softc->lock);
-
- if (failed != 0) {
- printf("%s: LUN %d %s request failed\n", __func__,
- lun_io->lun->lun_id, (is_start == 1) ? "start" :
- "stop");
- ctl_io_error_print(io, &lun_io->lun->inq_data);
- }
- if (done != 0) {
- if (metatask->taskinfo.startstop.luns_failed > 0)
- metatask->status = CFI_MT_ERROR;
- else
- metatask->status = CFI_MT_SUCCESS;
- cfi_metatask_done(softc, metatask);
- }
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&softc->lock);
-
- ctl_free_io(io);
- break;
- }
- case CFI_TASK_BBRREAD: {
- /*
- * Translate the SCSI error into an enumeration.
- */
- cfi_metatask_bbr_errorparse(metatask, io);
-
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&softc->lock);
-
- ctl_free_io(io);
-
- cfi_metatask_done(softc, metatask);
- break;
- }
- default:
- /*
- * This shouldn't happen.
- */
- mtx_lock(&softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&softc->lock);
-
- ctl_free_io(io);
- break;
- }
-}
-
-static void
-cfi_err_recovery_done(union ctl_io *io)
-{
- struct cfi_lun_io *lun_io, *orig_lun_io;
- struct cfi_lun *lun;
- union ctl_io *orig_io;
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
- orig_lun_io = lun_io->orig_lun_io;
- orig_io = orig_lun_io->ctl_io;
- lun = lun_io->lun;
-
- if (io->io_hdr.status != CTL_SUCCESS) {
- printf("%s: error recovery action failed. Original "
- "error:\n", __func__);
-
- ctl_io_error_print(orig_lun_io->ctl_io, &lun->inq_data);
-
- printf("%s: error from error recovery action:\n", __func__);
-
- ctl_io_error_print(io, &lun->inq_data);
-
- printf("%s: trying original command again...\n", __func__);
- }
-
- mtx_lock(&lun->softc->lock);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- mtx_unlock(&lun->softc->lock);
- ctl_free_io(io);
-
- orig_io->io_hdr.retries--;
- orig_io->io_hdr.status = CTL_STATUS_NONE;
-
- if (ctl_queue(orig_io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n", __func__);
- STAILQ_REMOVE(&lun->io_list, orig_lun_io,
- cfi_lun_io, links);
- ctl_free_io(orig_io);
- }
-}
-
-static void
-cfi_lun_io_done(union ctl_io *io)
-{
- struct cfi_lun *lun;
- struct cfi_lun_io *lun_io;
-
- lun_io = (struct cfi_lun_io *)
- io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
- lun = lun_io->lun;
-
- if (lun_io->metatask == NULL) {
- printf("%s: I/O has no metatask pointer, discarding\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- ctl_free_io(io);
- return;
- }
- cfi_metatask_io_done(io);
-}
-
-void
-cfi_action(struct cfi_metatask *metatask)
-{
- struct cfi_softc *softc;
-
- softc = &fetd_internal_softc;
-
- mtx_lock(&softc->lock);
-
- STAILQ_INSERT_TAIL(&softc->metatask_list, metatask, links);
-
- if ((softc->flags & CFI_ONLINE) == 0) {
- mtx_unlock(&softc->lock);
- metatask->status = CFI_MT_PORT_OFFLINE;
- cfi_metatask_done(softc, metatask);
- return;
- } else
- mtx_unlock(&softc->lock);
-
- switch (metatask->tasktype) {
- case CFI_TASK_STARTUP:
- case CFI_TASK_SHUTDOWN: {
- union ctl_io *io;
- int da_luns, ios_allocated, do_start;
- struct cfi_lun *lun;
- STAILQ_HEAD(, ctl_io_hdr) tmp_io_list;
-
- da_luns = 0;
- ios_allocated = 0;
- STAILQ_INIT(&tmp_io_list);
-
- if (metatask->tasktype == CFI_TASK_STARTUP)
- do_start = 1;
- else
- do_start = 0;
-
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->state != CFI_LUN_READY)
- continue;
-
- if (SID_TYPE(&lun->inq_data) != T_DIRECT)
- continue;
- da_luns++;
- io = ctl_alloc_io_nowait(softc->port.ctl_pool_ref);
- if (io != NULL) {
- ios_allocated++;
- STAILQ_INSERT_TAIL(&tmp_io_list, &io->io_hdr,
- links);
- }
- }
-
- if (ios_allocated < da_luns) {
- printf("%s: error allocating ctl_io for %s\n",
- __func__, (do_start == 1) ? "startup" :
- "shutdown");
- da_luns = ios_allocated;
- }
-
- metatask->taskinfo.startstop.total_luns = da_luns;
-
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- struct cfi_lun_io *lun_io;
-
- if (lun->state != CFI_LUN_READY)
- continue;
-
- if (SID_TYPE(&lun->inq_data) != T_DIRECT)
- continue;
-
- io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list);
- if (io == NULL)
- break;
-
- STAILQ_REMOVE(&tmp_io_list, &io->io_hdr, ctl_io_hdr,
- links);
-
- ctl_scsi_start_stop(io,
- /*start*/ do_start,
- /*load_eject*/ 0,
- /*immediate*/ 0,
- /*power_conditions*/
- SSS_PC_START_VALID,
- /*onoffline*/ 1,
- /*ctl_tag_type*/ CTL_TAG_ORDERED,
- /*control*/ 0);
-
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ metatask,
- /*policy*/ CFI_ERR_HARD,
- /*retries*/ 3,
- /*orig_lun_io*/ NULL,
- /*done_function*/ cfi_lun_io_done);
-
- lun_io = (struct cfi_lun_io *) io->io_hdr.port_priv;
-
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io,
- cfi_lun_io, links);
- ctl_free_io(io);
- metatask->taskinfo.startstop.total_luns--;
- }
- }
-
- if (STAILQ_FIRST(&tmp_io_list) != NULL) {
- printf("%s: error: tmp_io_list != NULL\n", __func__);
- for (io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list);
- io != NULL;
- io = (union ctl_io *)STAILQ_FIRST(&tmp_io_list)) {
- STAILQ_REMOVE(&tmp_io_list, &io->io_hdr,
- ctl_io_hdr, links);
- ctl_free_io(io);
- }
- }
- mtx_unlock(&softc->lock);
-
- break;
- }
- case CFI_TASK_BBRREAD: {
- union ctl_io *io;
- struct cfi_lun *lun;
- struct cfi_lun_io *lun_io;
- cfi_bbrread_status status;
- int req_lun_num;
- uint32_t num_blocks;
-
- status = CFI_BBR_SUCCESS;
-
- req_lun_num = metatask->taskinfo.bbrread.lun_num;
-
- mtx_lock(&softc->lock);
- STAILQ_FOREACH(lun, &softc->lun_list, links) {
- if (lun->lun_id != req_lun_num)
- continue;
- if (lun->state != CFI_LUN_READY) {
- status = CFI_BBR_LUN_UNCONFIG;
- break;
- } else
- break;
- }
-
- if (lun == NULL)
- status = CFI_BBR_NO_LUN;
-
- if (status != CFI_BBR_SUCCESS) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = status;
- mtx_unlock(&softc->lock);
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- /*
- * Convert the number of bytes given into blocks and check
- * that the number of bytes is a multiple of the blocksize.
- * CTL will verify that the LBA is okay.
- */
- if (lun->blocksize_powerof2 != 0) {
- if ((metatask->taskinfo.bbrread.len &
- (lun->blocksize - 1)) != 0) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_BAD_LEN;
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- num_blocks = metatask->taskinfo.bbrread.len >>
- lun->blocksize_powerof2;
- } else {
- /*
- * XXX KDM this could result in floating point
- * division, which isn't supported in the kernel on
- * x86 at least.
- */
- if ((metatask->taskinfo.bbrread.len %
- lun->blocksize) != 0) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status =
- CFI_BBR_BAD_LEN;
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- /*
- * XXX KDM this could result in floating point
- * division in some cases.
- */
- num_blocks = metatask->taskinfo.bbrread.len /
- lun->blocksize;
-
- }
-
- io = ctl_alloc_io_nowait(softc->port.ctl_pool_ref);
- if (io == NULL) {
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_NO_MEM;
- mtx_unlock(&softc->lock);
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- /*
- * XXX KDM need to do a read capacity to get the blocksize
- * for this device.
- */
- ctl_scsi_read_write(io,
- /*data_ptr*/ NULL,
- /*data_len*/ metatask->taskinfo.bbrread.len,
- /*read_op*/ 1,
- /*byte2*/ 0,
- /*minimum_cdb_size*/ 0,
- /*lba*/ metatask->taskinfo.bbrread.lba,
- /*num_blocks*/ num_blocks,
- /*tag_type*/ CTL_TAG_SIMPLE,
- /*control*/ 0);
-
- cfi_init_io(io,
- /*lun*/ lun,
- /*metatask*/ metatask,
- /*policy*/ CFI_ERR_SOFT,
- /*retries*/ 3,
- /*orig_lun_io*/ NULL,
- /*done_function*/ cfi_lun_io_done);
-
- lun_io = (struct cfi_lun_io *)io->io_hdr.port_priv;
-
- STAILQ_INSERT_TAIL(&lun->io_list, lun_io, links);
-
- if (ctl_queue(io) != CTL_RETVAL_COMPLETE) {
- printf("%s: error returned from ctl_queue()!\n",
- __func__);
- STAILQ_REMOVE(&lun->io_list, lun_io, cfi_lun_io, links);
- ctl_free_io(io);
- metatask->status = CFI_MT_ERROR;
- metatask->taskinfo.bbrread.status = CFI_BBR_ERROR;
- mtx_unlock(&softc->lock);
- cfi_metatask_done(softc, metatask);
- break;
- }
-
- mtx_unlock(&softc->lock);
- break;
- }
- default:
- panic("invalid metatask type %d", metatask->tasktype);
- break; /* NOTREACHED */
- }
-}
-
-struct cfi_metatask *
-cfi_alloc_metatask(int can_wait)
-{
- struct cfi_metatask *metatask;
- struct cfi_softc *softc;
-
- softc = &fetd_internal_softc;
-
- metatask = uma_zalloc(cfi_metatask_zone,
- (can_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
- if (metatask == NULL)
- return (NULL);
-
- metatask->status = CFI_MT_NONE;
-
- return (metatask);
-}
-
-void
-cfi_free_metatask(struct cfi_metatask *metatask)
-{
-
- uma_zfree(cfi_metatask_zone, metatask);
-}
-
-/*
- * vim: ts=8
- */
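ctl_frontend_internal.c is removed wholesale along with its cfi_metatask machinery; the new ctl_frontend_ioctl.c below takes over the ioctl port. One detail worth noting from the removed error handling: cfi_checkcond_parse() packed an action code (masked by CFI_ERR_MASK) and an orthogonal "do not consume a retry" flag into one enum, so unit attentions could be retried without draining the retry budget. A standalone illustration of that flag-in-enum pattern, with the CFI_ prefixes dropped:

    #include <stdio.h>

    /*
     * Same layout as the removed cfi_error_action: an action code in the
     * low byte plus an orthogonal flag bit above it.
     */
    typedef enum {
    	ERR_RETRY	 = 0x000,
    	ERR_FAIL	 = 0x001,
    	ERR_LUN_RESET	 = 0x002,
    	ERR_MASK	 = 0x0ff,
    	ERR_NO_DECREMENT = 0x100	/* don't consume a retry */
    } error_action;

    int
    main(void)
    {
    	int retries = 3;
    	/* A unit attention: retry, but keep the retry budget intact. */
    	error_action act = ERR_RETRY | ERR_NO_DECREMENT;

    	switch (act & ERR_MASK) {
    	case ERR_FAIL:
    		printf("giving up\n");
    		break;
    	case ERR_RETRY:
    	default:
    		if ((act & ERR_NO_DECREMENT) == 0)
    			retries--;
    		printf("retrying, %d retries left\n", retries);
    		break;
    	}
    	return (0);
    }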
diff --git a/sys/cam/ctl/ctl_frontend_internal.h b/sys/cam/ctl/ctl_frontend_internal.h
deleted file mode 100644
index cb00dc6..0000000
--- a/sys/cam/ctl/ctl_frontend_internal.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*-
- * Copyright (c) 2004 Silicon Graphics International Corp.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. Redistributions in binary form must reproduce at minimum a disclaimer
- * substantially similar to the "NO WARRANTY" disclaimer below
- * ("Disclaimer") and any redistribution must be conditioned upon
- * including a substantially similar Disclaimer requirement for further
- * binary redistribution.
- *
- * NO WARRANTY
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGES.
- *
- * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_frontend_internal.h#1 $
- * $FreeBSD$
- */
-/*
- * CTL kernel internal frontend target driver. This allows kernel-level
- * clients to send commands into CTL.
- *
- * Author: Ken Merry <ken@FreeBSD.org>
- */
-
-#ifndef _CTL_FRONTEND_INTERNAL_H_
-#define _CTL_FRONTEND_INTERNAL_H_
-
-/*
- * These are general metatask error codes. If the error code is CFI_MT_ERROR,
- * check any metatask-specific status codes for more detail on the problem.
- */
-typedef enum {
- CFI_MT_NONE,
- CFI_MT_PORT_OFFLINE,
- CFI_MT_ERROR,
- CFI_MT_SUCCESS
-} cfi_mt_status;
-
-typedef enum {
- CFI_TASK_NONE,
- CFI_TASK_SHUTDOWN,
- CFI_TASK_STARTUP,
- CFI_TASK_BBRREAD
-} cfi_tasktype;
-
-struct cfi_task_startstop {
- int total_luns;
- int luns_complete;
- int luns_failed;
-};
-
-/*
- * Error code description:
- * CFI_BBR_SUCCESS - the read was successful
- * CFI_BBR_LUN_UNCONFIG - CFI probe for this lun hasn't completed
- * CFI_BBR_NO_LUN - this lun doesn't exist, as far as CFI knows
- * CFI_BBR_NO_MEM - memory allocation error
- * CFI_BBR_BAD_LEN - data length isn't a multiple of the blocksize
- * CFI_BBR_RESERV_CONFLICT - another initiator has this lun reserved, so
- * we can't issue I/O at all.
- * CFI_BBR_LUN_STOPPED - the lun is powered off.
- * CFI_BBR_LUN_OFFLINE_CTL - the lun is offline from a CTL standpoint
- * CFI_BBR_LUN_OFFLINE_RC - the lun is offline from a RAIDCore standpoint.
- * This is bad, because it basically means we've
- * had a double failure on the LUN.
- * CFI_BBR_SCSI_ERROR - generic SCSI error, see status byte and sense
- * data for more resolution if you want it.
- * CFI_BBR_ERROR - the catch-all error code.
- */
-typedef enum {
- CFI_BBR_SUCCESS,
- CFI_BBR_LUN_UNCONFIG,
- CFI_BBR_NO_LUN,
- CFI_BBR_NO_MEM,
- CFI_BBR_BAD_LEN,
- CFI_BBR_RESERV_CONFLICT,
- CFI_BBR_LUN_STOPPED,
- CFI_BBR_LUN_OFFLINE_CTL,
- CFI_BBR_LUN_OFFLINE_RC,
- CFI_BBR_SCSI_ERROR,
- CFI_BBR_ERROR,
-} cfi_bbrread_status;
-
-struct cfi_task_bbrread {
- int lun_num; /* lun number */
- uint64_t lba; /* logical block address */
- int len; /* length in bytes */
- cfi_bbrread_status status; /* BBR status */
- uint8_t scsi_status; /* SCSI status */
- struct scsi_sense_data sense_data; /* SCSI sense data */
-};
-
-union cfi_taskinfo {
- struct cfi_task_startstop startstop;
- struct cfi_task_bbrread bbrread;
-};
-
-struct cfi_metatask;
-
-typedef void (*cfi_cb_t)(void *arg, struct cfi_metatask *metatask);
-
-struct cfi_metatask {
- cfi_tasktype tasktype; /* passed to CFI */
- cfi_mt_status status; /* returned from CFI */
- union cfi_taskinfo taskinfo; /* returned from CFI */
- struct ctl_mem_element *element; /* used by CFI, don't touch*/
- cfi_cb_t callback; /* passed to CFI */
- void *callback_arg; /* passed to CFI */
- STAILQ_ENTRY(cfi_metatask) links; /* used by CFI, don't touch*/
-};
-
-#ifdef _KERNEL
-
-MALLOC_DECLARE(M_CTL_CFI);
-
-/*
- * This is the API for sending meta commands (commands that are sent to more
- * than one LUN) to the internal frontend:
- * - Allocate a metatask using cfi_alloc_metatask(). can_wait == 0 means
- * that you're calling from an interrupt context. can_wait == 1 means
- * that you're calling from a thread context and don't mind waiting to
- * allocate memory.
- * - Setup the task type, callback and callback argument.
- * - Call cfi_action().
- * - When the callback comes, note the status and any per-command status
- * (see the taskinfo union) and then free the metatask with
- * cfi_free_metatask().
- */
-struct cfi_metatask *cfi_alloc_metatask(int can_wait);
-void cfi_free_metatask(struct cfi_metatask *metatask);
-void cfi_action(struct cfi_metatask *metatask);
-
-#endif /* _KERNEL */
-
-#endif /* _CTL_FRONTEND_INTERNAL_H_ */
-
-/*
- * vim: ts=8
- */
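For reference, the header removed above documented a three-step metatask flow
(allocate, fill in the task type and callback, call cfi_action(), then free
the metatask from the callback).  A minimal sketch of how a kernel client used
that now-deleted API; the callback body and its printf are illustrative, not
from the source:

	static void
	startup_done_cb(void *arg, struct cfi_metatask *metatask)
	{
		/* Note overall and per-task status, then free the metatask. */
		if (metatask->status == CFI_MT_SUCCESS)
			printf("%d of %d LUNs started\n",
			    metatask->taskinfo.startstop.luns_complete,
			    metatask->taskinfo.startstop.total_luns);
		cfi_free_metatask(metatask);
	}

	static void
	startup_all_luns(void)
	{
		struct cfi_metatask *metatask;

		/* can_wait == 1: thread context, allocation may sleep. */
		metatask = cfi_alloc_metatask(/*can_wait*/ 1);
		metatask->tasktype = CFI_TASK_STARTUP;
		metatask->callback = startup_done_cb;
		metatask->callback_arg = NULL;
		cfi_action(metatask);	/* status arrives via startup_done_cb() */
	}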
diff --git a/sys/cam/ctl/ctl_frontend_ioctl.c b/sys/cam/ctl/ctl_frontend_ioctl.c
new file mode 100644
index 0000000..7d57314
--- /dev/null
+++ b/sys/cam/ctl/ctl_frontend_ioctl.c
@@ -0,0 +1,470 @@
+/*-
+ * Copyright (c) 2003-2009 Silicon Graphics International Corp.
+ * Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2015 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#include <cam/cam.h>
+#include <cam/scsi/scsi_all.h>
+#include <cam/scsi/scsi_da.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_frontend.h>
+#include <cam/ctl/ctl_util.h>
+#include <cam/ctl/ctl_backend.h>
+#include <cam/ctl/ctl_ioctl.h>
+#include <cam/ctl/ctl_ha.h>
+#include <cam/ctl/ctl_private.h>
+#include <cam/ctl/ctl_debug.h>
+#include <cam/ctl/ctl_error.h>
+
+struct cfi_softc {
+ uint32_t cur_tag_num;
+ struct ctl_port port;
+};
+
+static struct cfi_softc cfi_softc;
+
+static int cfi_init(void);
+static void cfi_shutdown(void);
+static void cfi_online(void *arg);
+static void cfi_offline(void *arg);
+static int cfi_lun_enable(void *arg, int lun_id);
+static int cfi_lun_disable(void *arg, int lun_id);
+static void cfi_datamove(union ctl_io *io);
+static void cfi_done(union ctl_io *io);
+
+static struct ctl_frontend cfi_frontend =
+{
+ .name = "ioctl",
+ .init = cfi_init,
+ .shutdown = cfi_shutdown,
+};
+CTL_FRONTEND_DECLARE(ctlioctl, cfi_frontend);
+
+static int
+cfi_init(void)
+{
+ struct cfi_softc *isoftc = &cfi_softc;
+ struct ctl_port *port;
+
+ memset(isoftc, 0, sizeof(*isoftc));
+
+ port = &isoftc->port;
+ port->frontend = &cfi_frontend;
+ port->port_type = CTL_PORT_IOCTL;
+ port->num_requested_ctl_io = 100;
+ port->port_name = "ioctl";
+ port->port_online = cfi_online;
+ port->port_offline = cfi_offline;
+	port->onoff_arg = isoftc;
+ port->lun_enable = cfi_lun_enable;
+ port->lun_disable = cfi_lun_disable;
+	port->targ_lun_arg = isoftc;
+ port->fe_datamove = cfi_datamove;
+ port->fe_done = cfi_done;
+ port->max_targets = 1;
+ port->max_target_id = 0;
+ port->max_initiators = 1;
+
+ if (ctl_port_register(port) != 0) {
+ printf("%s: ioctl port registration failed\n", __func__);
+ return (0);
+ }
+ ctl_port_online(port);
+ return (0);
+}
+
+static void
+cfi_shutdown(void)
+{
+ struct cfi_softc *isoftc = &cfi_softc;
+ struct ctl_port *port;
+
+ port = &isoftc->port;
+ ctl_port_offline(port);
+ if (ctl_port_deregister(&isoftc->port) != 0)
+ printf("%s: ctl_frontend_deregister() failed\n", __func__);
+}
+
+static void
+cfi_online(void *arg)
+{
+}
+
+static void
+cfi_offline(void *arg)
+{
+}
+
+static int
+cfi_lun_enable(void *arg, int lun_id)
+{
+
+ return (0);
+}
+
+static int
+cfi_lun_disable(void *arg, int lun_id)
+{
+
+ return (0);
+}
+
+/*
+ * Data movement routine for the CTL ioctl frontend port.
+ */
+static int
+ctl_ioctl_do_datamove(struct ctl_scsiio *ctsio)
+{
+ struct ctl_sg_entry *ext_sglist, *kern_sglist;
+ struct ctl_sg_entry ext_entry, kern_entry;
+ int ext_sglen, ext_sg_entries, kern_sg_entries;
+ int ext_sg_start, ext_offset;
+ int len_to_copy, len_copied;
+ int kern_watermark, ext_watermark;
+ int ext_sglist_malloced;
+ int i, j;
+
+ ext_sglist_malloced = 0;
+ ext_sg_start = 0;
+ ext_offset = 0;
+
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove\n"));
+
+ /*
+ * If this flag is set, fake the data transfer.
+ */
+ if (ctsio->io_hdr.flags & CTL_FLAG_NO_DATAMOVE) {
+ ctsio->ext_data_filled = ctsio->ext_data_len;
+ goto bailout;
+ }
+
+ /*
+ * To simplify things here, if we have a single buffer, stick it in
+ * a S/G entry and just make it a single entry S/G list.
+ */
+ if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) {
+ int len_seen;
+
+ ext_sglen = ctsio->ext_sg_entries * sizeof(*ext_sglist);
+
+ ext_sglist = (struct ctl_sg_entry *)malloc(ext_sglen, M_CTL,
+ M_WAITOK);
+ ext_sglist_malloced = 1;
+ if (copyin(ctsio->ext_data_ptr, ext_sglist,
+ ext_sglen) != 0) {
+ ctl_set_internal_failure(ctsio,
+ /*sks_valid*/ 0,
+ /*retry_count*/ 0);
+ goto bailout;
+ }
+ ext_sg_entries = ctsio->ext_sg_entries;
+ len_seen = 0;
+ for (i = 0; i < ext_sg_entries; i++) {
+ if ((len_seen + ext_sglist[i].len) >=
+ ctsio->ext_data_filled) {
+ ext_sg_start = i;
+ ext_offset = ctsio->ext_data_filled - len_seen;
+ break;
+ }
+ len_seen += ext_sglist[i].len;
+ }
+ } else {
+ ext_sglist = &ext_entry;
+ ext_sglist->addr = ctsio->ext_data_ptr;
+ ext_sglist->len = ctsio->ext_data_len;
+ ext_sg_entries = 1;
+ ext_sg_start = 0;
+ ext_offset = ctsio->ext_data_filled;
+ }
+
+ if (ctsio->kern_sg_entries > 0) {
+ kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr;
+ kern_sg_entries = ctsio->kern_sg_entries;
+ } else {
+ kern_sglist = &kern_entry;
+ kern_sglist->addr = ctsio->kern_data_ptr;
+ kern_sglist->len = ctsio->kern_data_len;
+ kern_sg_entries = 1;
+ }
+
+ kern_watermark = 0;
+ ext_watermark = ext_offset;
+ len_copied = 0;
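+	/*
+	 * Walk the external (user) and kernel S/G lists in lockstep.  Each
+	 * pass copies the smaller of the two entries' remaining bytes; the
+	 * watermarks record how far into its current entry each side has
+	 * consumed, and an entry is advanced once its watermark reaches
+	 * the entry's length.
+	 */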
+ for (i = ext_sg_start, j = 0;
+ i < ext_sg_entries && j < kern_sg_entries;) {
+ uint8_t *ext_ptr, *kern_ptr;
+
+ len_to_copy = MIN(ext_sglist[i].len - ext_watermark,
+ kern_sglist[j].len - kern_watermark);
+
+ ext_ptr = (uint8_t *)ext_sglist[i].addr;
+ ext_ptr = ext_ptr + ext_watermark;
+ if (ctsio->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
+ /*
+ * XXX KDM fix this!
+ */
+ panic("need to implement bus address support");
+#if 0
+ kern_ptr = bus_to_virt(kern_sglist[j].addr);
+#endif
+ } else
+ kern_ptr = (uint8_t *)kern_sglist[j].addr;
+ kern_ptr = kern_ptr + kern_watermark;
+
+ kern_watermark += len_to_copy;
+ ext_watermark += len_to_copy;
+
+ if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
+ CTL_FLAG_DATA_IN) {
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
+ "bytes to user\n", len_to_copy));
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
+ "to %p\n", kern_ptr, ext_ptr));
+ if (copyout(kern_ptr, ext_ptr, len_to_copy) != 0) {
+ ctl_set_internal_failure(ctsio,
+ /*sks_valid*/ 0,
+ /*retry_count*/ 0);
+ goto bailout;
+ }
+ } else {
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: copying %d "
+ "bytes from user\n", len_to_copy));
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: from %p "
+ "to %p\n", ext_ptr, kern_ptr));
+			if (copyin(ext_ptr, kern_ptr, len_to_copy) != 0) {
+				ctl_set_internal_failure(ctsio,
+				    /*sks_valid*/ 0,
+				    /*retry_count*/ 0);
+ goto bailout;
+ }
+ }
+
+ len_copied += len_to_copy;
+
+ if (ext_sglist[i].len == ext_watermark) {
+ i++;
+ ext_watermark = 0;
+ }
+
+ if (kern_sglist[j].len == kern_watermark) {
+ j++;
+ kern_watermark = 0;
+ }
+ }
+
+ ctsio->ext_data_filled += len_copied;
+
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_sg_entries: %d, "
+ "kern_sg_entries: %d\n", ext_sg_entries,
+ kern_sg_entries));
+ CTL_DEBUG_PRINT(("ctl_ioctl_do_datamove: ext_data_len = %d, "
+ "kern_data_len = %d\n", ctsio->ext_data_len,
+ ctsio->kern_data_len));
+
+ /* XXX KDM set residual?? */
+bailout:
+
+ if (ext_sglist_malloced != 0)
+ free(ext_sglist, M_CTL);
+
+ return (CTL_RETVAL_COMPLETE);
+}
+
+static void
+cfi_datamove(union ctl_io *io)
+{
+ struct ctl_fe_ioctl_params *params;
+
+ params = (struct ctl_fe_ioctl_params *)
+ io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
+
+ mtx_lock(&params->ioctl_mtx);
+ params->state = CTL_IOCTL_DATAMOVE;
+ cv_broadcast(&params->sem);
+ mtx_unlock(&params->ioctl_mtx);
+}
+
+static void
+cfi_done(union ctl_io *io)
+{
+ struct ctl_fe_ioctl_params *params;
+
+ params = (struct ctl_fe_ioctl_params *)
+ io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr;
+
+ mtx_lock(&params->ioctl_mtx);
+ params->state = CTL_IOCTL_DONE;
+ cv_broadcast(&params->sem);
+ mtx_unlock(&params->ioctl_mtx);
+}
+
+static int
+cfi_submit_wait(union ctl_io *io)
+{
+ struct ctl_fe_ioctl_params params;
+ ctl_fe_ioctl_state last_state;
+ int done, retval;
+
+ retval = 0;
+
+ bzero(&params, sizeof(params));
+
+ mtx_init(&params.ioctl_mtx, "ctliocmtx", NULL, MTX_DEF);
+ cv_init(&params.sem, "ctlioccv");
+ params.state = CTL_IOCTL_INPROG;
+ last_state = params.state;
+
+ io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = &params;
+
+ CTL_DEBUG_PRINT(("cfi_submit_wait\n"));
+
+ /* This shouldn't happen */
+ if ((retval = ctl_queue(io)) != CTL_RETVAL_COMPLETE)
+ return (retval);
+
+ done = 0;
+
+ do {
+ mtx_lock(&params.ioctl_mtx);
+ /*
+ * Check the state here, and don't sleep if the state has
+	 * already changed (i.e. wakeup has already occurred, but we
+ * weren't waiting yet).
+ */
+ if (params.state == last_state) {
+ /* XXX KDM cv_wait_sig instead? */
+ cv_wait(&params.sem, &params.ioctl_mtx);
+ }
+ last_state = params.state;
+
+ switch (params.state) {
+ case CTL_IOCTL_INPROG:
+ /* Why did we wake up? */
+ /* XXX KDM error here? */
+ mtx_unlock(&params.ioctl_mtx);
+ break;
+ case CTL_IOCTL_DATAMOVE:
+ CTL_DEBUG_PRINT(("got CTL_IOCTL_DATAMOVE\n"));
+
+ /*
+ * change last_state back to INPROG to avoid
+ * deadlock on subsequent data moves.
+ */
+ params.state = last_state = CTL_IOCTL_INPROG;
+
+ mtx_unlock(&params.ioctl_mtx);
+ ctl_ioctl_do_datamove(&io->scsiio);
+ /*
+ * Note that in some cases, most notably writes,
+ * this will queue the I/O and call us back later.
+ * In other cases, generally reads, this routine
+ * will immediately call back and wake us up,
+ * probably using our own context.
+ */
+ io->scsiio.be_move_done(io);
+ break;
+ case CTL_IOCTL_DONE:
+ mtx_unlock(&params.ioctl_mtx);
+ CTL_DEBUG_PRINT(("got CTL_IOCTL_DONE\n"));
+ done = 1;
+ break;
+ default:
+ mtx_unlock(&params.ioctl_mtx);
+ /* XXX KDM error here? */
+ break;
+ }
+ } while (done == 0);
+
+ mtx_destroy(&params.ioctl_mtx);
+ cv_destroy(&params.sem);
+
+ return (CTL_RETVAL_COMPLETE);
+}
+
+int
+ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
+ struct thread *td)
+{
+ union ctl_io *io;
+ void *pool_tmp;
+ int retval = 0;
+
+ /*
+ * If we haven't been "enabled", don't allow any SCSI I/O
+ * to this FETD.
+ */
+ if ((cfi_softc.port.status & CTL_PORT_STATUS_ONLINE) == 0)
+ return (EPERM);
+
+ io = ctl_alloc_io(cfi_softc.port.ctl_pool_ref);
+
+ /*
+	 * Save the pool reference so it doesn't get overwritten by
+	 * the copy-in of the user's ctl_io.
+ */
+ pool_tmp = io->io_hdr.pool;
+ memcpy(io, (void *)addr, sizeof(*io));
+ io->io_hdr.pool = pool_tmp;
+
+ /*
+ * No status yet, so make sure the status is set properly.
+ */
+ io->io_hdr.status = CTL_STATUS_NONE;
+
+ /*
+ * The user sets the initiator ID, target and LUN IDs.
+ */
+ io->io_hdr.nexus.targ_port = cfi_softc.port.targ_port;
+ io->io_hdr.flags |= CTL_FLAG_USER_REQ;
+ if ((io->io_hdr.io_type == CTL_IO_SCSI) &&
+ (io->scsiio.tag_type != CTL_TAG_UNTAGGED))
+ io->scsiio.tag_num = cfi_softc.cur_tag_num++;
+
+ retval = cfi_submit_wait(io);
+ if (retval == 0)
+ memcpy((void *)addr, io, sizeof(*io));
+ ctl_free_io(io);
+ return (retval);
+}
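cfi_submit_wait() above is one half of a classic condition-variable state
machine: cfi_datamove() and cfi_done() flip params->state under the mutex and
cv_broadcast(), while the submitter re-checks the state before sleeping so a
wakeup that fires before it waits is not lost.  The same handshake in portable
user-space C, as a sketch (all names hypothetical):

	#include <pthread.h>
	#include <stdio.h>

	enum io_state { INPROG, DATAMOVE, DONE };

	struct params {
		pthread_mutex_t	mtx;
		pthread_cond_t	cv;
		enum io_state	state;
	};

	/* Plays the role of cfi_datamove() followed by cfi_done(). */
	static void *
	backend(void *arg)
	{
		struct params *p = arg;

		pthread_mutex_lock(&p->mtx);
		p->state = DATAMOVE;		/* request a data move */
		pthread_cond_broadcast(&p->cv);
		while (p->state == DATAMOVE)	/* wait until it is consumed */
			pthread_cond_wait(&p->cv, &p->mtx);
		p->state = DONE;		/* complete the I/O */
		pthread_cond_broadcast(&p->cv);
		pthread_mutex_unlock(&p->mtx);
		return (NULL);
	}

	int
	main(void)
	{
		struct params p = { PTHREAD_MUTEX_INITIALIZER,
		    PTHREAD_COND_INITIALIZER, INPROG };
		enum io_state last = INPROG;
		pthread_t t;
		int done = 0;

		pthread_create(&t, NULL, backend, &p);
		do {
			pthread_mutex_lock(&p.mtx);
			/* Don't sleep if the state already changed. */
			if (p.state == last)
				pthread_cond_wait(&p.cv, &p.mtx);
			last = p.state;
			switch (p.state) {
			case DATAMOVE:
				printf("data move\n");
				/* Re-arm, as cfi_submit_wait() does. */
				p.state = last = INPROG;
				pthread_cond_broadcast(&p.cv);
				break;
			case DONE:
				done = 1;
				break;
			default:
				break;
			}
			pthread_mutex_unlock(&p.mtx);
		} while (done == 0);
		pthread_join(t, NULL);
		return (0);
	}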
diff --git a/sys/cam/ctl/ctl_frontend_iscsi.c b/sys/cam/ctl/ctl_frontend_iscsi.c
index 652c961..7f8f8a8 100644
--- a/sys/cam/ctl/ctl_frontend_iscsi.c
+++ b/sys/cam/ctl/ctl_frontend_iscsi.c
@@ -61,7 +61,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cam/ctl/ctl_ioctl.h b/sys/cam/ctl/ctl_ioctl.h
index c7a3c29..f62bbe1 100644
--- a/sys/cam/ctl/ctl_ioctl.h
+++ b/sys/cam/ctl/ctl_ioctl.h
@@ -92,23 +92,6 @@ struct ctl_ooa_info {
ctl_ooa_status status; /* Returned from CTL */
};
-struct ctl_hard_startstop_info {
- cfi_mt_status status;
- int total_luns;
- int luns_complete;
- int luns_failed;
-};
-
-struct ctl_bbrread_info {
- int lun_num; /* Passed in to CTL */
- uint64_t lba; /* Passed in to CTL */
- int len; /* Passed in to CTL */
- cfi_mt_status status; /* Returned from CTL */
- cfi_bbrread_status bbr_status; /* Returned from CTL */
- uint8_t scsi_status; /* Returned from CTL */
- struct scsi_sense_data sense_data; /* Returned from CTL */
-};
-
typedef enum {
CTL_DELAY_TYPE_NONE,
CTL_DELAY_TYPE_CONT,
@@ -828,10 +811,6 @@ struct ctl_lun_map {
#define CTL_DISABLE_PORT _IOW(CTL_MINOR, 0x05, struct ctl_port_entry)
#define CTL_DUMP_OOA _IO(CTL_MINOR, 0x06)
#define CTL_CHECK_OOA _IOWR(CTL_MINOR, 0x07, struct ctl_ooa_info)
-#define CTL_HARD_STOP _IOR(CTL_MINOR, 0x08, \
- struct ctl_hard_startstop_info)
-#define CTL_HARD_START _IOR(CTL_MINOR, 0x09, \
- struct ctl_hard_startstop_info)
#define CTL_DELAY_IO _IOWR(CTL_MINOR, 0x10, struct ctl_io_delay_info)
#define CTL_REALSYNC_GET _IOR(CTL_MINOR, 0x11, int)
#define CTL_REALSYNC_SET _IOW(CTL_MINOR, 0x12, int)
@@ -839,7 +818,6 @@ struct ctl_lun_map {
#define CTL_GETSYNC _IOWR(CTL_MINOR, 0x14, struct ctl_sync_info)
#define CTL_GETSTATS _IOWR(CTL_MINOR, 0x15, struct ctl_stats)
#define CTL_ERROR_INJECT _IOWR(CTL_MINOR, 0x16, struct ctl_error_desc)
-#define CTL_BBRREAD _IOWR(CTL_MINOR, 0x17, struct ctl_bbrread_info)
#define CTL_GET_OOA _IOWR(CTL_MINOR, 0x18, struct ctl_ooa)
#define CTL_DUMP_STRUCTS _IO(CTL_MINOR, 0x19)
#define CTL_GET_PORT_LIST _IOWR(CTL_MINOR, 0x20, struct ctl_port_list)
diff --git a/sys/cam/ctl/ctl_private.h b/sys/cam/ctl/ctl_private.h
index a038552..6f7379a 100644
--- a/sys/cam/ctl/ctl_private.h
+++ b/sys/cam/ctl/ctl_private.h
@@ -47,18 +47,6 @@
#define CTL_PROCESSOR_PRODUCT "CTLPROCESSOR "
#define CTL_UNKNOWN_PRODUCT "CTLDEVICE "
-struct ctl_fe_ioctl_startstop_info {
- struct cv sem;
- struct ctl_hard_startstop_info hs_info;
-};
-
-struct ctl_fe_ioctl_bbrread_info {
- struct cv sem;
- struct ctl_bbrread_info *bbr_info;
- int wakeup_done;
- struct mtx *lock;
-};
-
typedef enum {
CTL_IOCTL_INPROG,
CTL_IOCTL_DATAMOVE,
@@ -81,18 +69,6 @@ struct ctl_io_pool {
};
typedef enum {
- CTL_IOCTL_FLAG_NONE = 0x00,
- CTL_IOCTL_FLAG_ENABLED = 0x01
-} ctl_ioctl_flags;
-
-struct ctl_ioctl_info {
- ctl_ioctl_flags flags;
- uint32_t cur_tag_num;
- struct ctl_port port;
- char port_name[24];
-};
-
-typedef enum {
CTL_SER_BLOCK,
CTL_SER_BLOCKOPT,
CTL_SER_EXTENT,
@@ -472,7 +448,6 @@ struct ctl_softc {
int inquiry_pq_no_lun;
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
- struct ctl_ioctl_info ioctl_info;
void *othersc_pool;
struct proc *ctl_proc;
int targ_online;
diff --git a/sys/cam/ctl/ctl_tpc.c b/sys/cam/ctl/ctl_tpc.c
index 662ee3d..b1b674f 100644
--- a/sys/cam/ctl/ctl_tpc.c
+++ b/sys/cam/ctl/ctl_tpc.c
@@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cam/ctl/ctl_tpc_local.c b/sys/cam/ctl/ctl_tpc_local.c
index d0319ee..fb1f2ac 100644
--- a/sys/cam/ctl/ctl_tpc_local.c
+++ b/sys/cam/ctl/ctl_tpc_local.c
@@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$");
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_frontend.h>
-#include <cam/ctl/ctl_frontend_internal.h>
#include <cam/ctl/ctl_util.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_ioctl.h>
diff --git a/sys/cddl/compat/opensolaris/sys/nvpair.h b/sys/cddl/compat/opensolaris/sys/nvpair.h
index c90ab70..33b62cb 100644
--- a/sys/cddl/compat/opensolaris/sys/nvpair.h
+++ b/sys/cddl/compat/opensolaris/sys/nvpair.h
@@ -42,29 +42,19 @@
*/
#define nvlist_add_binary illumos_nvlist_add_binary
#define nvlist_add_bool illumos_nvlist_add_bool
+#define nvlist_add_bool_array illumos_nvlist_add_bool_array
#define nvlist_add_descriptor illumos_nvlist_add_descriptor
+#define nvlist_add_descriptor_array illumos_nvlist_add_descriptor_array
#define nvlist_add_null illumos_nvlist_add_null
#define nvlist_add_number illumos_nvlist_add_number
+#define nvlist_add_number_array illumos_nvlist_add_number_array
#define nvlist_add_nvlist illumos_nvlist_add_nvlist
+#define nvlist_add_nvlist_array illumos_nvlist_add_nvlist_array
#define nvlist_add_nvpair illumos_nvlist_add_nvpair
#define nvlist_add_string illumos_nvlist_add_string
+#define nvlist_add_string_array illumos_nvlist_add_string_array
#define nvlist_add_stringf illumos_nvlist_add_stringf
#define nvlist_add_stringv illumos_nvlist_add_stringv
-#define nvlist_addf_binary illumos_nvlist_addf_binary
-#define nvlist_addf_bool illumos_nvlist_addf_bool
-#define nvlist_addf_descriptor illumos_nvlist_addf_descriptor
-#define nvlist_addf_null illumos_nvlist_addf_null
-#define nvlist_addf_number illumos_nvlist_addf_number
-#define nvlist_addf_nvlist illumos_nvlist_addf_nvlist
-#define nvlist_addf_string illumos_nvlist_addf_string
-#define nvlist_addv_binary illumos_nvlist_addv_binary
-#define nvlist_addv_bool illumos_nvlist_addv_bool
-#define nvlist_addv_descriptor illumos_nvlist_addv_descriptor
-#define nvlist_addv_null illumos_nvlist_addv_null
-#define nvlist_addv_number illumos_nvlist_addv_number
-#define nvlist_addv_nvlist illumos_nvlist_addv_nvlist
-#define nvlist_addv_string illumos_nvlist_addv_string
-#define nvlist_check_header illumos_nvlist_check_header
#define nvlist_clone illumos_nvlist_clone
#define nvlist_create illumos_nvlist_create
#define nvlist_descriptors illumos_nvlist_descriptors
@@ -75,92 +65,61 @@
#define nvlist_exists illumos_nvlist_exists
#define nvlist_exists_binary illumos_nvlist_exists_binary
#define nvlist_exists_bool illumos_nvlist_exists_bool
+#define nvlist_exists_bool_array illumos_nvlist_exists_bool_array
#define nvlist_exists_descriptor illumos_nvlist_exists_descriptor
+#define nvlist_exists_descriptor_array illumos_nvlist_exists_descriptor_array
#define nvlist_exists_null illumos_nvlist_exists_null
#define nvlist_exists_number illumos_nvlist_exists_number
+#define nvlist_exists_number_array illumos_nvlist_exists_number_array
#define nvlist_exists_nvlist illumos_nvlist_exists_nvlist
+#define nvlist_exists_nvlist_array illumos_nvlist_exists_nvlist_array
#define nvlist_exists_string illumos_nvlist_exists_string
+#define nvlist_exists_string_array illumos_nvlist_exists_string_array
#define nvlist_exists_type illumos_nvlist_exists_type
-#define nvlist_existsf illumos_nvlist_existsf
-#define nvlist_existsf_binary illumos_nvlist_existsf_binary
-#define nvlist_existsf_bool illumos_nvlist_existsf_bool
-#define nvlist_existsf_descriptor illumos_nvlist_existsf_descriptor
-#define nvlist_existsf_null illumos_nvlist_existsf_null
-#define nvlist_existsf_number illumos_nvlist_existsf_number
-#define nvlist_existsf_nvlist illumos_nvlist_existsf_nvlist
-#define nvlist_existsf_string illumos_nvlist_existsf_string
-#define nvlist_existsf_type illumos_nvlist_existsf_type
-#define nvlist_existsv illumos_nvlist_existsv
-#define nvlist_existsv_binary illumos_nvlist_existsv_binary
-#define nvlist_existsv_bool illumos_nvlist_existsv_bool
-#define nvlist_existsv_descriptor illumos_nvlist_existsv_descriptor
-#define nvlist_existsv_null illumos_nvlist_existsv_null
-#define nvlist_existsv_number illumos_nvlist_existsv_number
-#define nvlist_existsv_nvlist illumos_nvlist_existsv_nvlist
-#define nvlist_existsv_string illumos_nvlist_existsv_string
-#define nvlist_existsv_type illumos_nvlist_existsv_type
#define nvlist_fdump illumos_nvlist_fdump
#define nvlist_first_nvpair illumos_nvlist_first_nvpair
+#define nvlist_flags illumos_nvlist_flags
#define nvlist_free illumos_nvlist_free
#define nvlist_free_binary illumos_nvlist_free_binary
+#define nvlist_free_binary_array illumos_nvlist_free_binary_array
#define nvlist_free_bool illumos_nvlist_free_bool
+#define nvlist_free_bool_array illumos_nvlist_free_bool_array
#define nvlist_free_descriptor illumos_nvlist_free_descriptor
+#define nvlist_free_descriptor_array illumos_nvlist_free_descriptor_array
#define nvlist_free_null illumos_nvlist_free_null
#define nvlist_free_number illumos_nvlist_free_number
+#define nvlist_free_number_array illumos_nvlist_free_number_array
#define nvlist_free_nvlist illumos_nvlist_free_nvlist
+#define nvlist_free_nvlist_array illumos_nvlist_free_nvlist_array
#define nvlist_free_nvpair illumos_nvlist_free_nvpair
#define nvlist_free_string illumos_nvlist_free_string
+#define nvlist_free_string_array illumos_nvlist_free_string_array
#define nvlist_free_type illumos_nvlist_free_type
-#define nvlist_freef illumos_nvlist_freef
-#define nvlist_freef_binary illumos_nvlist_freef_binary
-#define nvlist_freef_bool illumos_nvlist_freef_bool
-#define nvlist_freef_descriptor illumos_nvlist_freef_descriptor
-#define nvlist_freef_null illumos_nvlist_freef_null
-#define nvlist_freef_number illumos_nvlist_freef_number
-#define nvlist_freef_nvlist illumos_nvlist_freef_nvlist
-#define nvlist_freef_string illumos_nvlist_freef_string
-#define nvlist_freef_type illumos_nvlist_freef_type
-#define nvlist_freev illumos_nvlist_freev
-#define nvlist_freev_binary illumos_nvlist_freev_binary
-#define nvlist_freev_bool illumos_nvlist_freev_bool
-#define nvlist_freev_descriptor illumos_nvlist_freev_descriptor
-#define nvlist_freev_null illumos_nvlist_freev_null
-#define nvlist_freev_number illumos_nvlist_freev_number
-#define nvlist_freev_nvlist illumos_nvlist_freev_nvlist
-#define nvlist_freev_string illumos_nvlist_freev_string
-#define nvlist_freev_type illumos_nvlist_freev_type
+#define nvlist_get_array_next illumos_nvlist_get_array_next
#define nvlist_get_binary illumos_nvlist_get_binary
#define nvlist_get_bool illumos_nvlist_get_bool
+#define nvlist_get_bool_array illumos_nvlist_get_bool_array
#define nvlist_get_descriptor illumos_nvlist_get_descriptor
+#define nvlist_get_descriptor_array illumos_nvlist_get_descriptor_array
#define nvlist_get_number illumos_nvlist_get_number
+#define nvlist_get_number_array illumos_nvlist_get_number_array
#define nvlist_get_nvlist illumos_nvlist_get_nvlist
#define nvlist_get_nvpair illumos_nvlist_get_nvpair
+#define nvlist_get_nvpair_parent illumos_nvlist_get_nvpair_parent
+#define nvlist_get_pararr illumos_nvlist_get_pararr
+#define nvlist_get_parent illumos_nvlist_get_parent
#define nvlist_get_string illumos_nvlist_get_string
-#define nvlist_getf_binary illumos_nvlist_getf_binary
-#define nvlist_getf_bool illumos_nvlist_getf_bool
-#define nvlist_getf_descriptor illumos_nvlist_getf_descriptor
-#define nvlist_getf_number illumos_nvlist_getf_number
-#define nvlist_getf_nvlist illumos_nvlist_getf_nvlist
-#define nvlist_getf_string illumos_nvlist_getf_string
-#define nvlist_getv_binary illumos_nvlist_getv_binary
-#define nvlist_getv_bool illumos_nvlist_getv_bool
-#define nvlist_getv_descriptor illumos_nvlist_getv_descriptor
-#define nvlist_getv_number illumos_nvlist_getv_number
-#define nvlist_getv_nvlist illumos_nvlist_getv_nvlist
-#define nvlist_getv_string illumos_nvlist_getv_string
+#define nvlist_in_array illumos_nvlist_in_array
#define nvlist_move_binary illumos_nvlist_move_binary
+#define nvlist_move_bool_array illumos_nvlist_move_bool_array
#define nvlist_move_descriptor illumos_nvlist_move_descriptor
+#define nvlist_move_descriptor_array illumos_nvlist_move_descriptor_array
+#define nvlist_move_number_array illumos_nvlist_move_number_array
#define nvlist_move_nvlist illumos_nvlist_move_nvlist
+#define nvlist_move_nvlist_array illumos_nvlist_move_nvlist_array
#define nvlist_move_nvpair illumos_nvlist_move_nvpair
#define nvlist_move_string illumos_nvlist_move_string
-#define nvlist_movef_binary illumos_nvlist_movef_binary
-#define nvlist_movef_descriptor illumos_nvlist_movef_descriptor
-#define nvlist_movef_nvlist illumos_nvlist_movef_nvlist
-#define nvlist_movef_string illumos_nvlist_movef_string
-#define nvlist_movev_binary illumos_nvlist_movev_binary
-#define nvlist_movev_descriptor illumos_nvlist_movev_descriptor
-#define nvlist_movev_nvlist illumos_nvlist_movev_nvlist
-#define nvlist_movev_string illumos_nvlist_movev_string
+#define nvlist_move_string_array illumos_nvlist_move_string_array
#define nvlist_ndescriptors illumos_nvlist_ndescriptors
#define nvlist_next illumos_nvlist_next
#define nvlist_next_nvpair illumos_nvlist_next_nvpair
@@ -168,93 +127,101 @@
#define nvlist_prev_nvpair illumos_nvlist_prev_nvpair
#define nvlist_recv illumos_nvlist_recv
#define nvlist_remove_nvpair illumos_nvlist_remove_nvpair
-#define nvlist_report_missing illumos_nvlist_report_missing
#define nvlist_send illumos_nvlist_send
+#define nvlist_set_array_next illumos_nvlist_set_array_next
#define nvlist_set_error illumos_nvlist_set_error
+#define nvlist_set_flags illumos_nvlist_set_flags
+#define nvlist_set_parent illumos_nvlist_set_parent
#define nvlist_size illumos_nvlist_size
#define nvlist_take_binary illumos_nvlist_take_binary
#define nvlist_take_bool illumos_nvlist_take_bool
+#define nvlist_take_bool_array illumos_nvlist_take_bool_array
#define nvlist_take_descriptor illumos_nvlist_take_descriptor
+#define nvlist_take_descriptor_array illumos_nvlist_take_descriptor_array
#define nvlist_take_number illumos_nvlist_take_number
+#define nvlist_take_number_array illumos_nvlist_take_number_array
#define nvlist_take_nvlist illumos_nvlist_take_nvlist
+#define nvlist_take_nvlist_array illumos_nvlist_take_nvlist_array
#define nvlist_take_nvpair illumos_nvlist_take_nvpair
#define nvlist_take_string illumos_nvlist_take_string
-#define nvlist_takef_binary illumos_nvlist_takef_binary
-#define nvlist_takef_bool illumos_nvlist_takef_bool
-#define nvlist_takef_descriptor illumos_nvlist_takef_descriptor
-#define nvlist_takef_number illumos_nvlist_takef_number
-#define nvlist_takef_nvlist illumos_nvlist_takef_nvlist
-#define nvlist_takef_string illumos_nvlist_takef_string
-#define nvlist_takev_binary illumos_nvlist_takev_binary
-#define nvlist_takev_bool illumos_nvlist_takev_bool
-#define nvlist_takev_descriptor illumos_nvlist_takev_descriptor
-#define nvlist_takev_number illumos_nvlist_takev_number
-#define nvlist_takev_nvlist illumos_nvlist_takev_nvlist
-#define nvlist_takev_string illumos_nvlist_takev_string
+#define nvlist_take_string_array illumos_nvlist_take_string_array
#define nvlist_unpack illumos_nvlist_unpack
+#define nvlist_unpack_header illumos_nvlist_unpack_header
#define nvlist_xfer illumos_nvlist_xfer
-#define nvlist_xpack illumos_nvlist_xpack
-#define nvlist_xunpack illumos_nvlist_xunpack
-#define nvpair_allocv illumos_nvpair_allocv
#define nvpair_assert illumos_nvpair_assert
#define nvpair_clone illumos_nvpair_clone
#define nvpair_create_binary illumos_nvpair_create_binary
#define nvpair_create_bool illumos_nvpair_create_bool
+#define nvpair_create_bool_array illumos_nvpair_create_bool_array
#define nvpair_create_descriptor illumos_nvpair_create_descriptor
+#define nvpair_create_descriptor_array illumos_nvpair_create_descriptor_array
#define nvpair_create_null illumos_nvpair_create_null
#define nvpair_create_number illumos_nvpair_create_number
+#define nvpair_create_number_array illumos_nvpair_create_number_array
#define nvpair_create_nvlist illumos_nvpair_create_nvlist
+#define nvpair_create_nvlist_array illumos_nvpair_create_nvlist_array
#define nvpair_create_string illumos_nvpair_create_string
+#define nvpair_create_string_array illumos_nvpair_create_string_array
#define nvpair_create_stringf illumos_nvpair_create_stringf
#define nvpair_create_stringv illumos_nvpair_create_stringv
-#define nvpair_createf_binary illumos_nvpair_createf_binary
-#define nvpair_createf_bool illumos_nvpair_createf_bool
-#define nvpair_createf_descriptor illumos_nvpair_createf_descriptor
-#define nvpair_createf_null illumos_nvpair_createf_null
-#define nvpair_createf_number illumos_nvpair_createf_number
-#define nvpair_createf_nvlist illumos_nvpair_createf_nvlist
-#define nvpair_createf_string illumos_nvpair_createf_string
-#define nvpair_createv_binary illumos_nvpair_createv_binary
-#define nvpair_createv_bool illumos_nvpair_createv_bool
-#define nvpair_createv_descriptor illumos_nvpair_createv_descriptor
-#define nvpair_createv_null illumos_nvpair_createv_null
-#define nvpair_createv_number illumos_nvpair_createv_number
-#define nvpair_createv_nvlist illumos_nvpair_createv_nvlist
-#define nvpair_createv_string illumos_nvpair_createv_string
#define nvpair_free illumos_nvpair_free
#define nvpair_free_structure illumos_nvpair_free_structure
#define nvpair_get_binary illumos_nvpair_get_binary
#define nvpair_get_bool illumos_nvpair_get_bool
+#define nvpair_get_bool_array illumos_nvpair_get_bool_array
#define nvpair_get_descriptor illumos_nvpair_get_descriptor
+#define nvpair_get_descriptor_array illumos_nvpair_get_descriptor_array
#define nvpair_get_number illumos_nvpair_get_number
+#define nvpair_get_number_array illumos_nvpair_get_number_array
#define nvpair_get_nvlist illumos_nvpair_get_nvlist
#define nvpair_get_string illumos_nvpair_get_string
#define nvpair_header_size illumos_nvpair_header_size
+#define nvpair_init_datasize illumos_nvpair_init_datasize
#define nvpair_insert illumos_nvpair_insert
#define nvpair_move_binary illumos_nvpair_move_binary
+#define nvpair_move_bool_array illumos_nvpair_move_bool_array
#define nvpair_move_descriptor illumos_nvpair_move_descriptor
+#define nvpair_move_descriptor_array illumos_nvpair_move_descriptor_array
+#define nvpair_move_number_array illumos_nvpair_move_number_array
#define nvpair_move_nvlist illumos_nvpair_move_nvlist
+#define nvpair_move_nvlist_array illumos_nvpair_move_nvlist_array
#define nvpair_move_string illumos_nvpair_move_string
-#define nvpair_movef_binary illumos_nvpair_movef_binary
-#define nvpair_movef_descriptor illumos_nvpair_movef_descriptor
-#define nvpair_movef_nvlist illumos_nvpair_movef_nvlist
-#define nvpair_movef_string illumos_nvpair_movef_string
-#define nvpair_movev_binary illumos_nvpair_movev_binary
-#define nvpair_movev_descriptor illumos_nvpair_movev_descriptor
-#define nvpair_movev_nvlist illumos_nvpair_movev_nvlist
-#define nvpair_movev_string illumos_nvpair_movev_string
+#define nvpair_move_string_array illumos_nvpair_move_string_array
#define nvpair_name illumos_nvpair_name
#define nvpair_next illumos_nvpair_next
#define nvpair_nvlist illumos_nvpair_nvlist
-#define nvpair_pack illumos_nvpair_pack
+#define nvpair_pack_binary illumos_nvpair_pack_binary
+#define nvpair_pack_bool illumos_nvpair_pack_bool
+#define nvpair_pack_bool_array illumos_nvpair_pack_bool_array
#define nvpair_pack_descriptor illumos_nvpair_pack_descriptor
+#define nvpair_pack_descriptor_array illumos_nvpair_pack_descriptor_array
+#define nvpair_pack_header illumos_nvpair_pack_header
+#define nvpair_pack_null illumos_nvpair_pack_null
+#define nvpair_pack_number illumos_nvpair_pack_number
+#define nvpair_pack_number_array illumos_nvpair_pack_number_array
+#define nvpair_pack_nvlist_array_next illumos_nvpair_pack_nvlist_array_next
+#define nvpair_pack_nvlist_up illumos_nvpair_pack_nvlist_up
+#define nvpair_pack_string illumos_nvpair_pack_string
+#define nvpair_pack_string_array illumos_nvpair_pack_string_array
#define nvpair_prev illumos_nvpair_prev
#define nvpair_remove illumos_nvpair_remove
#define nvpair_size illumos_nvpair_size
#define nvpair_type illumos_nvpair_type
#define nvpair_type_string illumos_nvpair_type_string
#define nvpair_unpack illumos_nvpair_unpack
+#define nvpair_unpack_binary illumos_nvpair_unpack_binary
+#define nvpair_unpack_bool illumos_nvpair_unpack_bool
+#define nvpair_unpack_bool_array illumos_nvpair_unpack_bool_array
#define nvpair_unpack_descriptor illumos_nvpair_unpack_descriptor
+#define nvpair_unpack_descriptor_array illumos_nvpair_unpack_descriptor_array
+#define nvpair_unpack_header illumos_nvpair_unpack_header
+#define nvpair_unpack_null illumos_nvpair_unpack_null
+#define nvpair_unpack_number illumos_nvpair_unpack_number
+#define nvpair_unpack_number_array illumos_nvpair_unpack_number_array
+#define nvpair_unpack_nvlist illumos_nvpair_unpack_nvlist
+#define nvpair_unpack_nvlist_array illumos_nvpair_unpack_nvlist_array
+#define nvpair_unpack_string illumos_nvpair_unpack_string
+#define nvpair_unpack_string_array illumos_nvpair_unpack_string_array
#endif /* _KERNEL */
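The churn above is a symbol-renaming shim: each libnv routine is #defined to
an illumos_-prefixed name so FreeBSD's native nvlist code can coexist in the
kernel with the Solaris-derived nvpair code, and this change merely syncs the
shim with the current libnv symbol set (the new array and pack/unpack variants
in, the dropped addf/addv-style variants out).  A standalone illustration of
the renaming trick, with hypothetical names:

	#include <stdio.h>

	#define	frob	illumos_frob	/* every use of frob() below, including
					   the definition, becomes illumos_frob */

	static int
	frob(int x)
	{
		return (x + 1);
	}

	int
	main(void)
	{
		printf("%d\n", frob(41));	/* links against illumos_frob() */
		return (0);
	}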
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
index 52a355d..d59fbf0 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
@@ -129,15 +129,15 @@ zfeature_depends_on(spa_feature_t fid, spa_feature_t check) {
static void
zfeature_register(spa_feature_t fid, const char *guid, const char *name,
- const char *desc, boolean_t readonly, boolean_t mos,
- boolean_t activate_on_enable, const spa_feature_t *deps)
+ const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
{
zfeature_info_t *feature = &spa_feature_table[fid];
static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
ASSERT(name != NULL);
ASSERT(desc != NULL);
- ASSERT(!readonly || !mos);
+ ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
+ (flags & ZFEATURE_FLAG_MOS) == 0);
ASSERT3U(fid, <, SPA_FEATURES);
ASSERT(zfeature_is_valid_guid(guid));
@@ -148,9 +148,7 @@ zfeature_register(spa_feature_t fid, const char *guid, const char *name,
feature->fi_guid = guid;
feature->fi_uname = name;
feature->fi_desc = desc;
- feature->fi_can_readonly = readonly;
- feature->fi_mos = mos;
- feature->fi_activate_on_enable = activate_on_enable;
+ feature->fi_flags = flags;
feature->fi_depends = deps;
}
@@ -159,45 +157,46 @@ zpool_feature_init(void)
{
zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy",
- "Destroy filesystems asynchronously.", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Destroy filesystems asynchronously.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
- "Snapshots use less space.", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Snapshots use less space.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
"org.illumos:lz4_compress", "lz4_compress",
- "LZ4 compression algorithm support.", B_FALSE, B_FALSE,
- B_TRUE, NULL);
+ "LZ4 compression algorithm support.",
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
"com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
- "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE,
- B_FALSE, NULL);
+ "Crash dumps to multiple vdev pools.",
+ 0, NULL);
zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
"com.delphix:spacemap_histogram", "spacemap_histogram",
- "Spacemaps maintain space histograms.", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Spacemaps maintain space histograms.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
zfeature_register(SPA_FEATURE_ENABLED_TXG,
"com.delphix:enabled_txg", "enabled_txg",
- "Record txg at which a feature is enabled", B_TRUE, B_FALSE,
- B_FALSE, NULL);
+ "Record txg at which a feature is enabled",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
SPA_FEATURE_NONE };
zfeature_register(SPA_FEATURE_HOLE_BIRTH,
"com.delphix:hole_birth", "hole_birth",
"Retain hole birth txg for more precise zfs send",
- B_FALSE, B_TRUE, B_TRUE, hole_birth_deps);
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ hole_birth_deps);
zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
"com.delphix:extensible_dataset", "extensible_dataset",
"Enhanced dataset functionality, used by other features.",
- B_FALSE, B_FALSE, B_FALSE, NULL);
+ 0, NULL);
static const spa_feature_t bookmarks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
@@ -206,7 +205,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_BOOKMARKS,
"com.delphix:bookmarks", "bookmarks",
"\"zfs bookmark\" command",
- B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
+ ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
static const spa_feature_t filesystem_limits_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
@@ -214,13 +213,14 @@ zpool_feature_init(void)
};
zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
"com.joyent:filesystem_limits", "filesystem_limits",
- "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE,
- filesystem_limits_deps);
+ "Filesystem and snapshot limits.",
+ ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
- B_FALSE, B_TRUE, B_TRUE, NULL);
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ NULL);
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
@@ -228,6 +228,6 @@ zpool_feature_init(void)
};
zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
"org.open-zfs:large_blocks", "large_blocks",
- "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
- large_blocks_deps);
+ "Support for blocks larger than 128KB.",
+ ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
index 4ffe435..0e88a9a 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -56,15 +56,23 @@ typedef enum spa_feature {
#define SPA_FEATURE_DISABLED (-1ULL)
+typedef enum zfeature_flags {
+ /* Can open pool readonly even if this feature is not supported. */
+ ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0),
+ /* Is this feature necessary to read the MOS? */
+ ZFEATURE_FLAG_MOS = (1 << 1),
+ /* Activate this feature at the same time it is enabled. */
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2),
+ /* Each dataset has a field set if it has ever used this feature. */
+ ZFEATURE_FLAG_PER_DATASET = (1 << 3)
+} zfeature_flags_t;
+
typedef struct zfeature_info {
spa_feature_t fi_feature;
const char *fi_uname; /* User-facing feature name */
const char *fi_guid; /* On-disk feature identifier */
const char *fi_desc; /* Feature description */
- boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */
- boolean_t fi_mos; /* Is the feature necessary to read the MOS? */
- /* Activate this feature at the same time it is enabled */
- boolean_t fi_activate_on_enable;
+ zfeature_flags_t fi_flags;
/* array of dependencies, terminated by SPA_FEATURE_NONE */
const spa_feature_t *fi_depends;
} zfeature_info_t;
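With the three booleans collapsed into fi_flags, feature properties now
combine and test as bits; hole_birth above, for example, registers with
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE.  A sketch of the
consumer side (the helper name is hypothetical):

	/* Formerly fi->fi_can_readonly; now a single bit test. */
	static boolean_t
	feature_readonly_compatible(const zfeature_info_t *fi)
	{
		return ((fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) != 0);
	}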
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 4c7e225..77c7b1d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -22,7 +22,9 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2013 by Delphix. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
#
#
@@ -36,6 +38,7 @@ ZFS_COMMON_OBJS += \
blkptr.o \
bpobj.o \
bptree.o \
+ bqueue.o \
dbuf.o \
ddt.o \
ddt_zap.o \
@@ -65,6 +68,7 @@ ZFS_COMMON_OBJS += \
lz4.o \
lzjb.o \
metaslab.o \
+ multilist.o \
range_tree.o \
refcount.o \
rrwlock.o \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 6de36f2..07fcb51 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -21,9 +21,9 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -82,9 +82,9 @@
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
@@ -129,6 +129,7 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
+#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -149,21 +150,39 @@ int arc_procfd;
#endif
#endif /* illumos */
-static kmutex_t arc_reclaim_thr_lock;
-static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
-static uint8_t arc_thread_exit;
+static kmutex_t arc_reclaim_lock;
+static kcondvar_t arc_reclaim_thread_cv;
+static boolean_t arc_reclaim_thread_exit;
+static kcondvar_t arc_reclaim_waiters_cv;
+
+static kmutex_t arc_user_evicts_lock;
+static kcondvar_t arc_user_evicts_cv;
+static boolean_t arc_user_evicts_thread_exit;
uint_t arc_reduce_dnlc_percent = 3;
/*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
*/
-int arc_evict_iterations = 100;
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int zfs_arc_overflow_shift = 8;
+
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
@@ -201,6 +220,9 @@ extern int zfs_prefetch_disable;
*/
static boolean_t arc_warm;
+/*
+ * These tunables are for performance analysis.
+ */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
@@ -312,31 +334,22 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
* second level ARC benefit from these fast lookups.
*/
-#define ARCS_LOCK_PAD CACHE_LINE_SIZE
-struct arcs_lock {
- kmutex_t arcs_lock;
-#ifdef _KERNEL
- unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-/*
- * must be power of two for mask use to work
- *
- */
-#define ARC_BUFC_NUMDATALISTS 16
-#define ARC_BUFC_NUMMETADATALISTS 16
-#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
-
typedef struct arc_state {
- uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
- uint64_t arcs_size; /* total amount of data in this state */
- list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
- struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
+ /*
+ * list of evictable buffers
+ */
+ multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ refcount_t arcs_size;
} arc_state_t;
-#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))
-
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
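arcs_list is now a multilist_t (the new multilist.o added to Makefile.files
above, included via sys/multilist.h): a list split into independently locked
sublists, with elements hashed onto a sublist so inserts and removals from
different CPUs rarely contend.  It replaces the old hand-rolled
arcs_lists/arcs_locks arrays and the get_buf_info() hashing removed further
down.  A toy sketch of the idea, not the real multilist API:

	#include <pthread.h>
	#include <stdint.h>

	#define	NUM_SUBLISTS	16	/* power of two, so a mask picks a sublist */

	struct node {
		struct node	*next;
		uint64_t	key;
	};

	struct sublist {
		pthread_mutex_t	lock;	/* one lock per sublist, not per list */
		struct node	*head;
	};

	struct toy_multilist {
		struct sublist	sub[NUM_SUBLISTS];
	};

	/* Hash the element to a sublist; unrelated inserts take different locks. */
	static struct sublist *
	ml_sublist(struct toy_multilist *ml, uint64_t key)
	{
		return (&ml->sub[key & (NUM_SUBLISTS - 1)]);
	}

	static void
	ml_insert(struct toy_multilist *ml, struct node *n)
	{
		struct sublist *s = ml_sublist(ml, n->key);

		pthread_mutex_lock(&s->lock);
		n->next = s->head;
		s->head = n;
		pthread_mutex_unlock(&s->lock);
	}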
@@ -362,8 +375,6 @@ typedef struct arc_stats {
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
- kstat_named_t arcstat_stolen;
- kstat_named_t arcstat_recycle_miss;
/*
* Number of buffers that could not be evicted because the hash lock
* was held by another thread. The lock may not necessarily be held
@@ -377,9 +388,15 @@ typedef struct arc_stats {
* not from the spa we're trying to evict from.
*/
kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach it's target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
@@ -530,7 +547,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_writes_lock_retry;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
kstat_named_t arcstat_l2_evict_l1cached;
@@ -584,13 +601,13 @@ static arc_stats_t arc_stats = {
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
{ "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
- { "stolen", KSTAT_DATA_UINT64 },
- { "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
@@ -629,7 +646,7 @@ static arc_stats_t arc_stats = {
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
- { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
@@ -806,7 +823,7 @@ typedef struct l1arc_buf_hdr {
/* protected by arc state mutex */
arc_state_t *b_state;
- list_node_t b_arc_node;
+ multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
@@ -877,7 +894,6 @@ sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
#endif
static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
#define GHOST_STATE(state) \
@@ -1011,21 +1027,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
&l2arc_norw, 0, "no reads during writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
- &ARC_anon.arcs_size, 0, "size of anonymous state");
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
&ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
&ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
- &ARC_mru.arcs_size, 0, "size of mru state");
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
&ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
&ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
&ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in mru ghost state");
@@ -1034,14 +1050,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
"size of data in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
- &ARC_mfu.arcs_size, 0, "size of mfu state");
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
&ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
&ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
&ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
"size of metadata in mfu ghost state");
@@ -1050,7 +1066,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
"size of data in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
- &ARC_l2c_only.arcs_size, 0, "size of mru state");
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
/*
* L2ARC Internals
@@ -1106,8 +1122,7 @@ static uint8_t l2arc_thread_exit;
static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static int arc_evict_needed(arc_buf_contents_t);
-static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
+static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
@@ -1288,6 +1303,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
refcount_create(&hdr->b_l1hdr.b_refcnt);
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
return (0);
@@ -1332,6 +1348,7 @@ hdr_full_dest(void *vbuf, void *unused)
cv_destroy(&hdr->b_l1hdr.b_cv);
refcount_destroy(&hdr->b_l1hdr.b_refcnt);
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
@@ -1368,7 +1385,7 @@ hdr_recl(void *unused)
* which is after we do arc_fini().
*/
if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
}
static void
@@ -1447,18 +1464,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
* l2c_only even though it's about to change.
*/
nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+ /* Verify previous threads set to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
} else {
ASSERT(hdr->b_l1hdr.b_buf == NULL);
ASSERT0(hdr->b_l1hdr.b_datacnt);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
/*
- * We might be removing the L1hdr of a buffer which was just
- * written out to L2ARC. If such a buffer is compressed then we
- * need to free its b_tmp_cdata before destroying the header.
+	 * If we've reached here, we must have been called from
+ * arc_evict_hdr(), as such we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
*/
- if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+ * state if it's not finished being written out to the
+ * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+	 * might be accessed even though it has been removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
}
/*
@@ -1681,23 +1711,6 @@ arc_buf_freeze(arc_buf_t *buf)
}
static void
-get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
-{
- uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
-
- if (arc_buf_type(hdr) == ARC_BUFC_METADATA)
- buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
- else {
- buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
- buf_hashid += ARC_BUFC_NUMMETADATALISTS;
- }
-
- *list = &state->arcs_lists[buf_hashid];
- *lock = ARCS_LOCK(state, buf_hashid);
-}
-
-
-static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1708,16 +1721,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
(state != arc_anon)) {
/* We don't use the L2-only state list. */
if (state != arc_l2c_only) {
+ arc_buf_contents_t type = arc_buf_type(hdr);
uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
- uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
- list_t *list;
- kmutex_t *lock;
-
- get_buf_info(hdr, state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_remove(list, hdr);
+ multilist_t *list = &state->arcs_list[type];
+ uint64_t *size = &state->arcs_lsize[type];
+
+ multilist_remove(list, hdr);
+
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_datacnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -1726,7 +1736,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
ASSERT(delta > 0);
ASSERT3U(*size, >=, delta);
atomic_add_64(size, -delta);
- mutex_exit(lock);
}
/* remove the prefetch flag if we get a reference */
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -1749,25 +1758,21 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
*/
if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
- list_t *list;
- kmutex_t *lock;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ multilist_t *list = &state->arcs_list[type];
+ uint64_t *size = &state->arcs_lsize[type];
+
+ multilist_insert(list, hdr);
- get_buf_info(hdr, state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_insert_head(list, hdr);
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
atomic_add_64(size, hdr->b_size *
hdr->b_l1hdr.b_datacnt);
- mutex_exit(lock);
}
return (cnt);
}
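The two hunks above are the heart of the locking change: the old per-state array of lists, each guarded through get_buf_info()/ARCS_LOCK(), becomes one multilist_t per state and buffer type, and multilist_insert()/multilist_remove() pick and lock the right sublist internally. A minimal sketch of that idea, using hypothetical mlist_* names rather than the real multilist API:

	typedef struct mlist_sublist {
		kmutex_t	mls_lock;	/* protects only this sublist */
		list_t		mls_list;
	} mlist_sublist_t;

	typedef struct mlist {
		unsigned	ml_num;		/* number of sublists */
		mlist_sublist_t	*ml_sublists;
		/* maps an object to a sublist, e.g. by hashing its identity */
		unsigned	(*ml_index)(struct mlist *, void *);
	} mlist_t;

	static void
	mlist_insert(mlist_t *ml, void *obj)
	{
		unsigned idx = ml->ml_index(ml, obj) % ml->ml_num;
		mlist_sublist_t *mls = &ml->ml_sublists[idx];

		/* Contention is confined to one sublist, not the whole state. */
		mutex_enter(&mls->mls_lock);
		list_insert_head(&mls->mls_list, obj);
		mutex_exit(&mls->mls_lock);
	}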
/*
- * Move the supplied buffer to the indicated state. The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
* for the buffer must be held by the caller.
*/
static void
@@ -1779,8 +1784,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
uint32_t datacnt;
uint64_t from_delta, to_delta;
arc_buf_contents_t buftype = arc_buf_type(hdr);
- list_t *list;
- kmutex_t *lock;
/*
* We almost always have an L1 hdr here, since we call arc_hdr_realloc()
@@ -1813,17 +1816,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
if (refcnt == 0) {
if (old_state != arc_anon && old_state != arc_l2c_only) {
- int use_mutex;
uint64_t *size = &old_state->arcs_lsize[buftype];
- get_buf_info(hdr, old_state, &list, &lock);
- use_mutex = !MUTEX_HELD(lock);
- if (use_mutex)
- mutex_enter(lock);
-
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_remove(list, hdr);
+ multilist_remove(&old_state->arcs_list[buftype], hdr);
/*
* If prefetching out of the ghost cache,
@@ -1836,12 +1832,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
ASSERT3U(*size, >=, from_delta);
atomic_add_64(size, -from_delta);
-
- if (use_mutex)
- mutex_exit(lock);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
- int use_mutex;
uint64_t *size = &new_state->arcs_lsize[buftype];
/*
@@ -1851,23 +1843,15 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* beforehand.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
- get_buf_info(hdr, new_state, &list, &lock);
- use_mutex = !MUTEX_HELD(lock);
- if (use_mutex)
- mutex_enter(lock);
-
- list_insert_head(list, hdr);
+ multilist_insert(&new_state->arcs_list[buftype], hdr);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
- ASSERT(datacnt == 0);
+ ASSERT0(datacnt);
ASSERT(hdr->b_l1hdr.b_buf == NULL);
to_delta = hdr->b_size;
}
atomic_add_64(size, to_delta);
-
- if (use_mutex)
- mutex_exit(lock);
}
}
@@ -1876,12 +1860,73 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf_hash_remove(hdr);
/* adjust state sizes (ignore arc_l2c_only) */
- if (to_delta && new_state != arc_l2c_only)
- atomic_add_64(&new_state->arcs_size, to_delta);
+
+ if (to_delta && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(datacnt);
+
+ /*
+			 * When moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * datacnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) refcount_add_many(&new_state->arcs_size,
+ hdr->b_size, hdr);
+ } else {
+ ASSERT3U(datacnt, !=, 0);
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ (void) refcount_add_many(&new_state->arcs_size,
+ hdr->b_size, buf);
+ }
+ }
+ }
+
if (from_delta && old_state != arc_l2c_only) {
- ASSERT3U(old_state->arcs_size, >=, from_delta);
- atomic_add_64(&old_state->arcs_size, -from_delta);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ /*
+ * When moving a header off of a ghost state,
+ * there's the possibility for datacnt to be
+ * non-zero. This is because we first add the
+ * arc buffer to the header prior to changing
+ * the header's state. Since we used the header
+ * for the reference when putting the header on
+ * the ghost state, we must balance that and use
+ * the header when removing off the ghost state
+ * (even though datacnt is non zero).
+ */
+
+ IMPLY(datacnt == 0, new_state == arc_anon ||
+ new_state == arc_l2c_only);
+
+ (void) refcount_remove_many(&old_state->arcs_size,
+ hdr->b_size, hdr);
+ } else {
+ ASSERT3P(datacnt, !=, 0);
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ (void) refcount_remove_many(
+ &old_state->arcs_size, hdr->b_size, buf);
+ }
+ }
}
+
if (HDR_HAS_L1HDR(hdr))
hdr->b_l1hdr.b_state = new_state;
@@ -1889,10 +1934,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
*/
-#ifdef illumos
- ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
- list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
-#endif
+ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
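To make the reference accounting above concrete (hypothetical sizes): a 128K header holding two arc_buf_t's that moves into arc_mru adds two 128K references to arc_mru->arcs_size, one tagged by each buffer. When the header is later evicted to arc_mru_ghost, the buffers are destroyed first, each dropping its own buffer-tagged reference, so the ghost branch adds a single 128K reference tagged by the header itself. Moving off the ghost state removes that header-tagged reference even if arc_read() has already attached a fresh buffer by then, which is exactly the imbalance the IMPLY() above is guarding.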
void
@@ -1985,6 +2028,7 @@ arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_datacnt = 1;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
arc_get_data_buf(buf);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -2120,7 +2164,7 @@ arc_buf_free_on_write(void *data, size_t size,
{
l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df = kmem_alloc(sizeof (*df), KM_SLEEP);
df->l2df_data = data;
df->l2df_size = size;
df->l2df_func = free_func;
@@ -2146,10 +2190,6 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
}
}
-/*
- * Free up buf->b_data and if 'remove' is set, then pull the
- * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
- */
static void
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
{
@@ -2164,19 +2204,53 @@ arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
if (!HDR_HAS_L1HDR(hdr))
return;
- if (hdr->b_l1hdr.b_tmp_cdata == NULL)
+ /*
+	 * If the header isn't being written to the l2arc device, then
+	 * it shouldn't have a b_tmp_cdata to free.
+ */
+ if (!HDR_L2_WRITING(hdr)) {
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+ return;
+ }
+
+ /*
+ * The header does not have compression enabled. This can be due
+ * to the buffer not being compressible, or because we're
+ * freeing the buffer before the second phase of
+	 * l2arc_write_buffers() has started (which does the compression
+ * step). In either case, b_tmp_cdata does not point to a
+ * separately compressed buffer, so there's nothing to free (it
+ * points to the same buffer as the arc_buf_t's b_data field).
+ */
+ if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
+ return;
+ }
+
+ /*
+	 * There's nothing to free since the buffer was all zeros and
+	 * compressed to a zero-length buffer.
+ */
+ if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
return;
+ }
- ASSERT(HDR_L2_WRITING(hdr));
- arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
- zio_data_buf_free);
+ ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
+
+ arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+ hdr->b_size, zio_data_buf_free);
ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
hdr->b_l1hdr.b_tmp_cdata = NULL;
}
+/*
+ * Free up buf->b_data and, if 'remove' is set, pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */
static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
{
arc_buf_t **bufp;
@@ -2191,17 +2265,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
arc_buf_unwatch(buf);
#endif
- if (!recycle) {
- if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf, zio_buf_free);
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf, zio_data_buf_free);
- arc_space_return(size, ARC_SPACE_DATA);
- }
+ if (type == ARC_BUFC_METADATA) {
+ arc_buf_data_free(buf, zio_buf_free);
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_buf_data_free(buf, zio_data_buf_free);
+ arc_space_return(size, ARC_SPACE_DATA);
}
- if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
uint64_t *cnt = &state->arcs_lsize[type];
ASSERT(refcount_is_zero(
@@ -2211,8 +2285,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
ASSERT3U(*cnt, >=, size);
atomic_add_64(cnt, -size);
}
- ASSERT3U(state->arcs_size, >=, size);
- atomic_add_64(&state->arcs_size, -size);
+
+ (void) refcount_remove_many(&state->arcs_size, size, buf);
buf->b_data = NULL;
/*
@@ -2339,6 +2413,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (!BUF_EMPTY(hdr))
buf_discard_identity(hdr);
+
if (hdr->b_freeze_cksum != NULL) {
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
@@ -2349,20 +2424,19 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
if (buf->b_efunc != NULL) {
- mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&arc_user_evicts_lock);
mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
- FALSE);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
hdr->b_l1hdr.b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&buf->b_evict_lock);
- mutex_exit(&arc_eviction_mtx);
+ cv_signal(&arc_user_evicts_cv);
+ mutex_exit(&arc_user_evicts_lock);
} else {
- arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
- TRUE);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
}
}
#ifdef ZFS_DEBUG
@@ -2375,7 +2449,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
kmem_cache_free(hdr_full_cache, hdr);
} else {
@@ -2401,7 +2475,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_l1hdr.b_datacnt > 1) {
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else {
ASSERT(buf == hdr->b_l1hdr.b_buf);
ASSERT(buf->b_efunc == NULL);
@@ -2415,16 +2489,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
* this buffer unless the write completes before we finish
* decrementing the reference count.
*/
- mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&arc_user_evicts_lock);
(void) remove_reference(hdr, NULL, tag);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
- mutex_exit(&arc_eviction_mtx);
+ mutex_exit(&arc_user_evicts_lock);
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
if (remove_reference(hdr, NULL, tag) > 0)
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
else
arc_hdr_destroy(hdr);
}
@@ -2453,7 +2527,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_l1hdr.b_datacnt > 1) {
if (no_callback)
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
ASSERT(buf->b_efunc == NULL);
@@ -2514,499 +2588,678 @@ arc_buf_eviction_needed(arc_buf_t *buf)
}
/*
- * Evict buffers from list until we've removed the specified number of
- * bytes. Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
*
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
*/
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
- arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
- arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
- int64_t bytes_remaining;
- arc_buf_hdr_t *hdr, *hdr_prev = NULL;
- list_t *evicted_list, *list, *evicted_list_start, *list_start;
- kmutex_t *lock, *evicted_lock;
- kmutex_t *hash_lock;
- boolean_t have_lock;
- void *stolen = NULL;
- arc_buf_hdr_t marker = { 0 };
- int count = 0;
- static int evict_metadata_offset, evict_data_offset;
- int i, idx, offset, list_count, lists;
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
- ASSERT(state == arc_mru || state == arc_mfu);
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
- /*
- * Decide which "type" (data vs metadata) to recycle from.
- *
- * If we are over the metadata limit, recycle from metadata.
- * If we are under the metadata minimum, recycle from data.
- * Otherwise, recycle from whichever type has the oldest (least
- * recently accessed) header. This is not yet implemented.
- */
- if (recycle) {
- arc_buf_contents_t realtype;
- if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
- realtype = ARC_BUFC_METADATA;
- } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
- realtype = ARC_BUFC_DATA;
- } else if (arc_meta_used >= arc_meta_limit) {
- realtype = ARC_BUFC_METADATA;
- } else if (arc_meta_used <= arc_meta_min) {
- realtype = ARC_BUFC_DATA;
-#ifdef illumos
- } else if (HDR_HAS_L1HDR(data_hdr) &&
- HDR_HAS_L1HDR(metadata_hdr) &&
- data_hdr->b_l1hdr.b_arc_access <
- metadata_hdr->b_l1hdr.b_arc_access) {
- realtype = ARC_BUFC_DATA;
- } else {
- realtype = ARC_BUFC_METADATA;
-#else
- } else {
- /* TODO */
- realtype = type;
-#endif
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_tmp_cdata field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
}
- if (realtype != type) {
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += hdr->b_size;
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
/*
- * If we want to evict from a different list,
- * we can not recycle, because DATA vs METADATA
- * buffers are segregated into different kmem
- * caches (and vmem arenas).
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
*/
- type = realtype;
- recycle = B_FALSE;
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
}
+ return (bytes_evicted);
}
- if (type == ARC_BUFC_METADATA) {
- offset = 0;
- list_count = ARC_BUFC_NUMMETADATALISTS;
- list_start = &state->arcs_lists[0];
- evicted_list_start = &evicted_state->arcs_lists[0];
- idx = evict_metadata_offset;
- } else {
- offset = ARC_BUFC_NUMMETADATALISTS;
- list_start = &state->arcs_lists[offset];
- evicted_list_start = &evicted_state->arcs_lists[offset];
- list_count = ARC_BUFC_NUMDATALISTS;
- idx = evict_data_offset;
- }
- bytes_remaining = evicted_state->arcs_lsize[type];
- lists = 0;
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-evict_start:
- list = &list_start[idx];
- evicted_list = &evicted_list_start[idx];
- lock = ARCS_LOCK(state, (offset + idx));
- evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ arc_min_prefetch_lifespan)) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
- /*
- * The ghost list lock must be acquired first in order to prevent
- * a 3 party deadlock:
- *
- * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
- * l2ad_mtx in arc_hdr_realloc
- * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
- * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
- * arc_*_ghost->arcs_mtx and forms a deadlock cycle.
- *
- * This situation is avoided by acquiring the ghost list lock first.
- */
- mutex_enter(evicted_lock);
- mutex_enter(lock);
-
- for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(list, hdr);
- if (HDR_HAS_L1HDR(hdr)) {
- bytes_remaining -=
- (hdr->b_size * hdr->b_l1hdr.b_datacnt);
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
}
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(hdr) ||
- (spa && hdr->b_spa != spa) ||
- ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- arc_min_prefetch_lifespan)) {
- skipped++;
- continue;
+ if (buf->b_data != NULL)
+ bytes_evicted += hdr->b_size;
+ if (buf->b_efunc != NULL) {
+ mutex_enter(&arc_user_evicts_lock);
+ arc_buf_destroy(buf, FALSE);
+ hdr->b_l1hdr.b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ cv_signal(&arc_user_evicts_cv);
+ mutex_exit(&arc_user_evicts_lock);
+ mutex_exit(&buf->b_evict_lock);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy(buf, TRUE);
}
- /* "lookahead" for better eviction candidate */
- if (recycle && hdr->b_size != bytes &&
- hdr_prev && hdr_prev->b_size == bytes)
- continue;
+ }
- /* ignore markers */
- if (hdr->b_spa == 0)
- continue;
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr))
+ ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+ else
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+ }
+
+ if (hdr->b_l1hdr.b_datacnt == 0) {
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ }
+
+ return (bytes_evicted);
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
/*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
- *
- * If we are looking for a buffer to recycle, we are in
- * the hot code path, so don't sleep.
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
*/
- if (!recycle && count++ > arc_evict_iterations) {
- list_insert_after(list, hdr, &marker);
- mutex_exit(lock);
- mutex_exit(evicted_lock);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(evicted_lock);
- mutex_enter(lock);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
+ */
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
continue;
}
hash_lock = HDR_LOCK(hdr);
- have_lock = MUTEX_HELD(hash_lock);
- if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
- ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
- while (hdr->b_l1hdr.b_buf) {
- arc_buf_t *buf = hdr->b_l1hdr.b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- missed += 1;
- break;
- }
- if (buf->b_data != NULL) {
- bytes_evicted += hdr->b_size;
- if (recycle &&
- arc_buf_type(hdr) == type &&
- hdr->b_size == bytes &&
- !HDR_L2_WRITING(hdr)) {
- stolen = buf->b_data;
- recycle = FALSE;
- }
- }
- if (buf->b_efunc != NULL) {
- mutex_enter(&arc_eviction_mtx);
- arc_buf_destroy(buf,
- buf->b_data == stolen, FALSE);
- hdr->b_l1hdr.b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&arc_eviction_mtx);
- mutex_exit(&buf->b_evict_lock);
- } else {
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy(buf,
- buf->b_data == stolen, TRUE);
- }
- }
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_cached,
- hdr->b_size);
- } else {
- if (l2arc_write_eligible(hdr->b_spa, hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_eligible,
- hdr->b_size);
- } else {
- ARCSTAT_INCR(
- arcstat_evict_l2_ineligible,
- hdr->b_size);
- }
- }
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
- if (hdr->b_l1hdr.b_datacnt == 0) {
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
- hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
- }
- if (!have_lock)
- mutex_exit(hash_lock);
- if (bytes >= 0 && bytes_evicted >= bytes)
- break;
- if (bytes_remaining > 0) {
- mutex_exit(evicted_lock);
- mutex_exit(lock);
- idx = ((idx + 1) & (list_count - 1));
- lists++;
- goto evict_start;
- }
- } else {
- missed += 1;
- }
- }
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
- mutex_exit(lock);
- mutex_exit(evicted_lock);
+ bytes_evicted += evicted;
- idx = ((idx + 1) & (list_count - 1));
- lists++;
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
- if (bytes_evicted < bytes) {
- if (lists < list_count)
- goto evict_start;
- else
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
+ /*
+ * If arc_size isn't overflowing, signal any
+ * threads that might happen to be waiting.
+ *
+ * For each header evicted, we wake up a single
+ * thread. If we used cv_broadcast, we could
+			 * wake up "too many" threads, causing arc_size
+			 * to significantly overflow arc_c, since
+			 * arc_get_data_buf() doesn't check for overflow
+			 * when it's woken up (deliberately so: it's
+			 * possible for the ARC to be overflowing while
+			 * full of un-evictable buffers, in which case
+			 * the function should proceed anyway).
+ *
+ * If threads are left sleeping, due to not
+ * using cv_broadcast, they will be woken up
+ * just before arc_reclaim_thread() sleeps.
+ */
+ mutex_enter(&arc_reclaim_lock);
+ if (!arc_is_overflowing())
+ cv_signal(&arc_reclaim_waiters_cv);
+ mutex_exit(&arc_reclaim_lock);
+ } else {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ }
}
- if (type == ARC_BUFC_METADATA)
- evict_metadata_offset = idx;
- else
- evict_data_offset = idx;
- if (skipped)
- ARCSTAT_INCR(arcstat_evict_skip, skipped);
-
- if (missed)
- ARCSTAT_INCR(arcstat_mutex_miss, missed);
-
- /*
- * Note: we have just evicted some data into the ghost state,
- * potentially putting the ghost size over the desired size. Rather
- * that evicting from the ghost list in this hot code path, leave
- * this chore to the arc_reclaim_thread().
- */
+ multilist_sublist_unlock(mls);
- if (stolen)
- ARCSTAT_BUMP(arcstat_stolen);
- return (stolen);
+ return (bytes_evicted);
}
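The marker handling above is a general pattern for walking a list without holding each element's lock: only the marker is ever unlinked and re-linked, so concurrent readers never observe a real header leaving its sublist. A condensed sketch, with hypothetical node_t/process() names standing in for arc_buf_hdr_t and arc_evict_hdr():

	static void
	marker_walk(list_t *list, node_t *marker)
	{
		node_t *node;

		/* The marker starts at the tail; walk toward the head. */
		while ((node = list_prev(list, marker)) != NULL) {
			/*
			 * Step the marker past 'node' before touching it,
			 * so our position survives even if processing
			 * drops locks or frees 'node' entirely.
			 */
			list_remove(list, marker);
			list_insert_before(list, node, marker);

			process(node);
		}
	}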
/*
- * Remove buffers from list until we've removed the specified number of
- * bytes. Destroy the buffers that are removed.
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state. This is used by arc_flush().
*/
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
{
- arc_buf_hdr_t *hdr, *hdr_prev;
- arc_buf_hdr_t marker = { 0 };
- list_t *list, *list_start;
- kmutex_t *hash_lock, *lock;
- uint64_t bytes_deleted = 0;
- uint64_t bufs_skipped = 0;
- int count = 0;
- static int evict_offset;
- int list_count, idx = evict_offset;
- int offset, lists = 0;
-
- ASSERT(GHOST_STATE(state));
+ uint64_t total_evicted = 0;
+ multilist_t *ml = &state->arcs_list[type];
+ int num_sublists;
+ arc_buf_hdr_t **markers;
+
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ num_sublists = multilist_get_num_sublists(ml);
/*
- * data lists come after metadata lists
+ * If we've tried to evict from each sublist, made some
+ * progress, but still have not hit the target number of bytes
+ * to evict, we want to keep trying. The markers allow us to
+ * pick up where we left off for each individual sublist, rather
+ * than starting from the tail each time.
*/
- list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
- list_count = ARC_BUFC_NUMDATALISTS;
- offset = ARC_BUFC_NUMMETADATALISTS;
-
-evict_start:
- list = &list_start[idx];
- lock = ARCS_LOCK(state, idx + offset);
-
- mutex_enter(lock);
- for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(list, hdr);
- if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
- panic("invalid hdr=%p", (void *)hdr);
- if (spa && hdr->b_spa != spa)
- continue;
+ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ for (int i = 0; i < num_sublists; i++) {
+ markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
- /* ignore markers */
- if (hdr->b_spa == 0)
- continue;
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_adjust_type() and
+ * arc_evict_state_impl().
+ */
+ markers[i]->b_spa = 0;
- hash_lock = HDR_LOCK(hdr);
- /* caller may be trying to modify this buffer, skip it */
- if (MUTEX_HELD(hash_lock))
- continue;
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_insert_tail(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+ }
+ /*
+ * While we haven't hit our target number of bytes to evict, or
+ * we're evicting all available buffers.
+ */
+ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
/*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
+		 * Start eviction using a randomly selected sublist;
+ * this is to try and evenly balance eviction across all
+ * sublists. Always starting at the same sublist
+ * (e.g. index 0) would cause evictions to favor certain
+ * sublists over others.
*/
- if (count++ > arc_evict_iterations) {
- list_insert_after(list, hdr, &marker);
- mutex_exit(lock);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(lock);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
- continue;
- }
- if (mutex_tryenter(hash_lock)) {
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!HDR_HAS_L1HDR(hdr) ||
- hdr->b_l1hdr.b_buf == NULL);
- ARCSTAT_BUMP(arcstat_deleted);
- bytes_deleted += hdr->b_size;
+ int sublist_idx = multilist_get_random_index(ml);
+ uint64_t scan_evicted = 0;
- if (HDR_HAS_L2HDR(hdr)) {
- /*
- * This buffer is cached on the 2nd Level ARC;
- * don't destroy the header.
- */
- arc_change_state(arc_l2c_only, hdr, hash_lock);
- /*
- * dropping from L1+L2 cached to L2-only,
- * realloc to remove the L1 header.
- */
- hdr = arc_hdr_realloc(hdr, hdr_full_cache,
- hdr_l2only_cache);
- mutex_exit(hash_lock);
- } else {
- arc_change_state(arc_anon, hdr, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(hdr);
- }
+ for (int i = 0; i < num_sublists; i++) {
+ uint64_t bytes_remaining;
+ uint64_t bytes_evicted;
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
- if (bytes >= 0 && bytes_deleted >= bytes)
+ if (bytes == ARC_EVICT_ALL)
+ bytes_remaining = ARC_EVICT_ALL;
+ else if (total_evicted < bytes)
+ bytes_remaining = bytes - total_evicted;
+ else
break;
- } else if (bytes < 0) {
+
+ bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+ markers[sublist_idx], spa, bytes_remaining);
+
+ scan_evicted += bytes_evicted;
+ total_evicted += bytes_evicted;
+
+ /* we've reached the end, wrap to the beginning */
+ if (++sublist_idx >= num_sublists)
+ sublist_idx = 0;
+ }
+
+ /*
+ * If we didn't evict anything during this scan, we have
+ * no reason to believe we'll evict more during another
+ * scan, so break the loop.
+ */
+ if (scan_evicted == 0) {
+ /* This isn't possible, let's make that obvious */
+ ASSERT3S(bytes, !=, 0);
+
/*
- * Insert a list marker and then wait for the
- * hash lock to become available. Once its
- * available, restart from where we left off.
+ * When bytes is ARC_EVICT_ALL, the only way to
+ * break the loop is when scan_evicted is zero.
+ * In that case, we actually have evicted enough,
+ * so we don't want to increment the kstat.
*/
- list_insert_after(list, hdr, &marker);
- mutex_exit(lock);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- mutex_enter(lock);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- } else {
- bufs_skipped += 1;
+ if (bytes != ARC_EVICT_ALL) {
+ ASSERT3S(total_evicted, <, bytes);
+ ARCSTAT_BUMP(arcstat_evict_not_enough);
+ }
+
+ break;
}
+ }
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_remove(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+
+ kmem_cache_free(hdr_full_cache, markers[i]);
}
- mutex_exit(lock);
- idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
- lists++;
+ kmem_free(markers, sizeof (*markers) * num_sublists);
- if (lists < list_count)
- goto evict_start;
+ return (total_evicted);
+}
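To illustrate the randomized scan order above (hypothetical count): with eight sublists and a starting index of 5, a single pass of the inner loop visits sublists 5, 6, 7, 0, 1, 2, 3, 4. Because each pass begins at a fresh random index, no sublist's tail is systematically evicted from more often than any other's.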
- evict_offset = idx;
- if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
- (bytes < 0 || bytes_deleted < bytes)) {
- list_start = &state->arcs_lists[0];
- list_count = ARC_BUFC_NUMMETADATALISTS;
- offset = lists = 0;
- goto evict_start;
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+ boolean_t retry)
+{
+ uint64_t evicted = 0;
+
+ while (state->arcs_lsize[type] != 0) {
+ evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+ if (!retry)
+ break;
}
- if (bufs_skipped) {
- ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
- ASSERT(bytes >= 0);
+ return (evicted);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ int64_t delta;
+
+ if (bytes > 0 && state->arcs_lsize[type] > 0) {
+ delta = MIN(state->arcs_lsize[type], bytes);
+ return (arc_evict_state(state, spa, delta, type));
}
- if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
- (longlong_t)bytes_deleted, state);
+ return (0);
}
-static void
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta(void)
+{
+ uint64_t total_evicted = 0;
+ int64_t target;
+
+ /*
+ * If we're over the meta limit, we want to evict enough
+ * metadata to get back under the meta limit. We don't want to
+ * evict so much that we drop the MRU below arc_p, though. If
+ * we're over the meta limit more than we're over arc_p, we
+ * evict some from the MRU here, and some from the MFU below.
+ */
+ target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) - arc_p));
+
+ total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * Similar to the above, we want to evict enough bytes to get us
+ * below the meta limit, but not so much as to drop us below the
+	 * space allotted to the MFU (which is defined as arc_c - arc_p).
+ */
+ target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
+
+ total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
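A worked example of the two MIN() targets above, with hypothetical numbers: if arc_meta_used is 600M against an arc_meta_limit of 500M, we are 100M over the limit, but if anon + mru exceeds arc_p by only 50M, the first target is MIN(100M, 50M) = 50M of MRU metadata. Assuming that much is evicted, arc_meta_used drops to 550M, and the second pass targets the remaining MIN(50M, ...) from the MFU, bounded by how far the MFU exceeds its arc_c - arc_p allotment.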
+
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+ multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
+
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
+
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
+ }
+
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
+ }
+
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
arc_adjust(void)
{
- int64_t adjustment, delta;
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
+
+ /*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_adjust_meta();
/*
* Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
*/
+ target = MIN((int64_t)(arc_size - arc_c),
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
- adjustment = MIN((int64_t)(arc_size - arc_c),
- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
- arc_p));
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+	 * evict from the type which contains older buffers, in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+ arc_meta_used > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE,
- ARC_BUFC_METADATA);
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
*/
+ target = arc_size - arc_c;
- adjustment = arc_size - arc_c;
+ if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+ arc_meta_used > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- int64_t delta = MIN(adjustment,
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE,
- ARC_BUFC_METADATA);
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
*/
+ target = refcount_count(&arc_mru->arcs_size) +
+ refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
- adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+ bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
- delta = MIN(arc_mru_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mru_ghost, 0, delta);
- }
+ target -= bytes;
- adjustment =
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+ total_evicted +=
+ arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
- if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
- delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mfu_ghost, 0, delta);
- }
+ /*
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
+ */
+ target = refcount_count(&arc_mru_ghost->arcs_size) +
+ refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+ bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
}
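The ghost-list targets reduce to straightforward arithmetic (hypothetical numbers): with arc_c = 1000M, an MRU of 400M plus an MRU ghost of 700M yields a target of 400 + 700 - 1000 = 100M to evict from arc_mru_ghost. If that leaves the MRU ghost at 600M alongside a 500M MFU ghost, the final target is 600 + 500 - 1000 = 100M from arc_mfu_ghost, restoring mru ghost + mfu ghost <= arc_c.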
static void
arc_do_user_evicts(void)
{
- static arc_buf_t *tmp_arc_eviction_list;
-
- /*
- * Move list over to avoid LOR
- */
-restart:
- mutex_enter(&arc_eviction_mtx);
- tmp_arc_eviction_list = arc_eviction_list;
- arc_eviction_list = NULL;
- mutex_exit(&arc_eviction_mtx);
-
- while (tmp_arc_eviction_list != NULL) {
- arc_buf_t *buf = tmp_arc_eviction_list;
- tmp_arc_eviction_list = buf->b_next;
+ mutex_enter(&arc_user_evicts_lock);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
mutex_enter(&buf->b_evict_lock);
buf->b_hdr = NULL;
mutex_exit(&buf->b_evict_lock);
+ mutex_exit(&arc_user_evicts_lock);
if (buf->b_efunc != NULL)
VERIFY0(buf->b_efunc(buf->b_private));
@@ -3014,58 +3267,45 @@ restart:
buf->b_efunc = NULL;
buf->b_private = NULL;
kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_user_evicts_lock);
}
-
- if (arc_eviction_list != NULL)
- goto restart;
+ mutex_exit(&arc_user_evicts_lock);
}
-/*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
void
-arc_flush(spa_t *spa)
+arc_flush(spa_t *spa, boolean_t retry)
{
uint64_t guid = 0;
+ /*
+ * If retry is TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
if (spa != NULL)
guid = spa_load_guid(spa);
- while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
- (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa != NULL)
- break;
- }
- while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
- (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa != NULL)
- break;
- }
- while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
- (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa != NULL)
- break;
- }
- while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
- (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa != NULL)
- break;
- }
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
- arc_evict_ghost(arc_mru_ghost, guid, -1);
- arc_evict_ghost(arc_mfu_ghost, guid, -1);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
- mutex_enter(&arc_reclaim_thr_lock);
arc_do_user_evicts();
- mutex_exit(&arc_reclaim_thr_lock);
ASSERT(spa || arc_eviction_list == NULL);
}
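In practice this means a caller that needs the guarantee of an empty cache (e.g. the shutdown path) invokes arc_flush(NULL, B_TRUE), while a per-pool flush must settle for a single best-effort pass with retry == B_FALSE, which is what the ASSERT above enforces.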
void
arc_shrink(int64_t to_free)
{
-
if (arc_c > arc_c_min) {
DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
arc_c_min, uint64_t, arc_p, uint64_t, to_free);
@@ -3090,7 +3330,7 @@ arc_shrink(int64_t to_free)
if (arc_size > arc_c) {
DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
uint64_t, arc_c);
- arc_adjust();
+ (void) arc_adjust();
}
}
@@ -3329,17 +3569,37 @@ arc_kmem_reap_now(void)
DTRACE_PROBE(arc__kmem_reap_end);
}
+/*
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ * waiting for the reclaim thread to signal it.
+ *
+ * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ * fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
+ */
static void
arc_reclaim_thread(void *dummy __unused)
{
clock_t growtime = 0;
callb_cpr_t cpr;
- CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
- mutex_enter(&arc_reclaim_thr_lock);
- while (arc_thread_exit == 0) {
+ mutex_enter(&arc_reclaim_lock);
+ while (!arc_reclaim_thread_exit) {
int64_t free_memory = arc_available_memory();
+ uint64_t evicted = 0;
+
+ mutex_exit(&arc_reclaim_lock);
+
if (free_memory < 0) {
arc_no_grow = B_TRUE;
@@ -3373,17 +3633,60 @@ arc_reclaim_thread(void *dummy __unused)
arc_no_grow = B_FALSE;
}
- arc_adjust();
+ evicted = arc_adjust();
- if (arc_eviction_list != NULL)
- arc_do_user_evicts();
+ mutex_enter(&arc_reclaim_lock);
+ /*
+ * If evicted is zero, we couldn't evict anything via
+ * arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop.
+ */
+ if (arc_size <= arc_c || evicted == 0) {
#ifdef _KERNEL
- if (needfree) {
needfree = 0;
- wakeup(&needfree);
- }
#endif
+ /*
+ * We're either no longer overflowing, or we
+			 * can't evict anything more, so we should wake
+			 * up any waiting threads before we go to sleep.
+ */
+ cv_broadcast(&arc_reclaim_waiters_cv);
+
+ /*
+ * Block until signaled, or after one second (we
+ * might need to perform arc_kmem_reap_now()
+			 * even if we aren't being signaled).
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thread_cv,
+ &arc_reclaim_lock, hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+ }
+ }
+
+ arc_reclaim_thread_exit = FALSE;
+ cv_broadcast(&arc_reclaim_thread_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
+ thread_exit();
+}
+
+static void
+arc_user_evicts_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_user_evicts_lock);
+ while (!arc_user_evicts_thread_exit) {
+ mutex_exit(&arc_user_evicts_lock);
+
+ arc_do_user_evicts();
/*
* This is necessary in order for the mdb ::arc dcmd to
@@ -3399,16 +3702,21 @@ arc_reclaim_thread(void *dummy __unused)
if (arc_ksp != NULL)
arc_ksp->ks_update(arc_ksp, KSTAT_READ);
- /* block until needed, or one second, whichever is shorter */
+ mutex_enter(&arc_user_evicts_lock);
+
+ /*
+ * Block until signaled, or after one second (we need to
+ * call the arc's kstat update function regularly).
+ */
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
- &arc_reclaim_thr_lock, hz);
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ (void) cv_timedwait(&arc_user_evicts_cv,
+ &arc_user_evicts_lock, hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
}
- arc_thread_exit = 0;
- cv_broadcast(&arc_reclaim_thr_cv);
- CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ arc_user_evicts_thread_exit = FALSE;
+ cv_broadcast(&arc_user_evicts_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */
thread_exit();
}
@@ -3422,6 +3730,8 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
if (state == arc_l2c_only)
return;
@@ -3436,16 +3746,14 @@ arc_adapt(int bytes, arc_state_t *state)
* target size of the MRU list.
*/
if (state == arc_mru_ghost) {
- mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
- 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
uint64_t delta;
- mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
- 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
@@ -3454,7 +3762,7 @@ arc_adapt(int bytes, arc_state_t *state)
ASSERT((int64_t)arc_p >= 0);
if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
return;
}
@@ -3482,43 +3790,25 @@ arc_adapt(int bytes, arc_state_t *state)
}
/*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
*/
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
{
- if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
- return (1);
+ /* Always allow at least one block of overflow */
+ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
- if (arc_reclaim_needed())
- return (1);
-
- return (arc_size > arc_c);
+ return (arc_size >= arc_c + overflow);
}
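A quick sanity check of this threshold, assuming the default zfs_arc_overflow_shift of 8: with arc_c at 4G, arc_c >> 8 is 16M, which is at least the SPA_MAXBLOCKSIZE floor, so arc_is_overflowing() returns B_TRUE once arc_size reaches 4G + 16M. For a very small arc_c, the MAX() instead clamps the slack to a single maximum-sized block.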
/*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead. Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU. In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted. In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+ * The buffer, supplied as the first argument, needs a data block. If we
+ * are hitting the hard limit for the cache size, we must sleep, waiting
+ * for the eviction thread to catch up. If we're past the target size
+ * but below the hard limit, we'll only signal the reclaim thread and
+ * continue on.
*/
static void
arc_get_data_buf(arc_buf_t *buf)
@@ -3530,62 +3820,70 @@ arc_get_data_buf(arc_buf_t *buf)
arc_adapt(size, state);
/*
- * We have not yet reached cache maximum size,
- * just allocate a new buffer.
+ * If arc_size is currently overflowing, and has grown past our
+ * upper limit, we must be adding data faster than the evict
+ * thread can evict. Thus, to ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+	 * further past its target size, we halt and wait for the
+ * eviction thread to catch up.
+ *
+ * It's also possible that the reclaim thread is unable to evict
+ * enough buffers to get arc_size below the overflow limit (e.g.
+ * due to buffers being un-evictable, or hash lock collisions).
+	 * In this case, we want to proceed regardless of whether we're
+	 * overflowing; thus we don't use a while loop here.
*/
- if (!arc_evict_needed(type)) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
+ if (arc_is_overflowing()) {
+ mutex_enter(&arc_reclaim_lock);
+
+ /*
+ * Now that we've acquired the lock, we may no longer be
+		 * over the overflow limit, let's check.
+ *
+		 * We're ignoring the case of spurious wakeups. If that
+		 * were to happen, it'd let this thread consume an ARC
+		 * buffer before it should have (i.e. before we're under
+		 * the overflow limit and were signaled by the reclaim
+ * thread). As long as that is a rare occurrence, it
+ * shouldn't cause any harm.
+ */
+ if (arc_is_overflowing()) {
+ cv_signal(&arc_reclaim_thread_cv);
+ cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
}
- goto out;
+
+ mutex_exit(&arc_reclaim_lock);
}
- /*
- * If we are prefetching from the mfu ghost list, this buffer
- * will end up on the mru list; so steal space from there.
- */
- if (state == arc_mfu_ghost)
- state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
- else if (state == arc_mru_ghost)
- state = arc_mru;
-
- if (state == arc_mru || state == arc_anon) {
- uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_mfu->arcs_lsize[type] >= size &&
- arc_p > mru_used) ? arc_mfu : arc_mru;
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_META);
} else {
- /* MFU cases */
- uint64_t mfu_space = arc_c - arc_p;
- state = (arc_mru->arcs_lsize[type] >= size &&
- mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
}
- if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
- }
- ARCSTAT_BUMP(arcstat_recycle_miss);
- }
- ASSERT(buf->b_data != NULL);
-out:
+
/*
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ (void) refcount_add_many(&state->arcs_size, size, buf);
- atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
- if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
size);
@@ -3595,7 +3893,8 @@ out:
* data, and we have outgrown arc_p, update arc_p
*/
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
- arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ (refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
@@ -3638,7 +3937,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/
if (HDR_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- ASSERT(list_link_active(
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -3698,7 +3998,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/
if ((HDR_PREFETCH(hdr)) != 0) {
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /* link protected by hash_lock */
+ ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@@ -3903,7 +4204,7 @@ arc_read_done(zio_t *zio)
}
/*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
@@ -4070,7 +4371,7 @@ top:
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT(hdr->b_l1hdr.b_buf == NULL);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
/* if this is a prefetch, we don't have a reference */
if (*arc_flags & ARC_FLAG_PREFETCH)
@@ -4297,8 +4598,6 @@ arc_clear_callback(arc_buf_t *buf)
kmutex_t *hash_lock;
arc_evict_func_t *efunc = buf->b_efunc;
void *private = buf->b_private;
- list_t *list, *evicted_list;
- kmutex_t *lock, *evicted_lock;
mutex_enter(&buf->b_evict_lock);
hdr = buf->b_hdr;
@@ -4334,7 +4633,7 @@ arc_clear_callback(arc_buf_t *buf)
if (hdr->b_l1hdr.b_datacnt > 1) {
mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else {
ASSERT(buf == hdr->b_l1hdr.b_buf);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
@@ -4364,6 +4663,9 @@ arc_release(arc_buf_t *buf, void *tag)
*/
mutex_enter(&buf->b_evict_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
/*
* We don't grab the hash lock prior to this check, because if
* the buffer's header is in the arc_anon state, it won't be
@@ -4449,8 +4751,10 @@ arc_release(arc_buf_t *buf, void *tag)
buf->b_next = NULL;
ASSERT3P(state, !=, arc_l2c_only);
- ASSERT3U(state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&state->arcs_size, -hdr->b_size);
+
+ (void) refcount_remove_many(
+ &state->arcs_size, hdr->b_size, buf);
+
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
ASSERT3P(state, !=, arc_l2c_only);
uint64_t *size = &state->arcs_lsize[type];
@@ -4487,17 +4791,18 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_l1hdr.b_datacnt = 1;
nhdr->b_l1hdr.b_state = arc_anon;
nhdr->b_l1hdr.b_arc_access = 0;
+ nhdr->b_l1hdr.b_tmp_cdata = NULL;
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
mutex_exit(&buf->b_evict_lock);
- atomic_add_64(&arc_anon->arcs_size, blksz);
+ (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
} else {
mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
- /* protected by hash lock */
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_l1hdr.b_arc_access = 0;
@@ -4759,7 +5064,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
- anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
/*
* Writes will, almost always, require additional memory allocations
@@ -4796,7 +5102,7 @@ static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
- size->value.ui64 = state->arcs_size;
+ size->value.ui64 = refcount_count(&state->arcs_size);
evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
}
@@ -4834,6 +5140,41 @@ arc_kstat_update(kstat_t *ksp, int rw)
return (0);
}
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+ arc_buf_hdr_t *hdr = obj;
+
+ /*
+ * We rely on b_dva to generate evenly distributed index
+ * numbers using buf_hash below. So, as an added precaution,
+ * let's make sure we never add empty buffers to the arc lists.
+ */
+ ASSERT(!BUF_EMPTY(hdr));
+
+ /*
+ * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power-of-two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
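
The even spread claimed above depends on the low-order bits of buf_hash being well mixed; a toy sketch of the modulo mapping (the hash values are made up, and 8 sublists is just an example):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	unsigned int num_sublists = 8;		/* e.g. one per CPU */
	uint64_t hashes[] = { 0x1f3a, 0x92c4, 0x077b, 0xde01 };

	for (int i = 0; i < 4; i++)
		printf("hash %#jx -> sublist %ju\n", (uintmax_t)hashes[i],
		    (uintmax_t)(hashes[i] % num_sublists));
	return (0);
}
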
+
#ifdef _KERNEL
static eventhandler_tag arc_event_lowmem = NULL;
@@ -4841,11 +5182,11 @@ static void
arc_lowmem(void *arg __unused, int howto __unused)
{
- mutex_enter(&arc_reclaim_thr_lock);
+ mutex_enter(&arc_reclaim_lock);
/* XXX: Memory deficit should be passed as argument. */
needfree = btoc(arc_c >> arc_shrink_shift);
DTRACE_PROBE(arc__needfree);
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
/*
* It is unsafe to block here in arbitrary threads, because we can come
@@ -4853,8 +5194,8 @@ arc_lowmem(void *arg __unused, int howto __unused)
* with ARC reclaim thread.
*/
if (curproc == pageproc)
- msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
- mutex_exit(&arc_reclaim_thr_lock);
+ (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+ mutex_exit(&arc_reclaim_lock);
}
#endif
@@ -4863,8 +5204,12 @@ arc_init(void)
{
int i, prefetch_tunable_set = 0;
- mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
@@ -4936,6 +5281,9 @@ arc_init(void)
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
+ if (zfs_arc_num_sublists_per_state < 1)
+ zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1);
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@@ -4953,45 +5301,59 @@ arc_init(void)
arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
- for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
- mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mru_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mfu->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- list_create(&arc_l2c_only->arcs_lists[i],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
- }
+ multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+
+ refcount_create(&arc_anon->arcs_size);
+ refcount_create(&arc_mru->arcs_size);
+ refcount_create(&arc_mru_ghost->arcs_size);
+ refcount_create(&arc_mfu->arcs_size);
+ refcount_create(&arc_mfu_ghost->arcs_size);
+ refcount_create(&arc_l2c_only->arcs_size);
buf_init();
- arc_thread_exit = 0;
+ arc_reclaim_thread_exit = FALSE;
+ arc_user_evicts_thread_exit = FALSE;
arc_eviction_list = NULL;
- mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
@@ -5011,6 +5373,9 @@ arc_init(void)
EVENTHANDLER_PRI_FIRST);
#endif
+ (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
arc_dead = FALSE;
arc_warm = B_FALSE;
@@ -5069,16 +5434,32 @@ arc_init(void)
void
arc_fini(void)
{
- int i;
+ mutex_enter(&arc_reclaim_lock);
+ arc_reclaim_thread_exit = TRUE;
+ /*
+ * The reclaim thread will set arc_reclaim_thread_exit back to
+ * FALSE when it is finished exiting; we're waiting for that.
+ */
+ while (arc_reclaim_thread_exit) {
+ cv_signal(&arc_reclaim_thread_cv);
+ cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+ }
+ mutex_exit(&arc_reclaim_lock);
- mutex_enter(&arc_reclaim_thr_lock);
- arc_thread_exit = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (arc_thread_exit != 0)
- cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
- mutex_exit(&arc_reclaim_thr_lock);
+ mutex_enter(&arc_user_evicts_lock);
+ arc_user_evicts_thread_exit = TRUE;
+ /*
+ * The user evicts thread will set arc_user_evicts_thread_exit
+ * to FALSE when it is finished exiting; we're waiting for that.
+ */
+ while (arc_user_evicts_thread_exit) {
+ cv_signal(&arc_user_evicts_cv);
+ cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
+ }
+ mutex_exit(&arc_user_evicts_lock);
- arc_flush(NULL);
+ /* Use TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, TRUE);
arc_dead = TRUE;
@@ -5087,24 +5468,28 @@ arc_fini(void)
arc_ksp = NULL;
}
- mutex_destroy(&arc_eviction_mtx);
- mutex_destroy(&arc_reclaim_thr_lock);
- cv_destroy(&arc_reclaim_thr_cv);
+ mutex_destroy(&arc_reclaim_lock);
+ cv_destroy(&arc_reclaim_thread_cv);
+ cv_destroy(&arc_reclaim_waiters_cv);
- for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
- list_destroy(&arc_mru->arcs_lists[i]);
- list_destroy(&arc_mru_ghost->arcs_lists[i]);
- list_destroy(&arc_mfu->arcs_lists[i]);
- list_destroy(&arc_mfu_ghost->arcs_lists[i]);
- list_destroy(&arc_l2c_only->arcs_lists[i]);
+ mutex_destroy(&arc_user_evicts_lock);
+ cv_destroy(&arc_user_evicts_cv);
- mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
- }
+ refcount_destroy(&arc_anon->arcs_size);
+ refcount_destroy(&arc_mru->arcs_size);
+ refcount_destroy(&arc_mru_ghost->arcs_size);
+ refcount_destroy(&arc_mfu->arcs_size);
+ refcount_destroy(&arc_mfu_ghost->arcs_size);
+ refcount_destroy(&arc_l2c_only->arcs_size);
+
+ multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
buf_fini();
@@ -5450,39 +5835,68 @@ l2arc_write_done(zio_t *zio)
if (zio->io_error != 0)
ARCSTAT_BUMP(arcstat_l2_writes_error);
- mutex_enter(&dev->l2ad_mtx);
-
/*
* All writes completed, or an error was hit.
*/
+top:
+ mutex_enter(&dev->l2ad_mtx);
for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order in
+ * which the hash lock and l2ad_mtx are taken).
+ */
if (!mutex_tryenter(hash_lock)) {
/*
- * This buffer misses out. It may be in a stage
- * of eviction. Its ARC_FLAG_L2_WRITING flag will be
- * left set, denying reads to this buffer.
+ * Missed the hash lock. We must retry so we
+ * don't leave the ARC_FLAG_L2_WRITING bit set.
*/
- ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
- continue;
+ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+ /*
+ * We don't want to rescan the headers we've
+ * already marked as having been written out, so
+ * we reinsert the head node so we can pick up
+ * where we left off.
+ */
+ list_remove(buflist, head);
+ list_insert_after(buflist, hdr, head);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * We wait for the hash lock to become available
+ * to try to avoid busy waiting, and to increase
+ * the chance we'll be able to acquire the lock
+ * the next time around.
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
}
/*
- * It's possible that this buffer got evicted from the L1 cache
- * before we grabbed the vdev + hash locks, in which case
- * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
- * Only free the buffer if we still have an L1 hdr.
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
*/
- if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We may have allocated a buffer for L2ARC compression;
+ * if so, we must release it to avoid leaking this data.
+ */
+ l2arc_release_cdata_buf(hdr);
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
+ list_remove(buflist, hdr);
trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
@@ -5496,7 +5910,8 @@ l2arc_write_done(zio_t *zio)
}
/*
- * Allow ARC to begin reads to this L2ARC entry.
+ * Allow ARC to begin reads to, and ghost list evictions of,
+ * this L2ARC entry.
*/
hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
@@ -5604,36 +6019,37 @@ l2arc_read_done(zio_t *zio)
* the data lists. This function returns a locked list, and also returns
* the lock pointer.
*/
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
{
- list_t *list = NULL;
- int idx;
-
- ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
-
- if (list_num < ARC_BUFC_NUMMETADATALISTS) {
- idx = list_num;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
- } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
- ARC_BUFC_NUMDATALISTS)) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else {
- idx = list_num - ARC_BUFC_NUMLISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
}
- ASSERT(!(MUTEX_HELD(*lock)));
- mutex_enter(*lock);
- return (list);
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
}
/*
@@ -5678,6 +6094,12 @@ top:
hdr_prev = list_prev(buflist, hdr);
hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order in
+ * which the hash lock and l2ad_mtx are taken).
+ */
if (!mutex_tryenter(hash_lock)) {
/*
* Missed the hash lock. Retry.
@@ -5733,6 +6155,10 @@ top:
hdr->b_flags |= ARC_FLAG_L2_EVICTED;
}
+ /* Ensure this header has finished being written */
+ ASSERT(!HDR_L2_WRITING(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
arc_hdr_l2hdr_destroy(hdr);
}
mutex_exit(hash_lock);
@@ -5756,11 +6182,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *hdr, *hdr_prev, *head;
- list_t *list;
uint64_t write_asize, write_psize, write_sz, headroom,
buf_compress_minsz;
void *buf_data;
- kmutex_t *list_lock;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
@@ -5790,11 +6214,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Copy buffers for L2ARC writing.
*/
- mutex_enter(&dev->l2ad_mtx);
- for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
+ for (try = 0; try <= 3; try++) {
+ multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
- list = l2arc_list_locked(try, &list_lock);
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
@@ -5804,13 +6227,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* head of the ARC lists rather than the tail.
*/
if (arc_warm == B_FALSE)
- hdr = list_head(list);
+ hdr = multilist_sublist_head(mls);
else
- hdr = list_tail(list);
+ hdr = multilist_sublist_tail(mls);
if (hdr == NULL)
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
- headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
+ headroom = target_sz * l2arc_headroom;
if (do_headroom_boost)
headroom = (headroom * l2arc_headroom_boost) / 100;
@@ -5819,9 +6242,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
uint64_t buf_sz;
if (arc_warm == B_FALSE)
- hdr_prev = list_next(list, hdr);
+ hdr_prev = multilist_sublist_next(mls, hdr);
else
- hdr_prev = list_prev(list, hdr);
+ hdr_prev = multilist_sublist_prev(mls, hdr);
ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
hash_lock = HDR_LOCK(hdr);
@@ -5861,7 +6284,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
+ mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
+ mutex_exit(&dev->l2ad_mtx);
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -5915,7 +6340,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
buf_sz = hdr->b_size;
hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
+ mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
/*
* Compute and store the buffer cksum before
@@ -5929,7 +6356,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
write_sz += buf_sz;
}
- mutex_exit(list_lock);
+ multilist_sublist_unlock(mls);
if (full == B_TRUE)
break;
@@ -5938,12 +6365,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_sz);
- mutex_exit(&dev->l2ad_mtx);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
return (0);
}
+ mutex_enter(&dev->l2ad_mtx);
+
/*
* Now start writing the buffers. We're starting at the write head
* and work backwards, retracing the course of the buffer selector
@@ -5954,6 +6382,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
uint64_t buf_sz;
/*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
* We shouldn't need to lock the buffer here, since we flagged
* it as ARC_FLAG_L2_WRITING in the previous step, but we must
* take care to only access its L2 cache parameters. In
@@ -5981,14 +6417,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
buf_sz = hdr->b_l2hdr.b_asize;
/*
- * If the data has not been compressed, then clear b_tmp_cdata
- * to make sure that it points only to a temporary compression
- * buffer.
- */
- if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
- hdr->b_l1hdr.b_tmp_cdata = NULL;
-
- /*
* We need to do this regardless if buf_sz is zero or
* not, otherwise, when this l2hdr is evicted we'll
* remove a reference that was never added.
@@ -6081,6 +6509,12 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
cdata, l2hdr->b_asize);
+ rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
+ if (rounded > csize) {
+ bzero((char *)cdata + csize, rounded - csize);
+ csize = rounded;
+ }
+
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);
@@ -6089,19 +6523,11 @@ l2arc_compress_buf(arc_buf_hdr_t *hdr)
hdr->b_l1hdr.b_tmp_cdata = NULL;
ARCSTAT_BUMP(arcstat_l2_compress_zeros);
return (B_TRUE);
- }
-
- rounded = P2ROUNDUP(csize,
- (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
- if (rounded < len) {
+ } else if (csize > 0 && csize < len) {
/*
* Compression succeeded, we'll keep the cdata around for
* writing and release it afterwards.
*/
- if (rounded > csize) {
- bzero((char *)cdata + csize, rounded - csize);
- csize = rounded;
- }
HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
l2hdr->b_asize = csize;
hdr->b_l1hdr.b_tmp_cdata = cdata;
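
The rounding moved above now happens before the zero-size check, so every nonzero compressed buffer is padded out to the next SPA_MINBLOCKSIZE boundary with zeroed pad bytes. A worked instance of the P2ROUNDUP arithmetic, assuming SPA_MINBLOCKSIZE is 512:

#include <stdio.h>

int
main(void)
{
	size_t minblock = 512;		/* assumed SPA_MINBLOCKSIZE */
	size_t csize = 3000;		/* hypothetical compressed length */
	/* P2ROUNDUP for a power-of-two alignment: */
	size_t rounded = (csize + minblock - 1) & ~(minblock - 1);

	/* 3000 -> 3072; the 72 pad bytes get bzero()ed before writing. */
	printf("%zu -> %zu (%zu pad)\n", csize, rounded, rounded - csize);
	return (0);
}
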
@@ -6189,8 +6615,26 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
+ enum zio_compress comp = HDR_GET_COMPRESS(hdr);
+
ASSERT(HDR_HAS_L1HDR(hdr));
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
+ ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
+
+ if (comp == ZIO_COMPRESS_OFF) {
+ /*
+ * In this case, b_tmp_cdata points to the same buffer
+ * as the arc_buf_t's b_data field. We don't want to
+ * free it, since the arc_buf_t will handle that.
+ */
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
+ } else if (comp == ZIO_COMPRESS_EMPTY) {
+ /*
+ * In this case, b_tmp_cdata was compressed to an empty
+ * buffer, thus there's nothing to free and b_tmp_cdata
+ * should have been set to NULL in l2arc_write_buffers().
+ */
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+ } else {
/*
* If the data was compressed, then we've allocated a
* temporary buffer for it, so now we need to release it.
@@ -6199,9 +6643,8 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
hdr->b_size);
hdr->b_l1hdr.b_tmp_cdata = NULL;
- } else {
- ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
}
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
index 5f7d76f..b2b9887 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
@@ -154,7 +154,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int err;
struct bptree_args *ba = arg;
- if (BP_IS_HOLE(bp))
+ if (bp == NULL || BP_IS_HOLE(bp))
return (0);
err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
new file mode 100644
index 0000000..1ddc697
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue The maximum capacity of the queue is set to
+ * size. Types that want to be stored in a bqueue must contain a bqueue_node_t,
+ * and offset should give its offset from the start of the struct. Return 0 on
+ * success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ return (0);
+}
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ mutex_destroy(&q->bq_lock);
+ list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming item_size units of capacity. If there is
+ * insufficient capacity to consume item_size units, block until enough
+ * capacity exists. Asserts item_size is > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ cv_signal(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ mutex_exit(&q->bq_lock);
+ cv_signal(&q->bq_add_cv);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
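
dmu_send.c below drives this API from a traversal thread; here is a minimal usage sketch with a hypothetical record type (my_record_t and example() are illustrative and not part of this change):

#include <sys/bqueue.h>
#include <sys/zfs_context.h>

/* Any struct can be queued as long as it embeds a bqueue_node_t. */
typedef struct my_record {
	uint64_t	payload;
	bqueue_node_t	node;
} my_record_t;

static void
example(void)
{
	bqueue_t q;
	my_record_t *r;

	(void) bqueue_init(&q, 16 * 1024 * 1024,
	    offsetof(my_record_t, node));

	r = kmem_zalloc(sizeof (*r), KM_SLEEP);
	r->payload = 42;
	bqueue_enqueue(&q, r, sizeof (*r));	/* may block while full */

	r = bqueue_dequeue(&q);			/* blocks while empty */
	kmem_free(r, sizeof (*r));

	bqueue_destroy(&q);			/* asserts the queue drained */
}
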
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 79b6aed..16d8a2e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -548,11 +548,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
return (abuf);
}
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
- if (dn->dn_datablkshift) {
- return (offset >> dn->dn_datablkshift);
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n is the number of block
+ * pointers in an indirect block, raised to the power of level.
+ * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+ return (offset >> (dn->dn_datablkshift + level *
+ (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else {
ASSERT3U(offset, <, dn->dn_datablksz);
return (0);
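
A worked instance of the shift derived above, assuming 128K data blocks (datablkshift = 17) and 16K indirect blocks (indblkshift = 14, so with SPA_BLKPTRSHIFT = 7 each indirect holds 2^7 = 128 block pointers):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int datablkshift = 17;		/* assumed 128K level-0 blocks */
	int epbs = 14 - 7;		/* indblkshift - SPA_BLKPTRSHIFT */
	uint64_t offset = 100ULL << 20;	/* byte 100M of the object */

	uint64_t l0 = offset >> datablkshift;		/* blkid 800 */
	uint64_t l1 = offset >> (datablkshift + epbs);	/* blkid 6 */

	printf("level 0 -> %ju, level 1 -> %ju\n",
	    (uintmax_t)l0, (uintmax_t)l1);
	return (0);
}
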
@@ -1549,6 +1573,11 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
struct dirty_leaf *dl;
dmu_object_type_t type;
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
DB_DNODE_ENTER(db);
type = DB_DNODE(db)->dn_type;
DB_DNODE_EXIT(db);
@@ -1715,6 +1744,12 @@ dbuf_clear(dmu_buf_impl_t *db)
dbuf_rele(parent, db);
}
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1755,7 +1790,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
} else if (level < nlevels-1) {
/* this block is referenced from an indirect block */
int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
@@ -1930,11 +1965,96 @@ dbuf_destroy(dmu_buf_impl_t *db)
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+ int dpa_curlevel; /* The current level that we're reading */
+ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return;
+
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+ ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+ ASSERT(dpa->dpa_zio != NULL);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+ ASSERT3S(dpa->dpa_curlevel, >, 0);
+ if (zio != NULL) {
+ ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+ ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+ ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+ }
+
+ dpa->dpa_curlevel--;
+
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+ P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+ if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ kmem_free(dpa, sizeof (*dpa));
+ } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+ dbuf_issue_final_prefetch(dpa, bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+ SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+ dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+ (void) arc_buf_remove_ref(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete.
+ */
void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
{
- dmu_buf_impl_t *db = NULL;
- blkptr_t *bp = NULL;
+ blkptr_t bp;
+ int epbs, nlevels, curlevel;
+ uint64_t curblkid;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -1942,35 +2062,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
if (dnode_block_freed(dn, blkid))
return;
- /* dbuf_find() returns with db_mtx held */
- if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) {
+ /*
+ * This dnode hasn't been written to disk yet, so there's nothing to
+ * prefetch.
+ */
+ nlevels = dn->dn_phys->dn_nlevels;
+ if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+ return;
+
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+ return;
+
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+ level, blkid);
+ if (db != NULL) {
+ mutex_exit(&db->db_mtx);
/*
- * This dbuf is already in the cache. We assume that
- * it is already CACHED, or else about to be either
- * read or filled.
+ * This dbuf already exists. It is either CACHED, or
+ * (we assume) about to be read or filled.
*/
- mutex_exit(&db->db_mtx);
return;
}
- if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
- if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- arc_flags_t aflags =
- ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
- zbookmark_phys_t zb;
+ /*
+ * Find the closest ancestor (indirect block) of the target block
+ * that is present in the cache. In this indirect block, we will
+ * find the bp that is at curlevel, curblkid.
+ */
+ curlevel = level;
+ curblkid = blkid;
+ while (curlevel < nlevels - 1) {
+ int parent_level = curlevel + 1;
+ uint64_t parent_blkid = curblkid >> epbs;
+ dmu_buf_impl_t *db;
+
+ if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+ FALSE, TRUE, FTAG, &db) == 0) {
+ blkptr_t *bpp = db->db_buf->b_data;
+ bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+ dbuf_rele(db, FTAG);
+ break;
+ }
- SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, 0, blkid);
+ curlevel = parent_level;
+ curblkid = parent_blkid;
+ }
- (void) arc_read(NULL, dn->dn_objset->os_spa,
- bp, NULL, NULL, prio,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zb);
- }
- if (db)
- dbuf_rele(db, NULL);
+ if (curlevel == nlevels - 1) {
+ /* No cached indirect blocks found. */
+ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+ bp = dn->dn_phys->dn_blkptr[curblkid];
+ }
+ if (BP_IS_HOLE(&bp))
+ return;
+
+ ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, level, blkid);
+ dpa->dpa_curlevel = curlevel;
+ dpa->dpa_prio = prio;
+ dpa->dpa_aflags = aflags;
+ dpa->dpa_spa = dn->dn_objset->os_spa;
+ dpa->dpa_epbs = epbs;
+ dpa->dpa_zio = pio;
+
+ /*
+ * If we have the indirect just above us, no need to do the asynchronous
+ * prefetch chain; we'll just run the last step ourselves. If we're at
+ * a higher level, though, we want to issue the prefetches for all the
+ * indirect blocks asynchronously, so we can go on with whatever we were
+ * doing.
+ */
+ if (curlevel == level) {
+ ASSERT3U(curblkid, ==, blkid);
+ dbuf_issue_final_prefetch(dpa, &bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, curlevel, curblkid);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
}
+ /*
+ * We use pio here instead of dpa_zio since it's possible that
+ * dpa may have already been freed.
+ */
+ zio_nowait(pio);
}
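
The ancestor search above climbs one level per iteration, shifting the blkid right by epbs each time until it finds a cached indirect block or reaches the dnode's top level. A sketch of the climb, assuming epbs = 7:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int epbs = 7;			/* assumed 128 bps per indirect */
	int nlevels = 3, level = 0;
	uint64_t blkid = 100000;

	while (level < nlevels - 1) {
		level++;
		blkid >>= epbs;		/* 100000 -> 781 -> 6 */
		printf("level %d, blkid %ju\n", level, (uintmax_t)blkid);
	}
	return (0);
}
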
/*
@@ -1978,7 +2167,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
* Note: dn_struct_rwlock must be held.
*/
int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
@@ -1996,6 +2186,9 @@ top:
blkptr_t *bp = NULL;
int err;
+ if (fail_uncached)
+ return (SET_ERROR(ENOENT));
+
ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
@@ -2012,6 +2205,11 @@ top:
db = dbuf_create(dn, level, blkid, parent, bp);
}
+ if (fail_uncached && db->db_state != DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
if (db->db_buf && refcount_is_zero(&db->db_holds)) {
arc_buf_add_ref(db->db_buf, db);
if (db->db_buf->b_data == NULL) {
@@ -2067,16 +2265,14 @@ top:
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
+ return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db);
}
@@ -2429,8 +2625,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
if (parent == NULL) {
mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, db, &parent);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx);
db->db_parent = parent;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index f45071b..91cd511 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -141,7 +141,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
@@ -424,7 +424,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
if (db == NULL) {
@@ -528,17 +528,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
}
/*
- * Issue prefetch i/os for the given blocks.
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
*
- * Note: The assumption is that we *know* these blocks will be needed
- * almost immediately. Therefore, the prefetch i/os will be issued at
- * ZIO_PRIORITY_SYNC_READ
- *
- * Note: indirect blocks and other metadata will be read synchronously,
- * causing this function to block if they are not already cached.
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
*/
void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
uint64_t blkid;
@@ -554,8 +553,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
return;
}
@@ -570,18 +570,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
- P2ALIGN(offset, 1 << blkshift)) >> blkshift;
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+ * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is
+ * the last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is
+ * last - first + 1.
+ */
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
} else {
nblks = (offset < dn->dn_datablksz);
}
if (nblks != 0) {
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, level, offset);
for (int i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
rw_exit(&dn->dn_struct_rwlock);
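
A worked instance of the count derived above, assuming level 0 and 128K (2^17) data blocks:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int shift = 17;				/* assumed 128K blocks */
	uint64_t offset = 96ULL << 10;		/* 96K */
	uint64_t len = 300ULL << 10;		/* 300K */

	uint64_t first = offset >> shift;		/* block 0 */
	uint64_t last = (offset + len - 1) >> shift;	/* block 3 */

	printf("nblks = %ju\n", (uintmax_t)(last - first + 1));	/* 4 */
	return (0);
}
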
@@ -1393,7 +1399,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(dbuf);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
index bd9e894..e88968b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
@@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (issig(JUSTLOOKING) && issig(FORREAL))
return (SET_ERROR(EINTR));
- if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
return (0);
if (BP_IS_HOLE(bp)) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 808864a..6ca021e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (0);
}
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index be1f46d..267aa35 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
@@ -53,6 +53,7 @@
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
+#include <sys/bqueue.h>
#ifdef __FreeBSD__
#undef dump_write
@@ -61,10 +62,34 @@
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";
+#define BP_SPAN(datablkszsec, indblkshift, level) \
+ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (indblkshift - SPA_BLKPTRSHIFT)))
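
A worked instance of the macro above, assuming 128K data blocks (datablkszsec = 256 sectors) and 16K indirects (indblkshift = 14), with SPA_MINBLOCKSHIFT = 9 and SPA_BLKPTRSHIFT = 7 spelled out as literals:

#include <stdint.h>
#include <stdio.h>

#define BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)(datablkszsec)) << (9 + \
	(level) * ((indblkshift) - 7)))

int
main(void)
{
	/* Logical bytes covered: 128K at level 0, 16M at 1, 2G at 2. */
	for (int level = 0; level < 3; level++)
		printf("level %d spans %ju bytes\n", level,
		    (uintmax_t)BP_SPAN(256, 14, level));
	return (0);
}
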
+
+struct send_thread_arg {
+ bqueue_t q;
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+};
+
+struct send_block_record {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ blkptr_t bp;
+ zbookmark_phys_t zb;
+ uint8_t indblkshift;
+ uint16_t datablkszsec;
+ bqueue_node_t ln;
+};
+
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
@@ -455,58 +480,116 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
return (B_FALSE);
}
-#define BP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/*
+ * This is the callback function to traverse_dataset; it runs in the worker
+ * thread that feeds records to dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct send_thread_arg *sta = arg;
+ struct send_block_record *record;
+ uint64_t record_size;
+ int err = 0;
-/* ARGSUSED */
+ if (sta->cancel)
+ return (SET_ERROR(EINTR));
+
+ if (bp == NULL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ }
+
+ record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+ record->bp = *bp;
+ record->zb = *zb;
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablkszsec = dnp->dn_datablkszsec;
+ record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ bqueue_enqueue(&sta->q, record, record_size);
+
+ return (err);
+}
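
Each queued record is charged at the logical data block size, so the queue's 16MB budget (zfs_send_queue_length above) bounds how far the traversal can run ahead of the dump loop. A worked instance, assuming 128K blocks (dn_datablkszsec = 256, SPA_MINBLOCKSHIFT = 9):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t datablkszsec = 256;	/* assumed: 256 512-byte sectors */
	uint64_t record_size = (uint64_t)datablkszsec << 9;

	/* 131072: roughly 128 such records fit in a 16MB queue budget. */
	printf("record_size = %ju\n", (uintmax_t)record_size);
	return (0);
}
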
+
+/*
+ * This function kicks off the traverse_dataset call. It also handles setting
+ * the error code of the thread in case something goes wrong, and pushes the
+ * End of Stream record when the traversal has finished. If there is no
+ * dataset to traverse, the thread immediately pushes the End of Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+ struct send_thread_arg *st_arg = arg;
+ int err;
+ struct send_block_record *data;
+
+ if (st_arg->ds != NULL) {
+ err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
+ st_arg->flags, send_cb, arg);
+ if (err != EINTR)
+ st_arg->error_code = err;
+ }
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ data->eos_marker = B_TRUE;
+ bqueue_enqueue(&st_arg->q, data, 1);
+ thread_exit();
+}
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
- dmu_sendarg_t *dsp = arg;
+ dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+ const blkptr_t *bp = &data->bp;
+ const zbookmark_phys_t *zb = &data->zb;
+ uint8_t indblkshift = data->indblkshift;
+ uint16_t dblkszsec = data->datablkszsec;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
int err = 0;
- if (issig(JUSTLOOKING) && issig(FORREAL))
- return (SET_ERROR(EINTR));
+ ASSERT3U(zb->zb_level, >=, 0);
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
- } else if (zb->zb_level == ZB_ZIL_LEVEL) {
- /*
- * If we are sending a non-snapshot (which is allowed on
- * read-only pools), it may have a ZIL, which must be ignored.
- */
- return (0);
} else if (BP_IS_HOLE(bp) &&
zb->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t span = BP_SPAN(dnp, zb->zb_level);
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
- err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
+ err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
} else if (BP_IS_HOLE(bp)) {
- uint64_t span = BP_SPAN(dnp, zb->zb_level);
- err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+ uint64_t offset = zb->zb_blkid * span;
+ err = dump_free(dsa, zb->zb_object, offset, span);
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- dnode_phys_t *blk;
- int i;
int blksz = BP_GET_LSIZE(bp);
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
+ ASSERT0(zb->zb_level);
+
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&aflags, zb) != 0)
return (SET_ERROR(EIO));
- blk = abuf->b_data;
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj = (zb->zb_blkid <<
- (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
- err = dump_dnode(dsp, dnobj, blk+i);
+ dnode_phys_t *blk = abuf->b_data;
+ uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+ for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
}
@@ -521,20 +604,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
&aflags, zb) != 0)
return (SET_ERROR(EIO));
- err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
+ err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
(void) arc_buf_remove_ref(abuf, &abuf);
- } else if (backup_do_embed(dsp, bp)) {
+ } else if (backup_do_embed(dsa, bp)) {
/* it's an embedded level-0 block of a regular object */
- int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- err = dump_write_embedded(dsp, zb->zb_object,
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ ASSERT0(zb->zb_level);
+ err = dump_write_embedded(dsa, zb->zb_object,
zb->zb_blkid * blksz, blksz, bp);
- } else { /* it's a level-0 block of a regular object */
+ } else {
+ /* it's a level-0 block of a regular object */
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
- int blksz = BP_GET_LSIZE(bp);
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
uint64_t offset;
- ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@@ -555,20 +639,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
offset = zb->zb_blkid * blksz;
- if (!(dsp->dsa_featureflags &
+ if (!(dsa->dsa_featureflags &
DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
blksz > SPA_OLD_MAXBLOCKSIZE) {
char *buf = abuf->b_data;
while (blksz > 0 && err == 0) {
int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
- err = dump_write(dsp, type, zb->zb_object,
+ err = dump_write(dsa, type, zb->zb_object,
offset, n, NULL, buf);
offset += n;
buf += n;
blksz -= n;
}
} else {
- err = dump_write(dsp, type, zb->zb_object,
+ err = dump_write(dsa, type, zb->zb_object,
offset, blksz, bp, abuf->b_data);
}
(void) arc_buf_remove_ref(abuf, &abuf);
@@ -579,11 +663,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
/*
- * Releases dp using the specified tag.
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+ struct send_block_record *tmp = bqueue_dequeue(bq);
+ kmem_free(data, sizeof (*data));
+ return (tmp);
+}
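The helper above fixes the queue's ownership rule: the consumer holds exactly
one record at a time and frees it only after dequeuing its successor. A
consumer loop built on it takes the shape below (a hedged sketch using names
from this patch, matching the loop in dmu_send_impl further down):

	struct send_block_record *data = bqueue_dequeue(&to_arg.q);
	while (!data->eos_marker && err == 0) {
		err = do_dump(dsp, data);
		data = get_next_record(&to_arg.q, data);
	}
	/* On error, keep draining until eos so the producer can exit. */
	while (!data->eos_marker)
		data = get_next_record(&to_arg.q, data);
	kmem_free(data, sizeof (*data));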
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
*/
static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
- zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+ zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
#ifdef illumos
boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
#else
@@ -596,8 +693,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
int err;
uint64_t fromtxg = 0;
uint64_t featureflags = 0;
+ struct send_thread_arg to_arg;
- err = dmu_objset_from_ds(ds, &os);
+ err = dmu_objset_from_ds(to_ds, &os);
if (err != 0) {
dsl_pool_rele(dp, tag);
return (err);
@@ -623,35 +721,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
- if (large_block_ok && ds->ds_large_blocks)
+ if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
- } else {
- embedok = B_FALSE;
}
DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
featureflags);
drr->drr_u.drr_begin.drr_creation_time =
- dsl_dataset_phys(ds)->ds_creation_time;
+ dsl_dataset_phys(to_ds)->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
if (is_clone)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
- drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
- if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
- if (fromzb != NULL) {
- drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
- fromtxg = fromzb->zbm_creation_txg;
+ if (ancestor_zb != NULL) {
+ drr->drr_u.drr_begin.drr_fromguid =
+ ancestor_zb->zbm_guid;
+ fromtxg = ancestor_zb->zbm_creation_txg;
}
- dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
- if (!ds->ds_is_snapshot) {
+ dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+ if (!to_ds->ds_is_snapshot) {
(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
sizeof (drr->drr_u.drr_begin.drr_toname));
}
@@ -665,16 +762,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
dsp->dsa_fp = fp;
dsp->dsa_os = os;
dsp->dsa_off = off;
- dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
+ dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
- dsp->dsa_incremental = (fromzb != NULL);
+ dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
- mutex_enter(&ds->ds_sendstream_lock);
- list_insert_head(&ds->ds_sendstreams, dsp);
- mutex_exit(&ds->ds_sendstream_lock);
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_insert_head(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
- dsl_dataset_long_hold(ds, FTAG);
+ dsl_dataset_long_hold(to_ds, FTAG);
dsl_pool_rele(dp, tag);
if (dump_record(dsp, NULL, 0) != 0) {
@@ -682,8 +779,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
goto out;
}
- err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
- backup_cb, dsp);
+ err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+ offsetof(struct send_block_record, ln));
+ to_arg.error_code = 0;
+ to_arg.cancel = B_FALSE;
+ to_arg.ds = to_ds;
+ to_arg.fromtxg = fromtxg;
+ to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+ (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
+ TS_RUN, minclsyspri);
+
+ struct send_block_record *to_data;
+ to_data = bqueue_dequeue(&to_arg.q);
+
+ while (!to_data->eos_marker && err == 0) {
+ err = do_dump(dsp, to_data);
+ to_data = get_next_record(&to_arg.q, to_data);
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ err = EINTR;
+ }
+
+ if (err != 0) {
+ to_arg.cancel = B_TRUE;
+ while (!to_data->eos_marker) {
+ to_data = get_next_record(&to_arg.q, to_data);
+ }
+ }
+ kmem_free(to_data, sizeof (*to_data));
+
+ bqueue_destroy(&to_arg.q);
+
+ if (err == 0 && to_arg.error_code != 0)
+ err = to_arg.error_code;
+
+ if (err != 0)
+ goto out;
if (dsp->dsa_pending_op != PENDING_NONE)
if (dump_record(dsp, NULL, 0) != 0)
@@ -700,20 +830,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
- if (dump_record(dsp, NULL, 0) != 0) {
+ if (dump_record(dsp, NULL, 0) != 0)
err = dsp->dsa_err;
- goto out;
- }
out:
- mutex_enter(&ds->ds_sendstream_lock);
- list_remove(&ds->ds_sendstreams, dsp);
- mutex_exit(&ds->ds_sendstream_lock);
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_remove(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
kmem_free(drr, sizeof (dmu_replay_record_t));
kmem_free(dsp, sizeof (dmu_sendarg_t));
- dsl_dataset_long_rele(ds, FTAG);
+ dsl_dataset_long_rele(to_ds, FTAG);
return (err);
}
@@ -1144,7 +1272,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
* If it's a non-clone incremental, we are missing the
* target fs, so fail the recv.
*/
- if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+ drba->drba_origin))
return (SET_ERROR(ENOENT));
/* Open the parent of tofs */
@@ -1250,13 +1379,6 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
- if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- !newds->ds_large_blocks) {
- dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
- newds->ds_large_blocks = B_TRUE;
- }
-
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1326,22 +1448,58 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
&drba, 5, ZFS_SPACE_CHECK_NORMAL));
}
-struct restorearg {
+struct receive_record_arg {
+ dmu_replay_record_t header;
+ void *payload; /* Pointer to a buffer containing the payload */
+ /*
+ * If the record is a write, pointer to the arc_buf_t containing the
+ * payload.
+ */
+ arc_buf_t *write_buf;
+ int payload_size;
+ boolean_t eos_marker; /* Marks the end of the stream */
+ bqueue_node_t node;
+};
+
+struct receive_writer_arg {
objset_t *os;
- int err;
boolean_t byteswap;
+ bqueue_t q;
+ /*
+ * These three args are used to signal to the main thread that we're
+ * done.
+ */
+ kmutex_t mutex;
+ kcondvar_t cv;
+ boolean_t done;
+ int err;
+ /* A map from guid to dataset to help handle dedup'd streams. */
+ avl_tree_t *guid_to_ds_map;
+};
+
+struct receive_arg {
+ objset_t *os;
kthread_t *td;
struct file *fp;
- uint64_t voff;
- int bufsize; /* amount of memory allocated for buf */
-
- dmu_replay_record_t *drr;
- dmu_replay_record_t *next_drr;
- char *buf;
+ uint64_t voff; /* The current offset in the stream */
+ /*
+ * A record that has had its payload read in, but hasn't yet been handed
+ * off to the worker thread.
+ */
+ struct receive_record_arg *rrd;
+ /* A record that has had its header read in, but not its payload. */
+ struct receive_record_arg *next_rrd;
zio_cksum_t cksum;
zio_cksum_t prev_cksum;
+ int err;
+ boolean_t byteswap;
+ /* Sorted list of objects not to issue prefetches for. */
+ list_t ignore_obj_list;
+};
- avl_tree_t *guid_to_ds_map;
+struct receive_ign_obj_node {
+ list_node_t node;
+ uint64_t object;
};
typedef struct guid_map_entry {
@@ -1380,7 +1538,7 @@ free_guid_map_onexit(void *arg)
}
static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
+restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
struct uio auio;
struct iovec aiov;
@@ -1406,13 +1564,12 @@ restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *res
}
static int
-restore_read(struct restorearg *ra, int len, void *buf)
+receive_read(struct receive_arg *ra, int len, void *buf)
{
int done = 0;
/* some things will require 8-byte alignment, so everything must be 8-byte aligned */
ASSERT0(len % 8);
- ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1529,7 +1686,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
}
static int
-restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ void *data)
{
dmu_object_info_t doi;
dmu_tx_t *tx;
@@ -1543,12 +1701,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
- err = dmu_object_info(ra->os, drro->drr_object, &doi);
+ err = dmu_object_info(rwa->os, drro->drr_object, &doi);
if (err != 0 && err != ENOENT)
return (SET_ERROR(EINVAL));
@@ -1567,14 +1725,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
if (drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr) {
- err = dmu_free_long_range(ra->os, drro->drr_object,
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
}
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_bonus(tx, object);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
@@ -1584,7 +1742,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- err = dmu_object_claim(ra->os, drro->drr_object,
+ err = dmu_object_claim(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen, tx);
} else if (drro->drr_type != doi.doi_type ||
@@ -1592,7 +1750,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
drro->drr_bonustype != doi.doi_bonus_type ||
drro->drr_bonuslen != doi.doi_bonus_size) {
/* currently allocated, but with different properties */
- err = dmu_object_reclaim(ra->os, drro->drr_object,
+ err = dmu_object_reclaim(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen, tx);
}
@@ -1601,20 +1759,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
return (SET_ERROR(EINVAL));
}
- dmu_object_set_checksum(ra->os, drro->drr_object,
+ dmu_object_set_checksum(rwa->os, drro->drr_object,
drro->drr_checksumtype, tx);
- dmu_object_set_compress(ra->os, drro->drr_object,
+ dmu_object_set_compress(rwa->os, drro->drr_object,
drro->drr_compress, tx);
if (data != NULL) {
dmu_buf_t *db;
- VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
bcopy(data, db->db_data, drro->drr_bonuslen);
- if (ra->byteswap) {
+ if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drro->drr_bonustype);
dmu_ot_byteswap[byteswap].ob_func(db->db_data,
@@ -1628,7 +1786,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
/* ARGSUSED */
static int
-restore_freeobjects(struct restorearg *ra,
+receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
@@ -1638,13 +1796,13 @@ restore_freeobjects(struct restorearg *ra,
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
- (void) dmu_object_next(ra->os, &obj, FALSE, 0)) {
+ (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
- if (dmu_object_info(ra->os, obj, NULL) != 0)
+ if (dmu_object_info(rwa->os, obj, NULL) != 0)
continue;
- err = dmu_free_long_object(ra->os, obj);
+ err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
}
@@ -1652,7 +1810,8 @@ restore_freeobjects(struct restorearg *ra,
}
static int
-restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+ arc_buf_t *abuf)
{
dmu_tx_t *tx;
int err;
@@ -1661,10 +1820,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
- if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrw->drr_object,
drrw->drr_offset, drrw->drr_length);
@@ -1673,7 +1832,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
dmu_tx_abort(tx);
return (err);
}
- if (ra->byteswap) {
+ if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drrw->drr_type);
dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
@@ -1681,7 +1840,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
}
dmu_buf_t *bonus;
- if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0)
+ if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
return (SET_ERROR(EINVAL));
dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
dmu_tx_commit(tx);
@@ -1697,7 +1856,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
* data from the stream to fulfill this write.
*/
static int
-restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
+receive_write_byref(struct receive_writer_arg *rwa,
+ struct drr_write_byref *drrwbr)
{
dmu_tx_t *tx;
int err;
@@ -1716,14 +1876,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
*/
if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
gmesrch.guid = drrwbr->drr_refguid;
- if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+ if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
&where)) == NULL) {
return (SET_ERROR(EINVAL));
}
if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
return (SET_ERROR(EINVAL));
} else {
- ref_os = ra->os;
+ ref_os = rwa->os;
}
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
@@ -1731,7 +1891,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
if (err != 0)
return (err);
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length);
@@ -1740,7 +1900,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
dmu_tx_abort(tx);
return (err);
}
- dmu_write(ra->os, drrwbr->drr_object,
+ dmu_write(rwa->os, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
dmu_buf_rele(dbp, FTAG);
dmu_tx_commit(tx);
@@ -1748,7 +1908,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
}
static int
-restore_write_embedded(struct restorearg *ra,
+receive_write_embedded(struct receive_writer_arg *rwa,
struct drr_write_embedded *drrwnp, void *data)
{
dmu_tx_t *tx;
@@ -1765,7 +1925,7 @@ restore_write_embedded(struct restorearg *ra,
if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwnp->drr_object,
drrwnp->drr_offset, drrwnp->drr_length);
@@ -1775,36 +1935,37 @@ restore_write_embedded(struct restorearg *ra,
return (err);
}
- dmu_write_embedded(ra->os, drrwnp->drr_object,
+ dmu_write_embedded(rwa->os, drrwnp->drr_object,
drrwnp->drr_offset, data, drrwnp->drr_etype,
drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
- ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
dmu_tx_commit(tx);
return (0);
}
static int
-restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+ void *data)
{
dmu_tx_t *tx;
dmu_buf_t *db, *db_spill;
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os)))
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
return (SET_ERROR(EINVAL));
- if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG);
return (err);
}
- tx = dmu_tx_create(ra->os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_spill(tx, db->db_object);
@@ -1831,7 +1992,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
/* ARGSUSED */
static int
-restore_free(struct restorearg *ra, struct drr_free *drrf)
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
{
int err;
@@ -1839,11 +2000,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf)
drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
return (SET_ERROR(EINVAL));
- if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- err = dmu_free_long_range(ra->os, drrf->drr_object,
+ err = dmu_free_long_range(rwa->os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
+
return (err);
}
@@ -1858,7 +2020,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
}
static void
-restore_cksum(struct restorearg *ra, int len, void *buf)
+receive_cksum(struct receive_arg *ra, int len, void *buf)
{
if (ra->byteswap) {
fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
@@ -1868,30 +2030,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf)
}
/*
- * If len != 0, read payload into buf.
- * Read next record's header into ra->next_drr.
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
* Verify checksum of payload and next record.
*/
static int
-restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
{
int err;
if (len != 0) {
- ASSERT3U(len, <=, ra->bufsize);
- err = restore_read(ra, len, buf);
+ ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+ ra->rrd->payload = buf;
+ ra->rrd->payload_size = len;
+ err = receive_read(ra, len, ra->rrd->payload);
if (err != 0)
return (err);
- restore_cksum(ra, len, buf);
+ receive_cksum(ra, len, ra->rrd->payload);
}
ra->prev_cksum = ra->cksum;
- err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr);
- if (err != 0)
+ ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+ err = receive_read(ra, sizeof (ra->next_rrd->header),
+ &ra->next_rrd->header);
+ if (err != 0) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
return (err);
- if (ra->next_drr->drr_type == DRR_BEGIN)
+ }
+ if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
return (SET_ERROR(EINVAL));
+ }
/*
* Note: checksum is of everything up to but not including the
@@ -1899,107 +2073,248 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
*/
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- restore_cksum(ra,
+ receive_cksum(ra,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
- ra->next_drr);
+ &ra->next_rrd->header);
- zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum;
- zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t cksum_orig =
+ ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t *cksump =
+ &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
if (ra->byteswap)
- byteswap_record(ra->next_drr);
+ byteswap_record(&ra->next_rrd->header);
if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
- !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump))
+ !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
return (SET_ERROR(ECKSUM));
+ }
- restore_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+ receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
return (0);
}
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object. We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches). We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records). As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+ uint64_t object, uint64_t offset, uint64_t length)
+{
+ struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
+ while (node != NULL && node->object < object) {
+ VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&ra->ignore_obj_list);
+ }
+ if (node == NULL || node->object > object) {
+ dmu_prefetch(ra->os, object, 1, offset, length,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+}
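To make the trimming rule concrete: if ignore_obj_list holds objects {5, 9}
and a write record for object 7 arrives, the loop above frees node 5 (5 < 7)
and stops at node 9; since 9 > 7, the prefetch for object 7 is issued. A
later record for object 9 would match the head node exactly and be skipped.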
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
static int
-restore_process_record(struct restorearg *ra)
+receive_read_record(struct receive_arg *ra)
{
int err;
- switch (ra->drr->drr_type) {
+ switch (ra->rrd->header.drr_type) {
case DRR_OBJECT:
{
- struct drr_object *drro = &ra->drr->drr_u.drr_object;
- err = restore_read_payload_and_next_header(ra,
- P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
- if (err != 0)
+ struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+ uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+ dmu_object_info_t doi;
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
return (err);
- return (restore_object(ra, drro, ra->buf));
+ }
+ err = dmu_object_info(ra->os, drro->drr_object, &doi);
+ /*
+ * See receive_read_prefetch for an explanation why we're
+ * storing this object in the ignore_obj_list.
+ */
+ if (err == ENOENT ||
+ (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+ struct receive_ign_obj_node *node =
+ kmem_zalloc(sizeof (*node),
+ KM_SLEEP);
+ node->object = drro->drr_object;
+#ifdef ZFS_DEBUG
+ struct receive_ign_obj_node *last_object =
+ list_tail(&ra->ignore_obj_list);
+ uint64_t last_objnum = (last_object != NULL ?
+ last_object->object : 0);
+ ASSERT3U(node->object, >, last_objnum);
+#endif
+ list_insert_tail(&ra->ignore_obj_list, node);
+ err = 0;
+ }
+ return (err);
}
case DRR_FREEOBJECTS:
{
- struct drr_freeobjects *drrfo =
- &ra->drr->drr_u.drr_freeobjects;
- err = restore_read_payload_and_next_header(ra, 0, NULL);
- if (err != 0)
- return (err);
- return (restore_freeobjects(ra, drrfo));
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
}
case DRR_WRITE:
{
- struct drr_write *drrw = &ra->drr->drr_u.drr_write;
+ struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
drrw->drr_length);
- err = restore_read_payload_and_next_header(ra,
+ err = receive_read_payload_and_next_header(ra,
drrw->drr_length, abuf->b_data);
- if (err != 0)
- return (err);
- err = restore_write(ra, drrw, abuf);
- /* if restore_write() is successful, it consumes the arc_buf */
- if (err != 0)
+ if (err != 0) {
dmu_return_arcbuf(abuf);
+ return (err);
+ }
+ ra->rrd->write_buf = abuf;
+ receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+ drrw->drr_length);
return (err);
}
case DRR_WRITE_BYREF:
{
- struct drr_write_byref *drrwbr =
- &ra->drr->drr_u.drr_write_byref;
- err = restore_read_payload_and_next_header(ra, 0, NULL);
- if (err != 0)
- return (err);
- return (restore_write_byref(ra, drrwbr));
+ struct drr_write_byref *drrwb =
+ &ra->rrd->header.drr_u.drr_write_byref;
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+ drrwb->drr_length);
+ return (err);
}
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded *drrwe =
- &ra->drr->drr_u.drr_write_embedded;
- err = restore_read_payload_and_next_header(ra,
- P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
- if (err != 0)
+ &ra->rrd->header.drr_u.drr_write_embedded;
+ uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
return (err);
- return (restore_write_embedded(ra, drrwe, ra->buf));
+ }
+
+ receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+ drrwe->drr_length);
+ return (err);
}
case DRR_FREE:
{
- struct drr_free *drrf = &ra->drr->drr_u.drr_free;
- err = restore_read_payload_and_next_header(ra, 0, NULL);
- if (err != 0)
- return (err);
- return (restore_free(ra, drrf));
+ /*
+ * It might be beneficial to prefetch indirect blocks here, but
+ * we don't really have the data to decide for sure.
+ */
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
}
case DRR_END:
{
- struct drr_end *drre = &ra->drr->drr_u.drr_end;
+ struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
return (SET_ERROR(EINVAL));
return (0);
}
case DRR_SPILL:
{
- struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
- err = restore_read_payload_and_next_header(ra,
- drrs->drr_length, ra->buf);
+ struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+ void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+ err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+ buf);
if (err != 0)
- return (err);
- return (restore_spill(ra, drrs, ra->buf));
+ kmem_free(buf, drrs->drr_length);
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err;
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ return (receive_freeobjects(rwa, drrfo));
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ err = receive_write(rwa, drrw, rrd->write_buf);
+ /* if receive_write() is successful, it consumes the arc_buf */
+ if (err != 0)
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ return (receive_write_byref(rwa, drrwbr));
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ return (receive_free(rwa, drrf));
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
}
default:
return (SET_ERROR(EINVAL));
@@ -2007,6 +2322,51 @@ restore_process_record(struct restorearg *ra)
}
/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record. When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ if (rwa->err == 0) {
+ rwa->err = receive_process_record(rwa, rrd);
+ } else if (rrd->write_buf != NULL) {
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ thread_exit();
+}
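Note the trailing kmem_free: the loop exits while still holding the eos
record, which must be freed like any other before the done/cv handshake.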
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool. There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
@@ -2014,7 +2374,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
int cleanup_fd, uint64_t *action_handlep)
{
int err = 0;
- struct restorearg ra = { 0 };
+ struct receive_arg ra = { 0 };
+ struct receive_writer_arg rwa = { 0 };
int featureflags;
ra.byteswap = drc->drc_byteswap;
@@ -2022,10 +2383,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
- ra.bufsize = SPA_MAXBLOCKSIZE;
- ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
- ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);
+ list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
+ offsetof(struct receive_ign_obj_node, node));
/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2056,48 +2415,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
}
if (*action_handlep == 0) {
- ra.guid_to_ds_map =
+ rwa.guid_to_ds_map =
kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(ra.guid_to_ds_map, guid_compare,
+ avl_create(rwa.guid_to_ds_map, guid_compare,
sizeof (guid_map_entry_t),
offsetof(guid_map_entry_t, avlnode));
err = zfs_onexit_add_cb(minor,
- free_guid_map_onexit, ra.guid_to_ds_map,
+ free_guid_map_onexit, rwa.guid_to_ds_map,
action_handlep);
if (err != 0)
goto out;
} else {
err = zfs_onexit_cb_data(minor, *action_handlep,
- (void **)&ra.guid_to_ds_map);
+ (void **)&rwa.guid_to_ds_map);
if (err != 0)
goto out;
}
- drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+ drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
}
- err = restore_read_payload_and_next_header(&ra, 0, NULL);
- if (err != 0)
+ err = receive_read_payload_and_next_header(&ra, 0, NULL);
+ if (err != 0)
goto out;
- for (;;) {
- void *tmp;
+ (void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa.os = ra.os;
+ rwa.byteswap = drc->drc_byteswap;
+
+ (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc,
+ TS_RUN, minclsyspri);
+ /*
+ * We're reading rwa.err without locks, which is safe since we are the
+ * only reader, and the worker thread is the only writer. It's ok if we
+ * miss a write for an iteration or two of the loop, since the writer
+ * thread will keep freeing records we send it until we send it an eos
+ * marker.
+ *
+ * We can leave this loop in 3 ways: First, if rwa.err is
+ * non-zero. In that case, the writer thread will free the rrd we just
+ * pushed. Second, if we're interrupted; in that case, either it's the
+ * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+ * has been handed off to the writer thread who will free it. Finally,
+ * if receive_read_record fails or we're at the end of the stream, then
+ * we free ra.rrd and exit.
+ */
+ while (rwa.err == 0) {
if (issig(JUSTLOOKING) && issig(FORREAL)) {
err = SET_ERROR(EINTR);
break;
}
- tmp = ra.next_drr;
- ra.next_drr = ra.drr;
- ra.drr = tmp;
+ ASSERT3P(ra.rrd, ==, NULL);
+ ra.rrd = ra.next_rrd;
+ ra.next_rrd = NULL;
+ /* Allocates and loads header into ra.next_rrd */
+ err = receive_read_record(&ra);
- /* process ra.drr, read in ra.next_drr */
- err = restore_process_record(&ra);
- if (err != 0)
- break;
- if (ra.drr->drr_type == DRR_END)
+ if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+ kmem_free(ra.rrd, sizeof (*ra.rrd));
+ ra.rrd = NULL;
break;
+ }
+
+ bqueue_enqueue(&rwa.q, ra.rrd,
+ sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+ ra.rrd = NULL;
+ }
+ if (ra.next_rrd == NULL)
+ ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+ ra.next_rrd->eos_marker = B_TRUE;
+ bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+
+ mutex_enter(&rwa.mutex);
+ while (!rwa.done) {
+ cv_wait(&rwa.cv, &rwa.mutex);
}
+ mutex_exit(&rwa.mutex);
+
+ cv_destroy(&rwa.cv);
+ mutex_destroy(&rwa.mutex);
+ bqueue_destroy(&rwa.q);
+ if (err == 0)
+ err = rwa.err;
out:
if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
@@ -2111,10 +2514,13 @@ out:
dmu_recv_cleanup_ds(drc);
}
- kmem_free(ra.drr, sizeof (*ra.drr));
- kmem_free(ra.buf, ra.bufsize);
- kmem_free(ra.next_drr, sizeof (*ra.next_drr));
*voffp = ra.voff;
+ for (struct receive_ign_obj_node *n =
+ list_remove_head(&ra.ignore_obj_list); n != NULL;
+ n = list_remove_head(&ra.ignore_obj_list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&ra.ignore_obj_list);
return (err);
}
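The reader/writer hand-off described above dmu_recv_stream is easiest to see
in miniature. The following self-contained user-space analog shows the same
moving parts: a blocking queue (unbounded here, unlike bqueue), a writer
thread that drains every record until an eos marker, and a final free of the
eos record itself. Every name in it is illustrative; none of it is ZFS API.

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct rec {
		int eos;	/* end-of-stream marker */
		int payload;
		struct rec *next;
	};

	static struct rec *head, *tail;
	static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;

	static void
	enqueue(struct rec *r)
	{
		pthread_mutex_lock(&qlock);
		if (tail == NULL)
			head = tail = r;
		else {
			tail->next = r;
			tail = r;
		}
		pthread_cond_signal(&qcv);
		pthread_mutex_unlock(&qlock);
	}

	static struct rec *
	dequeue(void)
	{
		pthread_mutex_lock(&qlock);
		while (head == NULL)
			pthread_cond_wait(&qcv, &qlock);
		struct rec *r = head;
		head = r->next;
		if (head == NULL)
			tail = NULL;
		pthread_mutex_unlock(&qlock);
		return (r);
	}

	static void *
	writer(void *arg)
	{
		struct rec *r;

		(void) arg;
		/* Like receive_writer_thread: drain everything until eos. */
		while (!(r = dequeue())->eos) {
			printf("apply record %d\n", r->payload);
			free(r);
		}
		free(r);	/* the eos record is freed too */
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, writer, NULL);
		for (int i = 0; i < 4; i++) {
			struct rec *r = calloc(1, sizeof (*r));
			r->payload = i;
			enqueue(r);
		}
		struct rec *eos = calloc(1, sizeof (*eos));
		eos->eos = 1;
		enqueue(eos);
		pthread_join(tid, NULL);
		return (0);
	}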
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index e246c49..151d04c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -158,7 +158,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we already visited this bp & everything below,
* don't bother doing it again.
*/
- if (zbookmark_is_before(dnp, zb, td->td_resume))
+ if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
return (RESUME_SKIP_ALL);
/*
@@ -425,6 +425,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
int j, err = 0;
zbookmark_phys_t czb;
+ if (td->td_flags & TRAVERSE_PRE) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
@@ -432,10 +443,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
break;
}
- if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
}
+
+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
return (err);
}
@@ -448,6 +470,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT(pfd->pd_bytes_fetched >= 0);
+ if (bp == NULL)
+ return (0);
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index dff9fab..65a017f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -315,7 +315,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, start,
+ FALSE, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock);
if (err) {
@@ -516,7 +517,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks);
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+ FALSE, FALSE, FTAG, &dbuf);
if (err) {
txh->txh_tx->tx_err = err;
break;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index 77100ef..65ce914 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -305,7 +305,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
for (i = 0; i < fetchsz; i++) {
- dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
+ dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
+ ARC_FLAG_PREFETCH);
}
return (fetchsz);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index 5b953fc..0fdcde4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -1116,7 +1116,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
drop_struct_lock = TRUE;
}
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
@@ -1413,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
goto fail;
/* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0)
dbuf_new_size(db, size, tx);
else if (err != ENOENT)
@@ -1586,8 +1586,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
@@ -1624,8 +1624,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (tail) {
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@@ -1854,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
*/
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
+ int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
void *data = NULL;
@@ -1876,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
} else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) {
if (error != ENOENT)
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 0633604..0787885 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -188,7 +188,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
@@ -284,7 +284,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
- i, B_TRUE, FTAG, &subdb));
+ i, TRUE, FALSE, FTAG, &subdb));
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
@@ -357,7 +357,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
- TRUE, FTAG, &db));
+ TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
free_children(db, blkid, nblks, tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index 551e35b..f4fdaf9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 RackTop Systems.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -130,8 +130,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
dsl_dataset_phys(ds)->ds_unique_bytes += used;
- if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
- ds->ds_need_large_blocks = B_TRUE;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+ B_TRUE;
+ }
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -433,19 +435,23 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
offsetof(dmu_sendarg_t, dsa_link));
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
- int zaperr = zap_contains(mos, dsobj,
- DS_FIELD_LARGE_BLOCKS);
- if (zaperr != ENOENT) {
- VERIFY0(zaperr);
- ds->ds_large_blocks = B_TRUE;
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET))
+ continue;
+ err = zap_contains(mos, dsobj,
+ spa_feature_table[f].fi_guid);
+ if (err == 0) {
+ ds->ds_feature_inuse[f] = B_TRUE;
+ } else {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ }
}
}
- if (err == 0) {
- err = dsl_dir_hold_obj(dp,
- dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
- &ds->ds_dir);
- }
+ err = dsl_dir_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
if (err != 0) {
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
@@ -540,6 +546,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
const char *snapname;
uint64_t obj;
int err = 0;
+ dsl_dataset_t *ds;
err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
if (err != 0)
@@ -548,36 +555,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
ASSERT(dsl_pool_config_held(dp));
obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
else
err = SET_ERROR(ENOENT);
/* we may be looking for a snapshot */
if (err == 0 && snapname != NULL) {
- dsl_dataset_t *ds;
+ dsl_dataset_t *snap_ds;
if (*snapname++ != '@') {
- dsl_dataset_rele(*dsp, tag);
+ dsl_dataset_rele(ds, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}
dprintf("looking for snapshot '%s'\n", snapname);
- err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+ err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
- dsl_dataset_rele(*dsp, tag);
+ err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+ dsl_dataset_rele(ds, tag);
if (err == 0) {
- mutex_enter(&ds->ds_lock);
- if (ds->ds_snapname[0] == 0)
- (void) strlcpy(ds->ds_snapname, snapname,
- sizeof (ds->ds_snapname));
- mutex_exit(&ds->ds_lock);
- *dsp = ds;
+ mutex_enter(&snap_ds->ds_lock);
+ if (snap_ds->ds_snapname[0] == 0)
+ (void) strlcpy(snap_ds->ds_snapname, snapname,
+ sizeof (snap_ds->ds_snapname));
+ mutex_exit(&snap_ds->ds_lock);
+ ds = snap_ds;
}
}
-
+ if (err == 0)
+ *dsp = ds;
dsl_dir_rele(dd, FTAG);
return (err);
}
@@ -699,6 +707,34 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
return (gotit);
}
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ spa_feature_incr(spa, f, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (zero), 1, &zero, tx));
+}
+
+void
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+ spa_feature_decr(spa, f, tx);
+}
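The two helpers above are exact mirrors: activation bumps the feature
refcount before recording the ZAP entry under the feature guid, while
deactivation removes the entry before dropping the refcount, so the on-disk
entry never exists without a live refcount backing it.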
+
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx)
@@ -759,8 +795,10 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
- if (origin->ds_large_blocks)
- dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (origin->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
dmu_buf_will_dirty(origin->ds_dbuf, tx);
dsl_dataset_phys(origin)->ds_num_children++;
@@ -1322,8 +1360,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
dmu_buf_rele(dbuf, FTAG);
- if (ds->ds_large_blocks)
- dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
ASSERT3U(ds->ds_prev != 0, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
@@ -1615,9 +1655,13 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
dmu_objset_sync(ds->ds_objset, zio, tx);
- if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
- dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
- ds->ds_large_blocks = B_TRUE;
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_activation_needed[f]) {
+ if (ds->ds_feature_inuse[f])
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f, tx);
+ ds->ds_feature_inuse[f] = B_TRUE;
+ }
}
}
@@ -2781,6 +2825,40 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+ /*
+ * Swap per-dataset feature flags.
+ */
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ ASSERT(!clone->ds_feature_inuse[f]);
+ ASSERT(!origin_head->ds_feature_inuse[f]);
+ continue;
+ }
+
+ boolean_t clone_inuse = clone->ds_feature_inuse[f];
+ boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
+
+ if (clone_inuse) {
+ dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_deactivate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (clone_inuse) {
+ dsl_dataset_activate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_TRUE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_activate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_TRUE;
+ }
+ }
+
dmu_buf_will_dirty(clone->ds_dbuf, tx);
dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
@@ -3335,77 +3413,6 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
-static int
-dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
-{
- const char *dsname = arg;
- dsl_dataset_t *ds;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- int error = 0;
-
- if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
- return (SET_ERROR(ENOTSUP));
-
- ASSERT(spa_feature_is_enabled(dp->dp_spa,
- SPA_FEATURE_EXTENSIBLE_DATASET));
-
- error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
- if (error != 0)
- return (error);
-
- if (ds->ds_large_blocks)
- error = EALREADY;
- dsl_dataset_rele(ds, FTAG);
-
- return (error);
-}
-
-void
-dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
- uint64_t zero = 0;
-
- spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
- dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
-
- VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
- sizeof (zero), 1, &zero, tx));
-}
-
-static void
-dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
-{
- const char *dsname = arg;
- dsl_dataset_t *ds;
-
- VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
-
- dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
- ASSERT(!ds->ds_large_blocks);
- ds->ds_large_blocks = B_TRUE;
- dsl_dataset_rele(ds, FTAG);
-}
-
-int
-dsl_dataset_activate_large_blocks(const char *dsname)
-{
- int error;
-
- error = dsl_sync_task(dsname,
- dsl_dataset_activate_large_blocks_check,
- dsl_dataset_activate_large_blocks_sync, (void *)dsname,
- 1, ZFS_SPACE_CHECK_RESERVED);
-
- /*
- * EALREADY indicates that this dataset already supports large blocks.
- */
- if (error == EALREADY)
- error = 0;
- return (error);
-}
-
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
@@ -3450,7 +3457,6 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
return (ret);
}
-
void
dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
index 7f90469..c7a623c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2013 by Joyent, Inc. All rights reserved.
*/
@@ -267,9 +267,11 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
obj = ds->ds_object;
- if (ds->ds_large_blocks) {
- ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
}
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
@@ -552,7 +554,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -736,12 +738,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT0(ds->ds_reserved);
}
- if (ds->ds_large_blocks)
- spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ obj = ds->ds_object;
- dsl_scan_ds_destroyed(ds, tx);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
+ }
- obj = ds->ds_object;
+ dsl_scan_ds_destroyed(ds, tx);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
/* This is a clone */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index d58886b..189ca19 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -415,7 +415,14 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
- arc_flush(dp->dp_spa);
+ /*
+ * We can't set retry to TRUE since we're explicitly specifying
+ * a spa to flush. This is good enough; any missed buffers for
+ * this spa won't cause trouble, and they'll eventually fall
+ * out of the ARC just like any other unused buffer.
+ */
+ arc_flush(dp->dp_spa, FALSE);
+
txg_fini(dp);
dsl_scan_fini(dp);
dmu_buf_user_evict_wait();
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
index d08b5fb..406af3b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -600,7 +600,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
- if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ if (zbookmark_subtree_completed(dnp, zb,
+ &scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
new file mode 100644
index 0000000..8296057
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
@@ -0,0 +1,366 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * Given an object on the list, return a pointer to the multilist_node_t
+ * structure embedded within it.
+ */
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+ return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ * multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ * the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ * when the multilist_insert() function is called, as well as which
+ * sublist to remove from when multilist_remove() is called. The
+ * requirements this function must meet are the following:
+ *
+ * - It must always return the same value when called on the same
+ * object (to ensure the object is removed from the list it was
+ * inserted into).
+ *
+ * - It must return a value in the range [0, number of sublists).
+ * The multilist_get_num_sublists() function may be used to
+ * determine the number of sublists in the multilist.
+ *
+ * Also, in order to reduce internal contention between the sublists
+ * during insertion and removal, this function should choose evenly
+ * between all available sublists when inserting. This isn't a hard
+ * requirement, but a general rule of thumb in order to garner the
+ * best multi-threaded performance out of the data structure.
+ */
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
+ multilist_sublist_index_func_t *index_func)
+{
+ ASSERT3P(ml, !=, NULL);
+ ASSERT3U(size, >, 0);
+ ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+ ASSERT3U(num, >, 0);
+ ASSERT3P(index_func, !=, NULL);
+
+ ml->ml_offset = offset;
+ ml->ml_num_sublists = num;
+ ml->ml_index_func = index_func;
+
+ ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+ ml->ml_num_sublists, KM_SLEEP);
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&mls->mls_list, size, offset);
+ }
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+ ASSERT(multilist_is_empty(ml));
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+ ASSERT(list_is_empty(&mls->mls_list));
+
+ list_destroy(&mls->mls_list);
+ mutex_destroy(&mls->mls_lock);
+ }
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+ kmem_free(ml->ml_sublists,
+ sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+ ml->ml_num_sublists = 0;
+ ml->ml_offset = 0;
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+
+ /*
+ * Note: Callers may already hold the sublist lock by calling
+ * multilist_sublist_lock(). Here we rely on MUTEX_HELD()
+ * returning TRUE if and only if the current thread holds the
+ * lock. While it's a little ugly to make the lock recursive in
+ * this way, it works and allows the calling code to be much
+ * simpler -- otherwise it would have to pass around a flag
+ * indicating that it already has the lock.
+ */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_insert_head(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
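
A hedged sketch of the pre-locked pattern the comment above describes. It assumes the caller knows (or arranges) that ml_index_func maps the object to the sublist index it locked, since multilist_insert() recomputes the index itself.

/*
 * Hypothetical caller that already holds a sublist lock.  Per the
 * comment in multilist_insert(), MUTEX_HELD() lets the insert proceed
 * without re-acquiring the lock.  This only works if ml_index_func
 * maps 'obj' to the sublist at 'idx' -- an assumption made here.
 */
static void
insert_while_locked(multilist_t *ml, void *obj, unsigned int idx)
{
	multilist_sublist_t *mls = multilist_sublist_lock(ml, idx);

	multilist_insert(ml, obj);	/* no deadlock: lock already held */
	multilist_sublist_unlock(mls);
}
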
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+ /* See comment in multilist_insert(). */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_remove(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ /* See comment in multilist_insert(). */
+ boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ if (!list_is_empty(&mls->mls_list)) {
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+
+ return (FALSE);
+ }
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+ }
+
+ return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+ return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+ return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ mutex_enter(&mls->mls_lock);
+
+ return (mls);
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is later called to
+ * remove the object. Specifically, if calling ml_index_func on the
+ * object returns a sublist index different from the one passed as a
+ * parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_head(&mls->mls_list, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ * than the object given as the parameter. This is relied upon in
+ * arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+ void *prev = list_prev(&mls->mls_list, obj);
+
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ ASSERT(!list_is_empty(&mls->mls_list));
+
+ /* 'obj' must be at the head of the list, nothing to do */
+ if (prev == NULL)
+ return;
+
+ list_remove(&mls->mls_list, obj);
+ list_insert_before(&mls->mls_list, prev, obj);
+}
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_remove(&mls->mls_list, obj);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+ list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+ return (list_link_active(link));
+}
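
To make the consumer contract concrete, here is a minimal, hypothetical sketch of using the API above. The obj_t type and obj_index_func() are illustrative, not part of this change; the index function keys off a stable object field, satisfying the stability and range requirements documented above multilist_create().

typedef struct obj {
	uint64_t o_id;
	multilist_node_t o_node;	/* embedded linkage */
} obj_t;

static unsigned int
obj_index_func(multilist_t *ml, void *arg)
{
	obj_t *o = arg;

	/* Stable per object and always within [0, num sublists). */
	return (o->o_id % multilist_get_num_sublists(ml));
}

static void
obj_multilist_example(void)
{
	multilist_t ml;
	obj_t o = { .o_id = 42 };

	multilist_create(&ml, sizeof (obj_t), offsetof(obj_t, o_node),
	    4, obj_index_func);
	multilist_link_init(&o.o_node);

	multilist_insert(&ml, &o);	/* lands in sublist 42 % 4 == 2 */
	multilist_remove(&ml, &o);	/* recomputes the same index */

	multilist_destroy(&ml);
}
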
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 40efaba..a5389c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -1943,7 +1943,7 @@ static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
/*
* Note: normally this routine will not be called if
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index aeac124..1ea829f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -80,8 +80,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
mutex_exit(sm->sm_lock);
if (end > bufsize) {
- dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize,
- end - bufsize);
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+ end - bufsize, ZIO_PRIORITY_SYNC_READ);
}
mutex_enter(sm->sm_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index 4d13cb1..a26d8f8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -37,6 +37,12 @@ extern "C" {
#include <sys/dmu.h>
#include <sys/spa.h>
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define ARC_EVICT_ALL -1ULL
+
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
@@ -154,7 +160,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
boolean_t arc_clear_callback(arc_buf_t *buf);
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
new file mode 100644
index 0000000..63722df
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bqueue.h
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _BQUEUE_H
+#define _BQUEUE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+typedef struct bqueue {
+ list_t bq_list;
+ kmutex_t bq_lock;
+ kcondvar_t bq_add_cv;
+ kcondvar_t bq_pop_cv;
+ uint64_t bq_size;
+ uint64_t bq_maxsize;
+ size_t bq_node_offset;
+} bqueue_t;
+
+typedef struct bqueue_node {
+ list_node_t bqn_node;
+ uint64_t bqn_size;
+} bqueue_node_t;
+
+
+int bqueue_init(bqueue_t *, uint64_t, size_t);
+void bqueue_destroy(bqueue_t *);
+void bqueue_enqueue(bqueue_t *, void *, uint64_t);
+void *bqueue_dequeue(bqueue_t *);
+boolean_t bqueue_empty(bqueue_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BQUEUE_H */
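
The header alone does not spell out the queue's behavior, but the bq_maxsize and condition-variable fields suggest a bounded, blocking producer/consumer queue. A hedged usage sketch, in which item_t, the meaning of bqueue_init()'s second argument (a size limit), and the per-enqueue size charge are all assumptions:

typedef struct item {
	bqueue_node_t i_node;	/* linkage used by the queue */
	uint64_t i_payload;
} item_t;

static void
bqueue_example(void)
{
	bqueue_t q;
	item_t *it;

	/* Assumed: the second argument caps the total queued "size". */
	VERIFY0(bqueue_init(&q, 16, offsetof(item_t, i_node)));

	it = kmem_zalloc(sizeof (item_t), KM_SLEEP);
	it->i_payload = 7;
	bqueue_enqueue(&q, it, 1);	/* charge one unit of size */

	it = bqueue_dequeue(&q);	/* presumably blocks while empty */
	kmem_free(it, sizeof (item_t));

	ASSERT(bqueue_empty(&q));
	bqueue_destroy(&q);
}
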
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index 2e07185..482ccb0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -245,8 +245,7 @@ typedef struct dbuf_hash_table {
kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;
-
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
@@ -258,10 +257,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 3c5cfbe..f6c72b0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -45,6 +45,7 @@
#include <sys/zfs_context.h>
#include <sys/cred.h>
#include <sys/fs/zfs.h>
+#include <sys/zio_priority.h>
#ifdef __cplusplus
extern "C" {
@@ -748,8 +749,8 @@ extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
*/
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t len);
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, enum zio_priority pri);
typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index 7d490ec..001bff5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -38,6 +38,7 @@
#include <sys/zfs_context.h>
#include <sys/dsl_deadlist.h>
#include <sys/refcount.h>
+#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@@ -145,8 +146,6 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
- boolean_t ds_large_blocks;
- boolean_t ds_need_large_blocks;
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
@@ -185,6 +184,18 @@ typedef struct dsl_dataset {
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
+ /*
+ * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+ * uses this feature.
+ */
+ uint8_t ds_feature_inuse[SPA_FEATURES];
+
+ /*
+ * Set if we need to activate the feature on this dataset this txg
+ * (used only in syncing context).
+ */
+ uint8_t ds_feature_activation_needed[SPA_FEATURES];
+
/* Protected by ds_lock; keep at end of struct for better locality */
char ds_snapname[MAXNAMELEN];
} dsl_dataset_t;
@@ -264,8 +275,6 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
-int dsl_dataset_activate_large_blocks(const char *dsname);
-void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
@@ -305,6 +314,9 @@ void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
+void dsl_dataset_deactivate_feature(uint64_t dsobj,
+ spa_feature_t f, dmu_tx_t *tx);
+
#ifdef ZFS_DEBUG
#define dprintf_ds(ds, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
new file mode 100644
index 0000000..5ebb7fe
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_MULTILIST_H
+#define _SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+ /*
+ * The mutex used internally to implement thread safe insertions
+ * and removals to this individual sublist. It can also be locked
+ * by a consumer using multilist_sublist_{lock,unlock}, which is
+ * useful if a consumer needs to traverse the list in a thread
+ * safe manner.
+ */
+ kmutex_t mls_lock;
+ /*
+ * The actual list object containing all objects in this sublist.
+ */
+ list_t mls_list;
+ /*
+ * Pad to cache line (64 bytes), in an effort to prevent
+ * cache line contention.
+ */
+ uint8_t mls_pad[24];
+};
+
+struct multilist {
+ /*
+ * This is used to get to the multilist_node_t structure given
+ * the void *object contained on the list.
+ */
+ size_t ml_offset;
+ /*
+ * The number of sublists used internally by this multilist.
+ */
+ uint64_t ml_num_sublists;
+ /*
+ * The array of pointers to the actual sublists.
+ */
+ multilist_sublist_t *ml_sublists;
+ /*
+ * Pointer to function which determines the sublist to use
+ * when inserting and removing objects from this multilist.
+ * Please see the comment above multilist_create for details.
+ */
+ multilist_sublist_index_func_t *ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+void multilist_create(multilist_t *, size_t, size_t, unsigned int,
+ multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int multilist_link_active(multilist_node_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
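
As the comment on mls_lock notes, a consumer can lock individual sublists to traverse safely. A small sketch of a full walk using only the functions declared above:

/* Hypothetical thread-safe walk over every object in a multilist. */
static void
multilist_walk_example(multilist_t *ml, void (*cb)(void *))
{
	for (unsigned int i = 0; i < multilist_get_num_sublists(ml); i++) {
		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);

		for (void *obj = multilist_sublist_head(mls); obj != NULL;
		    obj = multilist_sublist_next(mls, obj))
			cb(obj);

		multilist_sublist_unlock(mls);
	}
}
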
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 36739cd..342c9cd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -29,6 +29,7 @@
#ifndef _ZIO_H
#define _ZIO_H
+#include <sys/zio_priority.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/txg.h>
@@ -144,18 +145,6 @@ enum zio_compress {
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
-typedef enum zio_priority {
- ZIO_PRIORITY_SYNC_READ,
- ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
- ZIO_PRIORITY_ASYNC_READ, /* prefetch */
- ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
- ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
- ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
- ZIO_PRIORITY_NUM_QUEUEABLE,
-
- ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */
-} zio_priority_t;
-
enum zio_flag {
/*
* Flags inherited by gang, ddt, and vdev children,
@@ -260,6 +249,7 @@ extern const char *zio_type_name[ZIO_TYPES];
* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
*
* Note: this structure is called a bookmark because its original purpose
* was to remember where to resume a pool-wide traverse.
@@ -292,6 +282,9 @@ typedef struct zbookmark_phys {
#define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL)
+#define ZB_DNODE_LEVEL (-3LL)
+#define ZB_DNODE_BLKID (0ULL)
+
#define ZB_IS_ZERO(zb) \
((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
@@ -633,8 +626,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
extern void spa_handle_ignored_writes(spa_t *spa);
/* zbookmark_phys functions */
-boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
- const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+ uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
index a921a2f..0c293ab 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -44,7 +44,7 @@ typedef struct zio_checksum_info {
zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
int ci_correctable; /* number of correctable bits */
int ci_eck; /* uses zio embedded checksum? */
- int ci_dedup; /* strong enough for dedup? */
+ boolean_t ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
new file mode 100644
index 0000000..32e90e2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+#ifndef _ZIO_PRIORITY_H
+#define _ZIO_PRIORITY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum zio_priority {
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
+ ZIO_PRIORITY_ASYNC_READ, /* prefetch */
+ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
+ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
+ ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
+ ZIO_PRIORITY_NUM_QUEUEABLE,
+
+ ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_PRIORITY_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index 36969e8..44919d2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied);
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
}
/*
@@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
bs = FZAP_BLOCK_SHIFT(zap);
- dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ ZIO_PRIORITY_SYNC_READ);
}
/*
@@ -1310,9 +1312,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} else {
int b;
- dmu_prefetch(zap->zap_objset, zap->zap_object,
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
- zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs);
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
b++) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
index 7540320..80a3f0b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -245,7 +245,7 @@ feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
{
int err;
uint64_t refcount;
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
/*
@@ -296,7 +296,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
dmu_tx_t *tx)
{
ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
@@ -322,7 +322,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
if (refcount == 0)
spa_deactivate_mos_feature(spa, feature->fi_guid);
- else if (feature->fi_mos)
+ else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
spa_activate_mos_feature(spa, feature->fi_guid, tx);
}
@@ -333,8 +333,9 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
void
feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
{
- uint64_t initial_refcount = feature->fi_activate_on_enable ? 1 : 0;
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t initial_refcount =
+ (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
ASSERT(0 != zapobj);
@@ -379,7 +380,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
{
uint64_t refcount;
zfeature_info_t *feature = &spa_feature_table[fid];
- uint64_t zapobj = feature->fi_can_readonly ?
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
ASSERT(VALID_FEATURE_FID(fid));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index c2dd020..693ba41 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/*
@@ -1149,10 +1150,11 @@ zfsctl_shares_lookup(ap)
ZFS_EXIT(zfsvfs);
return (SET_ERROR(ENOTSUP));
}
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp);
+ VN_RELE(ZTOV(dzp));
+ }
- VN_RELE(ZTOV(dzp));
ZFS_EXIT(zfsvfs);
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 2a583d4..2e51916 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -5190,6 +5190,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
if ((error = get_nvlist(zc->zc_nvlist_src,
zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 8a08c8d..ed56d17 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -950,7 +950,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
&sa_obj);
if (error)
- return (error);
+ goto out;
} else {
/*
* Pre SA versions file systems should never touch
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 1038a87..45a2bd7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -2675,7 +2675,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
/* Prefetch znode */
if (prefetch)
- dmu_prefetch(os, objnum, 0, 0);
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
skip_entry:
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 48de571..867b798 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -94,6 +94,9 @@ extern vmem_t *zio_alloc_arena;
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
/*
* The following actions directly effect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
@@ -3461,37 +3464,127 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_done
};
-/* dnp is the dnode for zb1->zb_object */
-boolean_t
-zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
- const zbookmark_phys_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb2->zb_level == 0);
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (a value larger than any level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+ const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+ /*
+ * These variables represent the "equivalent" values for the zbookmark,
+ * after converting zbookmarks inside the meta dnode to their
+ * normal-object equivalents.
+ */
+ uint64_t zb1obj, zb2obj;
+ uint64_t zb1L0, zb2L0;
+ uint64_t zb1level, zb2level;
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+ if (zb1->zb_object == zb2->zb_object &&
+ zb1->zb_level == zb2->zb_level &&
+ zb1->zb_blkid == zb2->zb_blkid)
+ return (0);
+
+ /*
+ * BP_SPANB calculates the span in blocks.
+ */
+ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+ zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
+ zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb1L0 = 0;
+ zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb1obj = zb1->zb_object;
+ zb1level = zb1->zb_level;
}
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+ zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb2L0 = 0;
+ zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb2obj = zb2->zb_object;
+ zb2level = zb2->zb_level;
+ }
+
+ /* Now that we have a canonical representation, do the comparison. */
+ if (zb1obj != zb2obj)
+ return (zb1obj < zb2obj ? -1 : 1);
+ else if (zb1L0 != zb2L0)
+ return (zb1L0 < zb2L0 ? -1 : 1);
+ else if (zb1level != zb2level)
+ return (zb1level > zb2level ? -1 : 1);
+ /*
+ * This can (theoretically) happen if the bookmarks have the same object
+ * and level, but different blkids, if the block sizes are not the same.
+ * There is presently no way to change the indirect block sizes.
+ */
+ return (0);
+}
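
A worked instance of the canonicalization, under assumed but typical constants (SPA_BLKPTRSHIFT of 7, i.e. 128-byte block pointers, and 16K indirect blocks, i.e. an indblkshift of 14):

/*
 * BP_SPANB(14, 1) == 1 << (1 * (14 - 7)) == 128, so a level-1 bookmark
 * with blkid 2 covers L0 blkids [256, 383] and canonicalizes to an L0
 * equivalent of 2 * 128 == 256.  It therefore sorts after the level-0
 * bookmark for blkid 255, and -- because the larger level wins when L0
 * equivalents tie -- before the level-0 bookmark for blkid 256, giving
 * the pre-order "parent before children" ordering.
 */
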
+
+/*
+ * This function answers the following question: given that last_block is
+ * the place where our traversal stopped last time, does that guarantee
+ * that we've visited every node under subtree_root? Because a subtree's
+ * children follow their parent in pre-order, subtree_root itself can
+ * compare before last_block while some of its children do not, so we
+ * can't just use the raw output of zbookmark_compare. We have to pass in
+ * a modified version of subtree_root; by incrementing the block id, and
+ * then checking whether last_block is before or equal to that, we can
+ * tell whether or not having visited last_block implies that all of
+ * subtree_root's children have been visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ zbookmark_phys_t mod_zb = *subtree_root;
+ mod_zb.zb_blkid++;
+ ASSERT(last_block->zb_level == 0);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
+
+ /*
+ * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+ * data block size in sectors, because that variable is only used if
+ * the bookmark refers to a block in the meta-dnode. Since we don't
+ * know without examining it what object it refers to, and there's no
+ * harm in passing in this value in other cases, we always pass it in.
+ *
+ * We pass in 0 for the indirect block size shift because zb2 must be
+ * level 0. The indirect block size is only used to calculate the span
+ * of the bookmark, but since the bookmark must be level 0, the span is
+ * always 1, so the math works out.
+ *
+ * If you make changes to how the zbookmark_compare code works, be sure
+ * to make sure that this code still works afterwards.
+ */
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+ last_block) <= 0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
index 991a0a3..0a7f4e4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -438,7 +438,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
* fault injection isn't a performance critical path.
*/
if (flags & ZINJECT_FLUSH_ARC)
- arc_flush(NULL);
+ /*
+ * We must use FALSE to ensure arc_flush returns, since
+ * we're not preventing concurrent ARC insertions.
+ */
+ arc_flush(NULL, FALSE);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 55de1b4..2c90810 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -358,7 +358,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
zvol_extent_t *ze;
int bs = ma->ma_zv->zv_volblocksize;
- if (BP_IS_HOLE(bp) ||
+ if (bp == NULL || BP_IS_HOLE(bp) ||
zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0);
diff --git a/sys/compat/cloudabi/cloudabi_proc.c b/sys/compat/cloudabi/cloudabi_proc.c
index 9c735fa..d917337 100644
--- a/sys/compat/cloudabi/cloudabi_proc.c
+++ b/sys/compat/cloudabi/cloudabi_proc.c
@@ -46,14 +46,19 @@ cloudabi_sys_proc_exec(struct thread *td,
struct cloudabi_sys_proc_exec_args *uap)
{
struct image_args args;
+ struct vmspace *oldvmspace;
int error;
+ error = pre_execve(td, &oldvmspace);
+ if (error != 0)
+ return (error);
error = exec_copyin_data_fds(td, &args, uap->data, uap->datalen,
uap->fds, uap->fdslen);
if (error == 0) {
args.fd = uap->fd;
error = kern_execve(td, &args, NULL);
}
+ post_execve(td, error, oldvmspace);
return (error);
}
diff --git a/sys/conf/Makefile.arm b/sys/conf/Makefile.arm
index 86b11c6..af5f7da 100644
--- a/sys/conf/Makefile.arm
+++ b/sys/conf/Makefile.arm
@@ -66,10 +66,6 @@ SYSTEM_LD_TAIL +=;sed s/" + SIZEOF_HEADERS"// ldscript.$M\
${KERNEL_KO}.bin; \
rm ${FULLKERNEL}.noheader
-.if defined(MFS_IMAGE)
-SYSTEM_LD_TAIL += ;sh ${S}/tools/embed_mfs.sh ${KERNEL_KO}.bin ${MFS_IMAGE};
-.endif
-
FILES_CPU_FUNC = \
$S/$M/$M/cpufunc_asm_arm9.S \
$S/$M/$M/cpufunc_asm_arm10.S \
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index b0619cb..7bc2048 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -2981,9 +2981,10 @@ options MAXFILES=999
# Random number generator
# Only ONE of the below two may be used; they are mutually exclusive.
-# If neither is present, then the Fortuna algorithm is used.
-options RANDOM_YARROW # Yarrow CSPRNG (old default)
-#options RANDOM_DUMMY # Dummy CSPRNG that always blocks
+# If neither is present, then the Fortuna algorithm is selected.
+#options RANDOM_YARROW # Yarrow CSPRNG (old default)
+#options RANDOM_LOADABLE # Allow the algorithm to be loaded as
+ # a module.
# For developers.
options RANDOM_DEBUG # Extra debugging messages
diff --git a/sys/conf/files b/sys/conf/files
index 531647f..dfe9763 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -83,7 +83,7 @@ cam/ctl/ctl_backend_ramdisk.c optional ctl
cam/ctl/ctl_cmd_table.c optional ctl
cam/ctl/ctl_frontend.c optional ctl
cam/ctl/ctl_frontend_cam_sim.c optional ctl
-cam/ctl/ctl_frontend_internal.c optional ctl
+cam/ctl/ctl_frontend_ioctl.c optional ctl
cam/ctl/ctl_frontend_iscsi.c optional ctl
cam/ctl/ctl_scsi_all.c optional ctl
cam/ctl/ctl_tpc.c optional ctl
@@ -145,6 +145,7 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/blkptr.c optional zfs compile-with
cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/bqueue.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c optional zfs compile-with "${ZFS_C}"
@@ -174,6 +175,7 @@ cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c optional zfs compile-with "$
cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c optional zfs compile-with "${ZFS_C}"
+cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c optional zfs compile-with "${ZFS_C}"
cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c optional zfs compile-with "${ZFS_C}"
@@ -548,14 +550,14 @@ crypto/des/des_ecb.c optional crypto | ipsec | netsmb
crypto/des/des_setkey.c optional crypto | ipsec | netsmb
crypto/rc4/rc4.c optional netgraph_mppc_encryption | kgssapi
crypto/rijndael/rijndael-alg-fst.c optional crypto | geom_bde | \
- ipsec | random random_yarrow | random !random_yarrow !random_dummy | wlan_ccmp
-crypto/rijndael/rijndael-api-fst.c optional geom_bde | random random_yarrow | random !random_yarrow !random_dummy
+ ipsec | random !random_loadable | wlan_ccmp
+crypto/rijndael/rijndael-api-fst.c optional geom_bde | random !random_loadable
crypto/rijndael/rijndael-api.c optional crypto | ipsec | wlan_ccmp
crypto/sha1.c optional carp | crypto | ipsec | \
netgraph_mppc_encryption | sctp
-crypto/sha2/sha2.c optional crypto | geom_bde | ipsec | random random_yarrow | random !random_yarrow !random_dummy | \
+crypto/sha2/sha2.c optional crypto | geom_bde | ipsec | random !random_loadable | \
sctp | zfs
-crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | random random_yarrow | random !random_yarrow !random_dummy | \
+crypto/sha2/sha256c.c optional crypto | geom_bde | ipsec | random !random_loadable | \
sctp | zfs
crypto/siphash/siphash.c optional inet | inet6
crypto/siphash/siphash_test.c optional inet | inet6
@@ -2312,12 +2314,14 @@ rt2860.fw optional rt2860fw | ralfw \
compile-with "${NORMAL_FW}" \
no-obj no-implicit-rule \
clean "rt2860.fw"
-dev/random/randomdev_none.c optional !random
-dev/random/randomdev.c optional random
-dev/random/random_harvestq.c optional random random_yarrow | random !random_dummy
+dev/random/random_infra.c optional random
+dev/random/random_harvestq.c optional random
+dev/random/randomdev.c optional random random_yarrow | \
+ random !random_yarrow !random_loadable
dev/random/yarrow.c optional random random_yarrow
-dev/random/fortuna.c optional random !random_yarrow !random_dummy
-dev/random/hash.c optional random random_yarrow | random !random_dummy
+dev/random/fortuna.c optional random !random_yarrow !random_loadable
+dev/random/hash.c optional random random_yarrow | \
+ random !random_yarrow !random_loadable
dev/rc/rc.c optional rc
dev/re/if_re.c optional re
dev/rl/if_rl.c optional rl pci
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 2ffe102..8451e00 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -40,7 +40,7 @@ ia32_genassym.o standard \
#
ia32_assym.h standard \
dependency "$S/kern/genassym.sh ia32_genassym.o" \
- compile-with "env NM='${NM}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \
+ compile-with "env NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh ia32_genassym.o > ${.TARGET}" \
no-obj no-implicit-rule before-depend \
clean "ia32_assym.h"
#
diff --git a/sys/conf/kern.post.mk b/sys/conf/kern.post.mk
index 28ea453..137e72c 100644
--- a/sys/conf/kern.post.mk
+++ b/sys/conf/kern.post.mk
@@ -121,7 +121,7 @@ gdbinit:
.endif
.endif
-${FULLKERNEL}: ${SYSTEM_DEP} vers.o ${MFS_IMAGE}
+${FULLKERNEL}: ${SYSTEM_DEP} vers.o
@rm -f ${.TARGET}
@echo linking ${.TARGET}
${SYSTEM_LD}
@@ -133,9 +133,6 @@ ${FULLKERNEL}: ${SYSTEM_DEP} vers.o ${MFS_IMAGE}
${OBJCOPY} --strip-debug ${.TARGET}
.endif
${SYSTEM_LD_TAIL}
-.if defined(MFS_IMAGE)
- sh ${S}/tools/embed_mfs.sh ${FULLKERNEL} ${MFS_IMAGE}
-.endif
.if !exists(${.OBJDIR}/.depend)
${SYSTEM_OBJS}: assym.s vnode_if.h ${BEFORE_DEPEND:M*.h} ${MFILES:T:S/.m$/.h/}
@@ -177,7 +174,7 @@ hack.So: Makefile
./assym.s: assym.s
assym.s: $S/kern/genassym.sh genassym.o
- NM='${NM}' sh $S/kern/genassym.sh genassym.o > ${.TARGET}
+ NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh genassym.o > ${.TARGET}
genassym.o: $S/$M/$M/genassym.c
${CC} -c ${CFLAGS:N-fno-common} $S/$M/$M/genassym.c
@@ -301,6 +298,27 @@ vnode_if_newproto.h:
vnode_if_typedef.h:
${AWK} -f $S/tools/vnode_if.awk $S/kern/vnode_if.src -q
+.if ${MFS_IMAGE:Uno} != "no"
+# Generate an object file from the file system image to embed in the kernel
+# via linking. Make sure the contents are in the mfs section and rename the
+# start/end/size variables to __start_mfs, __stop_mfs, and mfs_size,
+# respectively.
+embedfs_${MFS_IMAGE:T:R}.o: ${MFS_IMAGE}
+ ${OBJCOPY} --input-target binary \
+ --output-target ${EMBEDFS_FORMAT.${MACHINE_ARCH}} \
+ --binary-architecture ${EMBEDFS_ARCH.${MACHINE_ARCH}} \
+ ${MFS_IMAGE} ${.TARGET}
+ ${OBJCOPY} \
+ --rename-section .data=mfs,contents,alloc,load,readonly,data \
+ --redefine-sym \
+ _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_size=__mfs_root_size \
+ --redefine-sym \
+ _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_start=mfs_root \
+ --redefine-sym \
+ _binary_${MFS_IMAGE:C,[^[:alnum:]],_,g}_end=mfs_root_end \
+ ${.TARGET}
+.endif
+
# XXX strictly, everything depends on Makefile because changes to ${PROF}
# only appear there, but we don't handle that.
diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk
index cf1b127..3783881 100644
--- a/sys/conf/kern.pre.mk
+++ b/sys/conf/kern.pre.mk
@@ -191,6 +191,9 @@ SYSTEM_DEP= Makefile ${SYSTEM_OBJS}
SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS}
SYSTEM_OBJS+= ${SYSTEM_CFILES:.c=.o}
SYSTEM_OBJS+= hack.So
+.if ${MFS_IMAGE:Uno} != "no"
+SYSTEM_OBJS+= embedfs_${MFS_IMAGE:T:R}.o
+.endif
SYSTEM_LD= @${LD} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} --no-warn-mismatch \
--warn-common --export-dynamic --dynamic-linker /red/herring \
-o ${.TARGET} -X ${SYSTEM_OBJS} vers.o
@@ -222,6 +225,32 @@ MKMODULESENV+= DEBUG_FLAGS="${DEBUG}"
.endif
MKMODULESENV+= _MPATH="${_MPATH}"
+# Architecture and output format arguments for objcopy to convert image to
+# object file
+.if ${MFS_IMAGE:Uno} != "no"
+
+.if !defined(EMBEDFS_FORMAT.${MACHINE_ARCH})
+EMBEDFS_FORMAT.${MACHINE_ARCH}!= awk -F'"' '/OUTPUT_FORMAT/ {print $$2}' ${LDSCRIPT}
+.if empty(EMBEDFS_FORMAT.${MACHINE_ARCH})
+.undef EMBEDFS_FORMAT.${MACHINE_ARCH}
+.endif
+.endif
+
+.if !defined(EMBEDFS_ARCH.${MACHINE_ARCH})
+EMBEDFS_ARCH.${MACHINE_ARCH}!= sed -n '/OUTPUT_ARCH/s/.*(\(.*\)).*/\1/p' ${LDSCRIPT}
+.if empty(EMBEDFS_ARCH.${MACHINE_ARCH})
+.undef EMBEDFS_ARCH.${MACHINE_ARCH}
+.endif
+.endif
+
+EMBEDFS_FORMAT.arm?= elf32-littlearm
+EMBEDFS_FORMAT.armv6?= elf32-littlearm
+EMBEDFS_FORMAT.mips?= elf32-tradbigmips
+EMBEDFS_FORMAT.mipsel?= elf32-tradlittlemips
+EMBEDFS_FORMAT.mips64?= elf64-tradbigmips
+EMBEDFS_FORMAT.mips64el?= elf64-tradlittlemips
+.endif
+
# Detect kernel config options that force stack frames to be turned on.
DDB_ENABLED!= grep DDB opt_ddb.h || true ; echo
DTR_ENABLED!= grep KDTRACE_FRAME opt_kdtrace.h || true ; echo
diff --git a/sys/conf/options b/sys/conf/options
index bf6c4a6..30bbc53 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -711,6 +711,7 @@ DEV_PCI opt_pci.h
DEV_PF opt_pf.h
DEV_PFLOG opt_pf.h
DEV_PFSYNC opt_pf.h
+DEV_RANDOM opt_global.h
DEV_SPLASH opt_splash.h
DEV_VLAN opt_vlan.h
@@ -946,13 +947,14 @@ RCTL opt_global.h
# The DEBUG option is in global.h as the random harvesting
# puts probes all over the place, and it makes little sense
# to pollute these headers with an extra include.
-# the DUMMY option is in global.h because it is used to
-# turn off harvesting all over the kernel.
-RANDOM_DEBUG opt_global.h
+RANDOM_DEBUG opt_random.h
# Which CSPRNG hashes we get.
-# These are mutually exclusive. With neither, Fortuna is selected.
-RANDOM_DUMMY opt_global.h
+# If Yarrow is not chosen, Fortuna is selected.
RANDOM_YARROW opt_random.h
+# With this, no entropy processor is loaded, but the entropy
+# harvesting infrastructure is present. This means an entropy
+# processor may be loaded as a module.
+RANDOM_LOADABLE opt_random.h
# Intel em(4) driver
EM_MULTIQUEUE opt_em.h
diff --git a/sys/contrib/libnv/nv_impl.h b/sys/contrib/libnv/nv_impl.h
index 7928431..b50bdf7 100644
--- a/sys/contrib/libnv/nv_impl.h
+++ b/sys/contrib/libnv/nv_impl.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -39,12 +40,14 @@ struct nvpair;
typedef struct nvpair nvpair_t;
#endif
+#define NV_TYPE_NVLIST_ARRAY_NEXT 254
#define NV_TYPE_NVLIST_UP 255
#define NV_TYPE_FIRST NV_TYPE_NULL
-#define NV_TYPE_LAST NV_TYPE_BINARY
+#define NV_TYPE_LAST NV_TYPE_DESCRIPTOR_ARRAY
-#define NV_FLAG_BIG_ENDIAN 0x80
+#define NV_FLAG_BIG_ENDIAN 0x080
+#define NV_FLAG_IN_ARRAY 0x100
#ifdef _KERNEL
#define nv_malloc(size) malloc((size), M_NVLIST, M_WAITOK)
@@ -86,6 +89,7 @@ typedef struct nvpair nvpair_t;
int *nvlist_descriptors(const nvlist_t *nvl, size_t *nitemsp);
size_t nvlist_ndescriptors(const nvlist_t *nvl);
+void nvlist_set_flags(nvlist_t *nvl, int flags);
nvpair_t *nvlist_first_nvpair(const nvlist_t *nvl);
nvpair_t *nvlist_next_nvpair(const nvlist_t *nvl, const nvpair_t *nvp);
@@ -96,6 +100,7 @@ void nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp);
bool nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp);
void nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent);
+void nvlist_set_array_next(nvlist_t *nvl, nvpair_t *ele);
const nvpair_t *nvlist_get_nvpair(const nvlist_t *nvl, const char *name);
@@ -120,18 +125,33 @@ nvpair_t *nvpair_create_stringv(const char *name, const char *valuefmt, va_list
nvpair_t *nvpair_create_nvlist(const char *name, const nvlist_t *value);
nvpair_t *nvpair_create_descriptor(const char *name, int value);
nvpair_t *nvpair_create_binary(const char *name, const void *value, size_t size);
+nvpair_t *nvpair_create_bool_array(const char *name, const bool *value, size_t nitems);
+nvpair_t *nvpair_create_number_array(const char *name, const uint64_t *value, size_t nitems);
+nvpair_t *nvpair_create_string_array(const char *name, const char * const *value, size_t nitems);
+nvpair_t *nvpair_create_nvlist_array(const char *name, const nvlist_t * const *value, size_t nitems);
+nvpair_t *nvpair_create_descriptor_array(const char *name, const int *value, size_t nitems);
nvpair_t *nvpair_move_string(const char *name, char *value);
nvpair_t *nvpair_move_nvlist(const char *name, nvlist_t *value);
nvpair_t *nvpair_move_descriptor(const char *name, int value);
nvpair_t *nvpair_move_binary(const char *name, void *value, size_t size);
-
-bool nvpair_get_bool(const nvpair_t *nvp);
-uint64_t nvpair_get_number(const nvpair_t *nvp);
-const char *nvpair_get_string(const nvpair_t *nvp);
-const nvlist_t *nvpair_get_nvlist(const nvpair_t *nvp);
-int nvpair_get_descriptor(const nvpair_t *nvp);
-const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep);
+nvpair_t *nvpair_move_bool_array(const char *name, bool *value, size_t nitems);
+nvpair_t *nvpair_move_nvlist_array(const char *name, nvlist_t **value, size_t nitems);
+nvpair_t *nvpair_move_descriptor_array(const char *name, int *value, size_t nitems);
+nvpair_t *nvpair_move_number_array(const char *name, uint64_t *value, size_t nitems);
+nvpair_t *nvpair_move_string_array(const char *name, char **value, size_t nitems);
+
+bool nvpair_get_bool(const nvpair_t *nvp);
+uint64_t nvpair_get_number(const nvpair_t *nvp);
+const char *nvpair_get_string(const nvpair_t *nvp);
+const nvlist_t *nvpair_get_nvlist(const nvpair_t *nvp);
+int nvpair_get_descriptor(const nvpair_t *nvp);
+const void *nvpair_get_binary(const nvpair_t *nvp, size_t *sizep);
+const bool *nvpair_get_bool_array(const nvpair_t *nvp, size_t *nitemsp);
+const uint64_t *nvpair_get_number_array(const nvpair_t *nvp, size_t *nitemsp);
+const char * const *nvpair_get_string_array(const nvpair_t *nvp, size_t *nitemsp);
+const nvlist_t * const *nvpair_get_nvlist_array(const nvpair_t *nvp, size_t *nitemsp);
+const int *nvpair_get_descriptor_array(const nvpair_t *nvp, size_t *nitemsp);
void nvpair_free(nvpair_t *nvp);
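
A hedged sketch of the new array support, using only functions declared above. Error handling is elided, and the ownership conventions (the "create" variant copying the caller's array, nvlist_move_nvpair() taking over the pair) are assumptions based on the existing create/move naming:

static void
nvpair_array_example(void)
{
	uint64_t numbers[3] = { 1, 2, 3 };
	const uint64_t *out;
	size_t nitems;
	nvlist_t *nvl;

	nvl = nvlist_create(0);

	/* Assumed: the "create" variant copies the caller's array. */
	nvlist_move_nvpair(nvl,
	    nvpair_create_number_array("numbers", numbers, 3));

	out = nvpair_get_number_array(
	    nvlist_get_nvpair(nvl, "numbers"), &nitems);
	PJDLOG_ASSERT(nitems == 3 && out[2] == 3);

	nvlist_destroy(nvl);
}
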
diff --git a/sys/contrib/libnv/nvlist.c b/sys/contrib/libnv/nvlist.c
index edcd074..cf8281e 100644
--- a/sys/contrib/libnv/nvlist.c
+++ b/sys/contrib/libnv/nvlist.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -88,7 +89,7 @@ __FBSDID("$FreeBSD$");
#endif
#endif
-#define NV_FLAG_PRIVATE_MASK (NV_FLAG_BIG_ENDIAN)
+#define NV_FLAG_PRIVATE_MASK (NV_FLAG_BIG_ENDIAN | NV_FLAG_IN_ARRAY)
#define NV_FLAG_PUBLIC_MASK (NV_FLAG_IGNORE_CASE | NV_FLAG_NO_UNIQUE)
#define NV_FLAG_ALL_MASK (NV_FLAG_PRIVATE_MASK | NV_FLAG_PUBLIC_MASK)
@@ -98,6 +99,7 @@ struct nvlist {
int nvl_error;
int nvl_flags;
nvpair_t *nvl_parent;
+ nvpair_t *nvl_array_next;
struct nvl_head nvl_head;
};
@@ -135,6 +137,7 @@ nvlist_create(int flags)
nvl->nvl_error = 0;
nvl->nvl_flags = flags;
nvl->nvl_parent = NULL;
+ nvl->nvl_array_next = NULL;
TAILQ_INIT(&nvl->nvl_head);
nvl->nvl_magic = NVLIST_MAGIC;
@@ -157,6 +160,10 @@ nvlist_destroy(nvlist_t *nvl)
nvlist_remove_nvpair(nvl, nvp);
nvpair_free(nvp);
}
+ if (nvl->nvl_array_next != NULL)
+ nvpair_free_structure(nvl->nvl_array_next);
+ nvl->nvl_array_next = NULL;
+ nvl->nvl_parent = NULL;
nvl->nvl_magic = 0;
nv_free(nvl);
@@ -223,6 +230,59 @@ nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent)
nvl->nvl_parent = parent;
}
+void
+nvlist_set_array_next(nvlist_t *nvl, nvpair_t *ele)
+{
+
+ NVLIST_ASSERT(nvl);
+
+ if (ele != NULL)
+ nvl->nvl_flags |= NV_FLAG_IN_ARRAY;
+ else
+ nvl->nvl_flags &= ~NV_FLAG_IN_ARRAY;
+
+ nvl->nvl_array_next = ele;
+}
+
+bool
+nvlist_in_array(const nvlist_t *nvl)
+{
+
+ NVLIST_ASSERT(nvl);
+
+ return ((nvl->nvl_flags & NV_FLAG_IN_ARRAY) != 0);
+}
+
+const nvlist_t *
+nvlist_get_array_next(const nvlist_t *nvl)
+{
+ nvpair_t *nvp;
+
+ NVLIST_ASSERT(nvl);
+
+ nvp = nvl->nvl_array_next;
+ if (nvp == NULL)
+ return (NULL);
+
+ return (nvpair_get_nvlist(nvp));
+}
+
+const nvlist_t *
+nvlist_get_pararr(const nvlist_t *nvl, void **cookiep)
+{
+ const nvlist_t *ret;
+
+ ret = nvlist_get_array_next(nvl);
+ if (ret != NULL) {
+ if (cookiep != NULL)
+ *cookiep = NULL;
+ return (ret);
+ }
+
+ ret = nvlist_get_parent(nvl, cookiep);
+ return (ret);
+}
+
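nvlist_get_pararr() gives the iterative walkers a single way to ascend: it follows the array-next link when the list is an array element with a successor, and otherwise falls back to the parent. The traversal skeleton shared by the dump, size, and pack loops below is, in condensed form:

        while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
                do {
                        cookie = NULL;
                        nvl = nvlist_get_pararr(nvl, &cookie);
                        if (nvl == NULL)
                                return;         /* left the root list */
                        if (nvlist_in_array(nvl) && cookie == NULL)
                                nvp = nvlist_first_nvpair(nvl); /* entered next array element */
                        else
                                nvp = cookie;   /* resumed in the parent */
                } while (nvp == NULL);
        }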
bool
nvlist_empty(const nvlist_t *nvl)
{
@@ -239,9 +299,18 @@ nvlist_flags(const nvlist_t *nvl)
NVLIST_ASSERT(nvl);
PJDLOG_ASSERT(nvl->nvl_error == 0);
- PJDLOG_ASSERT((nvl->nvl_flags & ~(NV_FLAG_PUBLIC_MASK)) == 0);
- return (nvl->nvl_flags);
+ return (nvl->nvl_flags & NV_FLAG_PUBLIC_MASK);
+}
+
+void
+nvlist_set_flags(nvlist_t *nvl, int flags)
+{
+
+ NVLIST_ASSERT(nvl);
+ PJDLOG_ASSERT(nvl->nvl_error == 0);
+
+ nvl->nvl_flags = flags;
}
static void
@@ -418,17 +487,129 @@ nvlist_dump(const nvlist_t *nvl, int fd)
dprintf(fd, "\n");
break;
}
+ case NV_TYPE_BOOL_ARRAY:
+ {
+ const bool *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_bool_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ dprintf(fd, "%s", value[ii] ? "TRUE" : "FALSE");
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_STRING_ARRAY:
+ {
+ const char * const *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_string_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL)
+ dprintf(fd, "NULL");
+ else
+ dprintf(fd, "\"%s\"", value[ii]);
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_NUMBER_ARRAY:
+ {
+ const uint64_t *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_number_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ dprintf(fd, "%ju (%jd) (0x%jx)",
+ value[ii], value[ii], value[ii]);
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ {
+ const int *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_descriptor_array(nvp, &nitems);
+ dprintf(fd, " [ ");
+ for (ii = 0; ii < nitems; ii++) {
+ dprintf(fd, "%d", value[ii]);
+ if (ii != nitems - 1)
+ dprintf(fd, ", ");
+ }
+ dprintf(fd, " ]\n");
+ break;
+ }
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const *value;
+ unsigned int ii;
+ size_t nitems;
+
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ dprintf(fd, " %zu\n", nitems);
+ tmpnvl = NULL;
+ tmpnvp = NULL;
+ for (ii = 0; ii < nitems; ii++) {
+ if (nvlist_dump_error_check(value[ii], fd,
+ level + 1)) {
+ break;
+ }
+
+ if (tmpnvl == NULL) {
+ tmpnvp = nvlist_first_nvpair(value[ii]);
+ if (tmpnvp != NULL) {
+ tmpnvl = value[ii];
+ } else {
+ dprintf(fd, "%*s,\n",
+ (level + 1) * 4, "");
+ }
+ }
+ }
+ if (tmpnvp != NULL) {
+ nvl = tmpnvl;
+ nvp = tmpnvp;
+ level++;
+ continue;
+ }
+ break;
+ }
default:
PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp));
}
while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
- cookie = NULL;
- nvl = nvlist_get_parent(nvl, &cookie);
- if (nvl == NULL)
- return;
- nvp = cookie;
- level--;
+ do {
+ cookie = NULL;
+ if (nvlist_in_array(nvl))
+ dprintf(fd, "%*s,\n", level * 4, "");
+ nvl = nvlist_get_pararr(nvl, &cookie);
+ if (nvl == NULL)
+ return;
+ if (nvlist_in_array(nvl) && cookie == NULL) {
+ nvp = nvlist_first_nvpair(nvl);
+ } else {
+ nvp = cookie;
+ level--;
+ }
+ } while (nvp == NULL);
+ if (nvlist_in_array(nvl) && cookie == NULL)
+ break;
}
}
}
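With these cases in place, nvlist_dump() renders scalar-array values inline; for a three-element number array named "numbers" the output would look roughly like this (exact prefix depends on nesting level):

        numbers (NUMBER ARRAY): [ 1 (1) (0x1), 2 (2) (0x2), 3 (3) (0x3) ]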
@@ -449,9 +630,11 @@ size_t
nvlist_size(const nvlist_t *nvl)
{
const nvlist_t *tmpnvl;
+ const nvlist_t * const *nvlarray;
const nvpair_t *nvp, *tmpnvp;
void *cookie;
- size_t size;
+ size_t size, nitems;
+ unsigned int ii;
NVLIST_ASSERT(nvl);
PJDLOG_ASSERT(nvl->nvl_error == 0);
@@ -472,16 +655,47 @@ nvlist_size(const nvlist_t *nvl)
nvp = tmpnvp;
continue;
}
+ } else if (nvpair_type(nvp) == NV_TYPE_NVLIST_ARRAY) {
+ nvlarray = nvpair_get_nvlist_array(nvp, &nitems);
+ PJDLOG_ASSERT(nitems > 0);
+
+ size += (nvpair_header_size() + 1) * nitems;
+ size += sizeof(struct nvlist_header) * nitems;
+
+ tmpnvl = NULL;
+ tmpnvp = NULL;
+ for (ii = 0; ii < nitems; ii++) {
+ PJDLOG_ASSERT(nvlarray[ii]->nvl_error == 0);
+ tmpnvp = nvlist_first_nvpair(nvlarray[ii]);
+ if (tmpnvp != NULL) {
+ tmpnvl = nvlarray[ii];
+ break;
+ }
+ }
+ if (tmpnvp != NULL) {
+ nvp = tmpnvp;
+ nvl = tmpnvl;
+ continue;
+ }
+
} else {
size += nvpair_size(nvp);
}
while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
- cookie = NULL;
- nvl = nvlist_get_parent(nvl, &cookie);
- if (nvl == NULL)
- goto out;
- nvp = cookie;
+ do {
+ cookie = NULL;
+ nvl = nvlist_get_pararr(nvl, &cookie);
+ if (nvl == NULL)
+ goto out;
+ if (nvlist_in_array(nvl) && cookie == NULL) {
+ nvp = nvlist_first_nvpair(nvl);
+ } else {
+ nvp = cookie;
+ }
+ } while (nvp == NULL);
+ if (nvlist_in_array(nvl) && cookie == NULL)
+ break;
}
}
@@ -508,13 +722,40 @@ nvlist_xdescriptors(const nvlist_t *nvl, int *descs)
*descs = nvpair_get_descriptor(nvp);
descs++;
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ {
+ const int *value;
+ size_t nitems;
+ unsigned int ii;
+
+ value = nvpair_get_descriptor_array(nvp,
+ &nitems);
+ for (ii = 0; ii < nitems; ii++) {
+ *descs = value[ii];
+ descs++;
+ }
+ break;
+ }
case NV_TYPE_NVLIST:
nvl = nvpair_get_nvlist(nvp);
nvp = NULL;
break;
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const *value;
+ size_t nitems;
+
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ PJDLOG_ASSERT(value != NULL);
+ PJDLOG_ASSERT(nitems > 0);
+
+ nvl = value[0];
+ nvp = NULL;
+ break;
+ }
}
}
- } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL);
+ } while ((nvl = nvlist_get_pararr(nvl, (void**)&nvp)) != NULL);
return (descs);
}
@@ -564,9 +805,31 @@ nvlist_ndescriptors(const nvlist_t *nvl)
nvl = nvpair_get_nvlist(nvp);
nvp = NULL;
break;
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const *value;
+ size_t nitems;
+
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ PJDLOG_ASSERT(value != NULL);
+ PJDLOG_ASSERT(nitems > 0);
+
+ nvl = value[0];
+ nvp = NULL;
+ break;
+ }
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ {
+ size_t nitems;
+
+ (void)nvpair_get_descriptor_array(nvp,
+ &nitems);
+ ndescs += nitems;
+ break;
+ }
}
}
- } while ((nvl = nvlist_get_parent(nvl, (void**)&nvp)) != NULL);
+ } while ((nvl = nvlist_get_pararr(nvl, (void**)&nvp)) != NULL);
return (ndescs);
#else
@@ -661,24 +924,86 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep)
case NV_TYPE_DESCRIPTOR:
ptr = nvpair_pack_descriptor(nvp, ptr, fdidxp, &left);
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ ptr = nvpair_pack_descriptor_array(nvp, ptr, fdidxp,
+ &left);
+ break;
#endif
case NV_TYPE_BINARY:
ptr = nvpair_pack_binary(nvp, ptr, &left);
break;
+ case NV_TYPE_BOOL_ARRAY:
+ ptr = nvpair_pack_bool_array(nvp, ptr, &left);
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ ptr = nvpair_pack_number_array(nvp, ptr, &left);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ ptr = nvpair_pack_string_array(nvp, ptr, &left);
+ break;
+ case NV_TYPE_NVLIST_ARRAY:
+ {
+ const nvlist_t * const * value;
+ size_t nitems;
+ unsigned int ii;
+
+ tmpnvl = NULL;
+ value = nvpair_get_nvlist_array(nvp, &nitems);
+ for (ii = 0; ii < nitems; ii++) {
+ ptr = nvlist_pack_header(value[ii], ptr, &left);
+ if (ptr == NULL)
+ goto out;
+ tmpnvp = nvlist_first_nvpair(value[ii]);
+ if (tmpnvp != NULL) {
+ tmpnvl = value[ii];
+ break;
+ }
+ ptr = nvpair_pack_nvlist_array_next(ptr, &left);
+ if (ptr == NULL)
+ goto out;
+ }
+ if (tmpnvl != NULL) {
+ nvl = tmpnvl;
+ nvp = tmpnvp;
+ continue;
+ }
+ break;
+ }
default:
PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp));
}
if (ptr == NULL)
goto fail;
while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
- cookie = NULL;
- nvl = nvlist_get_parent(nvl, &cookie);
- if (nvl == NULL)
- goto out;
- nvp = cookie;
- ptr = nvpair_pack_nvlist_up(ptr, &left);
- if (ptr == NULL)
- goto fail;
+ do {
+ cookie = NULL;
+ if (nvlist_in_array(nvl)) {
+ ptr = nvpair_pack_nvlist_array_next(ptr,
+ &left);
+ if (ptr == NULL)
+ goto fail;
+ }
+ nvl = nvlist_get_pararr(nvl, &cookie);
+ if (nvl == NULL)
+ goto out;
+ if (nvlist_in_array(nvl) && cookie == NULL) {
+ nvp = nvlist_first_nvpair(nvl);
+ ptr = nvlist_pack_header(nvl, ptr,
+ &left);
+ if (ptr == NULL)
+ goto fail;
+ } else if (nvpair_type((nvpair_t *)cookie) !=
+ NV_TYPE_NVLIST_ARRAY) {
+ ptr = nvpair_pack_nvlist_up(ptr, &left);
+ if (ptr == NULL)
+ goto fail;
+ nvp = cookie;
+ } else {
+ nvp = cookie;
+ }
+ } while (nvp == NULL);
+ if (nvlist_in_array(nvl) && cookie == NULL)
+ break;
}
}
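Packed nvlist arrays thus carry one nvlist header per element, separated by NVLIST_ARRAY_NEXT markers, but the public entry points are unchanged; a userland round trip is still just (a sketch, assuming the three-argument nvlist_unpack() of this API and no descriptors):

        size_t size;
        void *buf;
        nvlist_t *copy;

        buf = nvlist_pack(nvl, &size);          /* walks arrays via the loop above */
        copy = nvlist_unpack(buf, size, 0);     /* rebuilds the element chain */
        free(buf);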
@@ -741,6 +1066,7 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds,
bool *isbep, size_t *leftp)
{
struct nvlist_header nvlhdr;
+ int inarrayf;
if (*leftp < sizeof(nvlhdr))
goto failed;
@@ -762,7 +1088,8 @@ nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds,
if ((nvlhdr.nvlh_flags & ~NV_FLAG_ALL_MASK) != 0)
goto failed;
- nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK);
+ inarrayf = (nvl->nvl_flags & NV_FLAG_IN_ARRAY);
+ nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK) | inarrayf;
ptr += sizeof(nvlhdr);
if (isbep != NULL)
@@ -780,7 +1107,7 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
int flags)
{
const unsigned char *ptr;
- nvlist_t *nvl, *retnvl, *tmpnvl;
+ nvlist_t *nvl, *retnvl, *tmpnvl, *array;
nvpair_t *nvp;
size_t left;
bool isbe;
@@ -790,7 +1117,7 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
left = size;
ptr = buf;
- tmpnvl = NULL;
+ tmpnvl = array = NULL;
nvl = retnvl = nvlist_create(0);
if (nvl == NULL)
goto failed;
@@ -832,6 +1159,10 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
ptr = nvpair_unpack_descriptor(isbe, nvp, ptr, &left,
fds, nfds);
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ ptr = nvpair_unpack_descriptor_array(isbe, nvp, ptr,
+ &left, fds, nfds);
+ break;
#endif
case NV_TYPE_BINARY:
ptr = nvpair_unpack_binary(isbe, nvp, ptr, &left);
@@ -842,6 +1173,44 @@ nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds,
nvl = nvpair_nvlist(nvl->nvl_parent);
nvpair_free_structure(nvp);
continue;
+ case NV_TYPE_NVLIST_ARRAY_NEXT:
+ if (nvl->nvl_array_next == NULL) {
+ if (nvl->nvl_parent == NULL)
+ goto failed;
+ nvl = nvpair_nvlist(nvl->nvl_parent);
+ } else {
+ nvl = __DECONST(nvlist_t *,
+ nvlist_get_array_next(nvl));
+ ptr = nvlist_unpack_header(nvl, ptr, nfds,
+ &isbe, &left);
+ if (ptr == NULL)
+ goto failed;
+ }
+ nvpair_free_structure(nvp);
+ continue;
+ case NV_TYPE_BOOL_ARRAY:
+ ptr = nvpair_unpack_bool_array(isbe, nvp, ptr, &left);
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ ptr = nvpair_unpack_number_array(isbe, nvp, ptr, &left);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ ptr = nvpair_unpack_string_array(isbe, nvp, ptr, &left);
+ break;
+ case NV_TYPE_NVLIST_ARRAY:
+ ptr = nvpair_unpack_nvlist_array(isbe, nvp, ptr, &left,
+ &array);
+ if (ptr == NULL)
+ goto failed;
+ tmpnvl = array;
+ while (array != NULL) {
+ nvlist_set_parent(array, nvp);
+ array = __DECONST(nvlist_t *,
+ nvlist_get_array_next(array));
+ }
+ ptr = nvlist_unpack_header(tmpnvl, ptr, nfds, &isbe,
+ &left);
+ break;
default:
PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp));
}
@@ -1062,10 +1431,15 @@ NVLIST_EXISTS(bool, BOOL)
NVLIST_EXISTS(number, NUMBER)
NVLIST_EXISTS(string, STRING)
NVLIST_EXISTS(nvlist, NVLIST)
+NVLIST_EXISTS(binary, BINARY)
+NVLIST_EXISTS(bool_array, BOOL_ARRAY)
+NVLIST_EXISTS(number_array, NUMBER_ARRAY)
+NVLIST_EXISTS(string_array, STRING_ARRAY)
+NVLIST_EXISTS(nvlist_array, NVLIST_ARRAY)
#ifndef _KERNEL
NVLIST_EXISTS(descriptor, DESCRIPTOR)
+NVLIST_EXISTS(descriptor_array, DESCRIPTOR_ARRAY)
#endif
-NVLIST_EXISTS(binary, BINARY)
#undef NVLIST_EXISTS
@@ -1198,6 +1572,37 @@ NVLIST_ADD(int, descriptor);
#undef NVLIST_ADD
+#define NVLIST_ADD_ARRAY(vtype, type) \
+void \
+nvlist_add_##type##_array(nvlist_t *nvl, const char *name, vtype value, \
+ size_t nitems) \
+{ \
+ nvpair_t *nvp; \
+ \
+ if (nvlist_error(nvl) != 0) { \
+ ERRNO_SET(nvlist_error(nvl)); \
+ return; \
+ } \
+ \
+ nvp = nvpair_create_##type##_array(name, value, nitems); \
+ if (nvp == NULL) { \
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM); \
+ ERRNO_SET(nvl->nvl_error); \
+ } else { \
+ (void)nvlist_move_nvpair(nvl, nvp); \
+ } \
+}
+
+NVLIST_ADD_ARRAY(const bool *, bool)
+NVLIST_ADD_ARRAY(const uint64_t *, number)
+NVLIST_ADD_ARRAY(const char * const *, string)
+NVLIST_ADD_ARRAY(const nvlist_t * const *, nvlist)
+#ifndef _KERNEL
+NVLIST_ADD_ARRAY(const int *, descriptor)
+#endif
+
+#undef NVLIST_ADD_ARRAY
+
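The generated adders copy their input (strings and nvlists deeply, via the nvpair_create_*_array() constructors), so the caller keeps ownership of what it passed in; for example:

        const bool flags[] = { true, false, true };

        nvlist_add_bool_array(nvl, "flags", flags, 3);  /* caller's array is untouched */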
bool
nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
@@ -1306,6 +1711,131 @@ nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size)
}
}
+void
+nvlist_move_bool_array(nvlist_t *nvl, const char *name, bool *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (nvlist_error(nvl) != 0) {
+ nv_free(value);
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_bool_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+void
+nvlist_move_string_array(nvlist_t *nvl, const char *name, char **value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ if (nvlist_error(nvl) != 0) {
+ if (value != NULL) {
+ for (i = 0; i < nitems; i++)
+ nv_free(value[i]);
+ nv_free(value);
+ }
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_string_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+void
+nvlist_move_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ if (nvlist_error(nvl) != 0) {
+ if (value != NULL) {
+ for (i = 0; i < nitems; i++) {
+ if (nvlist_get_pararr(value[i], NULL) == NULL)
+ nvlist_destroy(value[i]);
+ }
+ }
+ nv_free(value);
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_nvlist_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+void
+nvlist_move_number_array(nvlist_t *nvl, const char *name, uint64_t *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (nvlist_error(nvl) != 0) {
+ nv_free(value);
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_number_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+
+#ifndef _KERNEL
+void
+nvlist_move_descriptor_array(nvlist_t *nvl, const char *name, int *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ if (nvlist_error(nvl) != 0) {
+ if (value != NULL) {
+ for (i = 0; i < nitems; i++)
+ close(value[i]);
+ nv_free(value);
+ }
+
+ ERRNO_SET(nvlist_error(nvl));
+ return;
+ }
+
+ nvp = nvpair_move_descriptor_array(name, value, nitems);
+ if (nvp == NULL) {
+ nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+ ERRNO_SET(nvl->nvl_error);
+ } else {
+ (void)nvlist_move_nvpair(nvl, nvp);
+ }
+}
+#endif
+
const nvpair_t *
nvlist_get_nvpair(const nvlist_t *nvl, const char *name)
{
@@ -1347,6 +1877,29 @@ nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep)
return (nvpair_get_binary(nvp, sizep));
}
+#define NVLIST_GET_ARRAY(ftype, type, TYPE) \
+ftype \
+nvlist_get_##type##_array(const nvlist_t *nvl, const char *name, \
+ size_t *nitems) \
+{ \
+ const nvpair_t *nvp; \
+ \
+ nvp = nvlist_find(nvl, NV_TYPE_##TYPE##_ARRAY, name); \
+ if (nvp == NULL) \
+ nvlist_report_missing(NV_TYPE_##TYPE##_ARRAY, name); \
+ return (nvpair_get_##type##_array(nvp, nitems)); \
+}
+
+NVLIST_GET_ARRAY(const bool *, bool, BOOL)
+NVLIST_GET_ARRAY(const uint64_t *, number, NUMBER)
+NVLIST_GET_ARRAY(const char * const *, string, STRING)
+NVLIST_GET_ARRAY(const nvlist_t * const *, nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_GET_ARRAY(const int *, descriptor, DESCRIPTOR)
+#endif
+
+#undef NVLIST_GET_ARRAY
+
#define NVLIST_TAKE(ftype, type, TYPE) \
ftype \
nvlist_take_##type(nvlist_t *nvl, const char *name) \
@@ -1389,6 +1942,31 @@ nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep)
return (value);
}
+#define NVLIST_TAKE_ARRAY(ftype, type, TYPE) \
+ftype \
+nvlist_take_##type##_array(nvlist_t *nvl, const char *name, \
+ size_t *nitems) \
+{ \
+ nvpair_t *nvp; \
+ ftype value; \
+ \
+ nvp = nvlist_find(nvl, NV_TYPE_##TYPE##_ARRAY, name); \
+ if (nvp == NULL) \
+ nvlist_report_missing(NV_TYPE_##TYPE##_ARRAY, name); \
+ value = (ftype)(intptr_t)nvpair_get_##type##_array(nvp, nitems);\
+ nvlist_remove_nvpair(nvl, nvp); \
+ nvpair_free_structure(nvp); \
+ return (value); \
+}
+
+NVLIST_TAKE_ARRAY(bool *, bool, BOOL)
+NVLIST_TAKE_ARRAY(uint64_t *, number, NUMBER)
+NVLIST_TAKE_ARRAY(char **, string, STRING)
+NVLIST_TAKE_ARRAY(nvlist_t **, nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_TAKE_ARRAY(int *, descriptor, DESCRIPTOR)
+#endif
+
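Unlike the get accessors, the generated take accessors remove the pair and transfer ownership of the backing storage to the caller, e.g. (userland sketch; a kernel consumer would release the buffer with nv_free() instead):

        size_t nitems;
        uint64_t *nums;

        nums = nvlist_take_number_array(nvl, "numbers", &nitems);
        /* ... use nums[0 .. nitems - 1] ... */
        free(nums);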
void
nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
@@ -1420,10 +1998,15 @@ NVLIST_FREE(bool, BOOL)
NVLIST_FREE(number, NUMBER)
NVLIST_FREE(string, STRING)
NVLIST_FREE(nvlist, NVLIST)
+NVLIST_FREE(binary, BINARY)
+NVLIST_FREE(bool_array, BOOL_ARRAY)
+NVLIST_FREE(number_array, NUMBER_ARRAY)
+NVLIST_FREE(string_array, STRING_ARRAY)
+NVLIST_FREE(nvlist_array, NVLIST_ARRAY)
#ifndef _KERNEL
NVLIST_FREE(descriptor, DESCRIPTOR)
+NVLIST_FREE(descriptor_array, DESCRIPTOR_ARRAY)
#endif
-NVLIST_FREE(binary, BINARY)
#undef NVLIST_FREE
diff --git a/sys/contrib/libnv/nvlist_impl.h b/sys/contrib/libnv/nvlist_impl.h
index 18ccebf..9952db8 100644
--- a/sys/contrib/libnv/nvlist_impl.h
+++ b/sys/contrib/libnv/nvlist_impl.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
diff --git a/sys/contrib/libnv/nvpair.c b/sys/contrib/libnv/nvpair.c
index 7146767..1e3bd0e 100644
--- a/sys/contrib/libnv/nvpair.c
+++ b/sys/contrib/libnv/nvpair.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -86,6 +87,7 @@ struct nvpair {
int nvp_type;
uint64_t nvp_data;
size_t nvp_datasize;
+ size_t nvp_nitems; /* Used only for array types. */
nvlist_t *nvp_list;
TAILQ_ENTRY(nvpair) nvp_next;
};
@@ -99,6 +101,7 @@ struct nvpair_header {
uint8_t nvph_type;
uint16_t nvph_namesize;
uint64_t nvph_datasize;
+ uint64_t nvph_nitems;
} __packed;
@@ -109,6 +112,36 @@ nvpair_assert(const nvpair_t *nvp)
NVPAIR_ASSERT(nvp);
}
+static nvpair_t *
+nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t namelen;
+
+ PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST);
+
+ namelen = strlen(name);
+ if (namelen >= NV_NAME_MAX) {
+ ERRNO_SET(ENAMETOOLONG);
+ return (NULL);
+ }
+
+ nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1);
+ if (nvp != NULL) {
+ nvp->nvp_name = (char *)(nvp + 1);
+ memcpy(nvp->nvp_name, name, namelen);
+ nvp->nvp_name[namelen] = '\0';
+ nvp->nvp_type = type;
+ nvp->nvp_data = data;
+ nvp->nvp_datasize = datasize;
+ nvp->nvp_nitems = nitems;
+ nvp->nvp_magic = NVPAIR_MAGIC;
+ }
+
+ return (nvp);
+}
+
nvlist_t *
nvpair_nvlist(const nvpair_t *nvp)
{
@@ -162,6 +195,19 @@ nvpair_remove_nvlist(nvpair_t *nvp)
nvlist_set_parent(nvl, NULL);
}
+static void
+nvpair_remove_nvlist_array(nvpair_t *nvp)
+{
+ nvlist_t **nvlarray;
+ size_t count, i;
+
+ /* XXX: DECONST is bad, mkay? */
+ nvlarray = __DECONST(nvlist_t **,
+ nvpair_get_nvlist_array(nvp, &count));
+ for (i = 0; i < count; i++)
+ nvlist_set_array_next(nvlarray[i], NULL);
+}
+
void
nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl)
{
@@ -171,6 +217,8 @@ nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl)
if (nvpair_type(nvp) == NV_TYPE_NVLIST)
nvpair_remove_nvlist(nvp);
+ else if (nvpair_type(nvp) == NV_TYPE_NVLIST_ARRAY)
+ nvpair_remove_nvlist_array(nvp);
TAILQ_REMOVE(head, nvp, nvp_next);
nvp->nvp_list = NULL;
@@ -204,16 +252,36 @@ nvpair_clone(const nvpair_t *nvp)
case NV_TYPE_NVLIST:
newnvp = nvpair_create_nvlist(name, nvpair_get_nvlist(nvp));
break;
+ case NV_TYPE_BINARY:
+ data = nvpair_get_binary(nvp, &datasize);
+ newnvp = nvpair_create_binary(name, data, datasize);
+ break;
+ case NV_TYPE_BOOL_ARRAY:
+ data = nvpair_get_bool_array(nvp, &datasize);
+ newnvp = nvpair_create_bool_array(name, data, datasize);
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ data = nvpair_get_number_array(nvp, &datasize);
+ newnvp = nvpair_create_number_array(name, data, datasize);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ data = nvpair_get_string_array(nvp, &datasize);
+ newnvp = nvpair_create_string_array(name, data, datasize);
+ break;
+ case NV_TYPE_NVLIST_ARRAY:
+ data = nvpair_get_nvlist_array(nvp, &datasize);
+ newnvp = nvpair_create_nvlist_array(name, data, datasize);
+ break;
#ifndef _KERNEL
case NV_TYPE_DESCRIPTOR:
newnvp = nvpair_create_descriptor(name,
nvpair_get_descriptor(nvp));
break;
-#endif
- case NV_TYPE_BINARY:
- data = nvpair_get_binary(nvp, &datasize);
- newnvp = nvpair_create_binary(name, data, datasize);
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ data = nvpair_get_descriptor_array(nvp, &datasize);
+ newnvp = nvpair_create_descriptor_array(name, data, datasize);
break;
+#endif
default:
PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp));
}
@@ -250,6 +318,7 @@ nvpair_pack_header(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
PJDLOG_ASSERT(namesize > 0 && namesize <= UINT16_MAX);
nvphdr.nvph_namesize = namesize;
nvphdr.nvph_datasize = nvp->nvp_datasize;
+ nvphdr.nvph_nitems = nvp->nvp_nitems;
PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
memcpy(ptr, &nvphdr, sizeof(nvphdr));
ptr += sizeof(nvphdr);
@@ -336,6 +405,32 @@ nvpair_pack_nvlist_up(unsigned char *ptr, size_t *leftp)
nvphdr.nvph_type = NV_TYPE_NVLIST_UP;
nvphdr.nvph_namesize = namesize;
nvphdr.nvph_datasize = 0;
+ nvphdr.nvph_nitems = 0;
+ PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
+ memcpy(ptr, &nvphdr, sizeof(nvphdr));
+ ptr += sizeof(nvphdr);
+ *leftp -= sizeof(nvphdr);
+
+ PJDLOG_ASSERT(*leftp >= namesize);
+ memcpy(ptr, name, namesize);
+ ptr += namesize;
+ *leftp -= namesize;
+
+ return (ptr);
+}
+
+unsigned char *
+nvpair_pack_nvlist_array_next(unsigned char *ptr, size_t *leftp)
+{
+ struct nvpair_header nvphdr;
+ size_t namesize;
+ const char *name = "";
+
+ namesize = 1;
+ nvphdr.nvph_type = NV_TYPE_NVLIST_ARRAY_NEXT;
+ nvphdr.nvph_namesize = namesize;
+ nvphdr.nvph_datasize = 0;
+ nvphdr.nvph_nitems = 0;
PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
memcpy(ptr, &nvphdr, sizeof(nvphdr));
ptr += sizeof(nvphdr);
@@ -396,6 +491,106 @@ nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
return (ptr);
}
+unsigned char *
+nvpair_pack_bool_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize);
+ ptr += nvp->nvp_datasize;
+ *leftp -= nvp->nvp_datasize;
+
+ return (ptr);
+}
+
+unsigned char *
+nvpair_pack_number_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize);
+ ptr += nvp->nvp_datasize;
+ *leftp -= nvp->nvp_datasize;
+
+ return (ptr);
+}
+
+unsigned char *
+nvpair_pack_string_array(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+ unsigned int ii;
+ size_t size, len;
+ const char * const *array;
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ size = 0;
+ array = nvpair_get_string_array(nvp, NULL);
+ PJDLOG_ASSERT(array != NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ len = strlen(array[ii]) + 1;
+ PJDLOG_ASSERT(*leftp >= len);
+
+ memcpy(ptr, (const void *)array[ii], len);
+ size += len;
+ ptr += len;
+ *leftp -= len;
+ }
+
+ PJDLOG_ASSERT(size == nvp->nvp_datasize);
+
+ return (ptr);
+}
+
+#ifndef _KERNEL
+unsigned char *
+nvpair_pack_descriptor_array(const nvpair_t *nvp, unsigned char *ptr,
+ int64_t *fdidxp, size_t *leftp)
+{
+ int64_t value;
+ const int *array;
+ unsigned int ii;
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY);
+ PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+
+ array = nvpair_get_descriptor_array(nvp, NULL);
+ PJDLOG_ASSERT(array != NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ PJDLOG_ASSERT(*leftp >= sizeof(value));
+
+ value = array[ii];
+ if (value != -1) {
+ /*
+ * If there is a real descriptor here, we change its
+ * number to its position in the array of descriptors
+ * sent via the control message.
+ */
+ PJDLOG_ASSERT(fdidxp != NULL);
+
+ value = *fdidxp;
+ (*fdidxp)++;
+ }
+ memcpy(ptr, &value, sizeof(value));
+ ptr += sizeof(value);
+ *leftp -= sizeof(value);
+ }
+
+ return (ptr);
+}
+#endif
+
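Concretely, packing a descriptor array { 5, -1, 12 } with *fdidxp starting at 0 writes { 0, -1, 1 } into the buffer and leaves *fdidxp at 2: real descriptors travel as positions in the out-of-band fds[] array carried by the control message, the -1 placeholder is preserved as-is, and nvpair_unpack_descriptor_array() below maps the indices back through fds[].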
void
nvpair_init_datasize(nvpair_t *nvp)
{
@@ -430,7 +625,8 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
goto failed;
#endif
if (nvphdr.nvph_type > NV_TYPE_LAST &&
- nvphdr.nvph_type != NV_TYPE_NVLIST_UP) {
+ nvphdr.nvph_type != NV_TYPE_NVLIST_UP &&
+ nvphdr.nvph_type != NV_TYPE_NVLIST_ARRAY_NEXT) {
goto failed;
}
@@ -467,6 +663,7 @@ nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
nvp->nvp_type = nvphdr.nvph_type;
nvp->nvp_data = 0;
nvp->nvp_datasize = nvphdr.nvph_datasize;
+ nvp->nvp_nitems = nvphdr.nvph_nitems;
return (ptr);
failed:
@@ -540,6 +737,7 @@ nvpair_unpack_number(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
nvp->nvp_data = be64dec(ptr);
else
nvp->nvp_data = le64dec(ptr);
+
ptr += sizeof(uint64_t);
*leftp -= sizeof(uint64_t);
@@ -670,6 +868,234 @@ nvpair_unpack_binary(bool isbe __unused, nvpair_t *nvp,
}
const unsigned char *
+nvpair_unpack_bool_array(bool isbe __unused, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp)
+{
+ uint8_t *value;
+ size_t size;
+ unsigned int i;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY);
+
+ size = sizeof(*value) * nvp->nvp_nitems;
+ if (nvp->nvp_datasize != size || *leftp < size ||
+ nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(size);
+ if (value == NULL)
+ return (NULL);
+
+ for (i = 0; i < nvp->nvp_nitems; i++) {
+ value[i] = *(const uint8_t *)ptr;
+
+ ptr += sizeof(*value);
+ *leftp -= sizeof(*value);
+ }
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+ return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_number_array(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
+ size_t *leftp)
+{
+ uint64_t *value;
+ size_t size;
+ unsigned int i;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY);
+
+ size = sizeof(*value) * nvp->nvp_nitems;
+ if (nvp->nvp_datasize != size || *leftp < size ||
+ nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(size);
+ if (value == NULL)
+ return (NULL);
+
+ for (i = 0; i < nvp->nvp_nitems; i++) {
+ if (isbe)
+ value[i] = be64dec(ptr);
+ else
+ value[i] = le64dec(ptr);
+
+ ptr += sizeof(*value);
+ *leftp -= sizeof(*value);
+ }
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+ return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_string_array(bool isbe __unused, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp)
+{
+ ssize_t size;
+ size_t len;
+ const char *tmp;
+ char **value;
+ unsigned int ii, j;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY);
+
+ if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0 ||
+ nvp->nvp_nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = nvp->nvp_datasize;
+ tmp = (const char *)ptr;
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ if (size <= 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+ len = strnlen(tmp, size - 1) + 1;
+ size -= len;
+ if (size < 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+ tmp += len;
+ }
+ if (size != 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(sizeof(*value) * nvp->nvp_nitems);
+ if (value == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ value[ii] = nv_strdup((const char *)ptr);
+ if (value[ii] == NULL)
+ goto out;
+ len = strlen(value[ii]) + 1;
+ ptr += len;
+ *leftp -= len;
+ }
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+ return (ptr);
+out:
+ for (j = 0; j < ii; j++)
+ nv_free(value[j]);
+ nv_free(value);
+ return (NULL);
+}
+
+#ifndef _KERNEL
+const unsigned char *
+nvpair_unpack_descriptor_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds)
+{
+ int64_t idx;
+ size_t size;
+ unsigned int ii;
+ int *array;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY);
+
+ size = sizeof(idx) * nvp->nvp_nitems;
+ if (nvp->nvp_datasize != size || *leftp < size ||
+ nvp->nvp_nitems == 0 || size < nvp->nvp_nitems) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ array = (int *)nv_malloc(size);
+ if (array == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ if (isbe)
+ idx = be64dec(ptr);
+ else
+ idx = le64dec(ptr);
+
+ if (idx < 0) {
+ ERRNO_SET(EINVAL);
+ nv_free(array);
+ return (NULL);
+ }
+
+ if ((size_t)idx >= nfds) {
+ ERRNO_SET(EINVAL);
+ nv_free(array);
+ return (NULL);
+ }
+
+ array[ii] = fds[idx];
+
+ ptr += sizeof(idx);
+ *leftp -= sizeof(idx);
+ }
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)array;
+
+ return (ptr);
+}
+#endif
+
+const unsigned char *
+nvpair_unpack_nvlist_array(bool isbe __unused, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, nvlist_t **firstel)
+{
+ nvlist_t **value;
+ nvpair_t *tmpnvp;
+ unsigned int ii, j;
+ size_t sizeup;
+
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST_ARRAY);
+
+ sizeup = sizeof(struct nvpair_header) * nvp->nvp_nitems;
+ if (nvp->nvp_nitems == 0 || sizeup < nvp->nvp_nitems ||
+ sizeup > *leftp) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ value = nv_malloc(nvp->nvp_nitems * sizeof(*value));
+ if (value == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nvp->nvp_nitems; ii++) {
+ value[ii] = nvlist_create(0);
+ if (value[ii] == NULL)
+ goto fail;
+ if (ii > 0) {
+ tmpnvp = nvpair_allocv(" ", NV_TYPE_NVLIST,
+ (uint64_t)(uintptr_t)value[ii], 0, 0);
+ if (tmpnvp == NULL)
+ goto fail;
+ nvlist_set_array_next(value[ii - 1], tmpnvp);
+ }
+ }
+ nvlist_set_flags(value[nvp->nvp_nitems - 1], NV_FLAG_IN_ARRAY);
+
+ nvp->nvp_data = (uint64_t)(uintptr_t)value;
+ *firstel = value[0];
+
+ return (ptr);
+fail:
+ ERRNO_SAVE();
+ for (j = 0; j < ii; j++)
+ nvlist_destroy(value[j]);
+ nv_free(value);
+ ERRNO_RESTORE();
+
+ return (NULL);
+}
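The unpacked elements end up chained the same way nvpair_create_nvlist_array() further down chains its clones: each element except the last owns a hidden " "-named NV_TYPE_NVLIST nvpair pointing at its successor, installed via nvlist_set_array_next(), and every element carries NV_FLAG_IN_ARRAY (set implicitly by the chaining, explicitly on the last element). For a two-element array the links are, in sketch form:

        value[0] --nvl_array_next--> nvpair(" ") --> value[1]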
+
+const unsigned char *
nvpair_unpack(bool isbe, const unsigned char *ptr, size_t *leftp,
nvpair_t **nvpp)
{
@@ -717,34 +1143,6 @@ nvpair_name(const nvpair_t *nvp)
return (nvp->nvp_name);
}
-static nvpair_t *
-nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize)
-{
- nvpair_t *nvp;
- size_t namelen;
-
- PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST);
-
- namelen = strlen(name);
- if (namelen >= NV_NAME_MAX) {
- ERRNO_SET(ENAMETOOLONG);
- return (NULL);
- }
-
- nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1);
- if (nvp != NULL) {
- nvp->nvp_name = (char *)(nvp + 1);
- memcpy(nvp->nvp_name, name, namelen);
- nvp->nvp_name[namelen] = '\0';
- nvp->nvp_type = type;
- nvp->nvp_data = data;
- nvp->nvp_datasize = datasize;
- nvp->nvp_magic = NVPAIR_MAGIC;
- }
-
- return (nvp);
-}
-
nvpair_t *
nvpair_create_stringf(const char *name, const char *valuefmt, ...)
{
@@ -778,7 +1176,7 @@ nvpair_t *
nvpair_create_null(const char *name)
{
- return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0));
+ return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0, 0));
}
nvpair_t *
@@ -786,14 +1184,14 @@ nvpair_create_bool(const char *name, bool value)
{
return (nvpair_allocv(name, NV_TYPE_BOOL, value ? 1 : 0,
- sizeof(uint8_t)));
+ sizeof(uint8_t), 0));
}
nvpair_t *
nvpair_create_number(const char *name, uint64_t value)
{
- return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value)));
+ return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value), 0));
}
nvpair_t *
@@ -814,7 +1212,7 @@ nvpair_create_string(const char *name, const char *value)
size = strlen(value) + 1;
nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)data,
- size);
+ size, 0);
if (nvp == NULL)
nv_free(data);
@@ -836,7 +1234,8 @@ nvpair_create_nvlist(const char *name, const nvlist_t *value)
if (nvl == NULL)
return (NULL);
- nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0);
+ nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0,
+ 0);
if (nvp == NULL)
nvlist_destroy(nvl);
else
@@ -861,7 +1260,7 @@ nvpair_create_descriptor(const char *name, int value)
return (NULL);
nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value,
- sizeof(int64_t));
+ sizeof(int64_t), 0);
if (nvp == NULL) {
ERRNO_SAVE();
close(value);
@@ -889,7 +1288,7 @@ nvpair_create_binary(const char *name, const void *value, size_t size)
memcpy(data, value, size);
nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)data,
- size);
+ size, 0);
if (nvp == NULL)
nv_free(data);
@@ -897,6 +1296,226 @@ nvpair_create_binary(const char *name, const void *value, size_t size)
}
nvpair_t *
+nvpair_create_bool_array(const char *name, const bool *value, size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t size;
+ void *data;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = sizeof(value[0]) * nitems;
+ data = nv_malloc(size);
+ if (data == NULL)
+ return (NULL);
+
+ memcpy(data, value, size);
+ nvp = nvpair_allocv(name, NV_TYPE_BOOL_ARRAY, (uint64_t)(uintptr_t)data,
+ size, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(data);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_create_number_array(const char *name, const uint64_t *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t size;
+ void *data;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = sizeof(value[0]) * nitems;
+ data = nv_malloc(size);
+ if (data == NULL)
+ return (NULL);
+
+ memcpy(data, value, size);
+ nvp = nvpair_allocv(name, NV_TYPE_NUMBER_ARRAY,
+ (uint64_t)(uintptr_t)data, size, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(data);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_create_string_array(const char *name, const char * const *value,
+ size_t nitems)
+{
+ nvpair_t *nvp;
+ unsigned int ii;
+ size_t datasize, size;
+ char **data;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = NULL;
+ datasize = 0;
+ data = nv_malloc(sizeof(value[0]) * nitems);
+ if (data == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL) {
+ ERRNO_SET(EINVAL);
+ goto fail;
+ }
+
+ size = strlen(value[ii]) + 1;
+ datasize += size;
+ data[ii] = nv_strdup(value[ii]);
+ if (data[ii] == NULL)
+ goto fail;
+ }
+ nvp = nvpair_allocv(name, NV_TYPE_STRING_ARRAY,
+ (uint64_t)(uintptr_t)data, datasize, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (; ii > 0; ii--)
+ nv_free(data[ii - 1]);
+ nv_free(data);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_create_nvlist_array(const char *name, const nvlist_t * const *value,
+ size_t nitems)
+{
+ unsigned int ii;
+ nvlist_t **nvls;
+ nvpair_t *nvp;
+ int flags;
+
+ nvp = NULL;
+ nvls = NULL;
+ ii = 0;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvls = nv_malloc(sizeof(value[0]) * nitems);
+ if (nvls == NULL)
+ return (NULL);
+
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL) {
+ ERRNO_SET(EINVAL);
+ goto fail;
+ }
+
+ nvls[ii] = nvlist_clone(value[ii]);
+ if (nvls[ii] == NULL)
+ goto fail;
+
+ if (ii > 0) {
+ nvp = nvpair_allocv(" ", NV_TYPE_NVLIST,
+ (uint64_t)(uintptr_t)nvls[ii], 0, 0);
+ if (nvp == NULL)
+ goto fail;
+ nvlist_set_array_next(nvls[ii - 1], nvp);
+ }
+ }
+ flags = nvlist_flags(nvls[nitems - 1]) | NV_FLAG_IN_ARRAY;
+ nvlist_set_flags(nvls[nitems - 1], flags);
+
+ nvp = nvpair_allocv(name, NV_TYPE_NVLIST_ARRAY,
+ (uint64_t)(uintptr_t)nvls, 0, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (; ii > 0; ii--)
+ nvlist_destroy(nvls[ii - 1]);
+
+ nv_free(nvls);
+ ERRNO_RESTORE();
+ } else {
+ for (ii = 0; ii < nitems; ii++)
+ nvlist_set_parent(nvls[ii], nvp);
+ }
+
+ return (nvp);
+}
+
+#ifndef _KERNEL
+nvpair_t *
+nvpair_create_descriptor_array(const char *name, const int *value,
+ size_t nitems)
+{
+ unsigned int ii;
+ nvpair_t *nvp;
+ int *fds;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = NULL;
+
+ fds = nv_malloc(sizeof(value[0]) * nitems);
+ if (fds == NULL)
+ return (NULL);
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == -1) {
+ fds[ii] = -1;
+ } else {
+ if (!fd_is_valid(value[ii])) {
+ ERRNO_SET(EBADF);
+ goto fail;
+ }
+
+ fds[ii] = fcntl(value[ii], F_DUPFD_CLOEXEC, 0);
+ if (fds[ii] == -1)
+ goto fail;
+ }
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR_ARRAY,
+ (uint64_t)(uintptr_t)fds, sizeof(int64_t) * nitems, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (; ii > 0; ii--) {
+ if (fds[ii - 1] != -1)
+ close(fds[ii - 1]);
+ }
+ nv_free(fds);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+#endif
+
+nvpair_t *
nvpair_move_string(const char *name, char *value)
{
nvpair_t *nvp;
@@ -907,7 +1526,7 @@ nvpair_move_string(const char *name, char *value)
}
nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)value,
- strlen(value) + 1);
+ strlen(value) + 1, 0);
if (nvp == NULL) {
ERRNO_SAVE();
nv_free(value);
@@ -934,7 +1553,7 @@ nvpair_move_nvlist(const char *name, nvlist_t *value)
}
nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)value,
- 0);
+ 0, 0);
if (nvp == NULL)
nvlist_destroy(value);
else
@@ -955,7 +1574,7 @@ nvpair_move_descriptor(const char *name, int value)
}
nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value,
- sizeof(int64_t));
+ sizeof(int64_t), 0);
if (nvp == NULL) {
ERRNO_SAVE();
close(value);
@@ -977,7 +1596,83 @@ nvpair_move_binary(const char *name, void *value, size_t size)
}
nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)value,
- size);
+ size, 0);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_move_bool_array(const char *name, bool *value, size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_BOOL_ARRAY,
+ (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_move_string_array(const char *name, char **value, size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i, size;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size = 0;
+ for (i = 0; i < nitems; i++) {
+ if (value[i] == NULL) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ size += strlen(value[i]) + 1;
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_STRING_ARRAY,
+ (uint64_t)(uintptr_t)value, size, nitems);
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (i = 0; i < nitems; i++)
+ nv_free(value[i]);
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+
+nvpair_t *
+nvpair_move_number_array(const char *name, uint64_t *value, size_t nitems)
+{
+ nvpair_t *nvp;
+
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_NUMBER_ARRAY,
+ (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems);
if (nvp == NULL) {
ERRNO_SAVE();
nv_free(value);
@@ -987,6 +1682,95 @@ nvpair_move_binary(const char *name, void *value, size_t size)
return (nvp);
}
+nvpair_t *
+nvpair_move_nvlist_array(const char *name, nvlist_t **value, size_t nitems)
+{
+ unsigned int ii;
+ nvpair_t *nvp;
+ int flags;
+
+ nvp = NULL;
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] == NULL || nvlist_error(value[ii]) != 0 ||
+ nvlist_get_pararr(value[ii], NULL) != NULL) {
+ ERRNO_SET(EINVAL);
+ goto fail;
+ }
+ if (ii > 0) {
+ nvp = nvpair_allocv(" ", NV_TYPE_NVLIST,
+ (uint64_t)(uintptr_t)value[ii], 0, 0);
+ if (nvp == NULL)
+ goto fail;
+ nvlist_set_array_next(value[ii - 1], nvp);
+ }
+ }
+ flags = nvlist_flags(value[nitems - 1]) | NV_FLAG_IN_ARRAY;
+ nvlist_set_flags(value[nitems - 1], flags);
+
+ nvp = nvpair_allocv(name, NV_TYPE_NVLIST_ARRAY,
+ (uint64_t)(uintptr_t)value, 0, nitems);
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (ii = 0; ii < nitems; ii++) {
+ if (value[ii] != NULL &&
+ nvlist_get_pararr(value[ii], NULL) != NULL) {
+ nvlist_destroy(value[ii]);
+ }
+ }
+ nv_free(value);
+ ERRNO_RESTORE();
+ } else {
+ for (ii = 0; ii < nitems; ii++)
+ nvlist_set_parent(value[ii], nvp);
+ }
+
+ return (nvp);
+}
+
+#ifndef _KERNEL
+nvpair_t *
+nvpair_move_descriptor_array(const char *name, int *value, size_t nitems)
+{
+ nvpair_t *nvp;
+ size_t i;
+
+ nvp = NULL;
+ if (value == NULL || nitems == 0) {
+ ERRNO_SET(EINVAL);
+ return (NULL);
+ }
+
+ for (i = 0; i < nitems; i++) {
+ if (value[i] != -1 && !fd_is_valid(value[i])) {
+ ERRNO_SET(EBADF);
+ goto fail;
+ }
+ }
+
+ nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR_ARRAY,
+ (uint64_t)(uintptr_t)value, sizeof(value[0]) * nitems, nitems);
+
+fail:
+ if (nvp == NULL) {
+ ERRNO_SAVE();
+ for (i = 0; i < nitems; i++) {
+ if (fd_is_valid(value[i]))
+ close(value[i]);
+ }
+ nv_free(value);
+ ERRNO_RESTORE();
+ }
+
+ return (nvp);
+}
+#endif
+
bool
nvpair_get_bool(const nvpair_t *nvp)
{
@@ -1046,12 +1830,81 @@ nvpair_get_binary(const nvpair_t *nvp, size_t *sizep)
if (sizep != NULL)
*sizep = nvp->nvp_datasize;
+
return ((const void *)(intptr_t)nvp->nvp_data);
}
+const bool *
+nvpair_get_bool_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const bool *)(intptr_t)nvp->nvp_data);
+}
+
+const uint64_t *
+nvpair_get_number_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const uint64_t *)(intptr_t)nvp->nvp_data);
+}
+
+const char * const *
+nvpair_get_string_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const char * const *)(intptr_t)nvp->nvp_data);
+}
+
+const nvlist_t * const *
+nvpair_get_nvlist_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const nvlist_t * const *)((intptr_t)nvp->nvp_data));
+}
+
+#ifndef _KERNEL
+const int *
+nvpair_get_descriptor_array(const nvpair_t *nvp, size_t *nitems)
+{
+
+ NVPAIR_ASSERT(nvp);
+ PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR_ARRAY);
+
+ if (nitems != NULL)
+ *nitems = nvp->nvp_nitems;
+
+ return ((const int *)(intptr_t)nvp->nvp_data);
+}
+#endif
+
void
nvpair_free(nvpair_t *nvp)
{
+ size_t i;
NVPAIR_ASSERT(nvp);
PJDLOG_ASSERT(nvp->nvp_list == NULL);
@@ -1062,6 +1915,10 @@ nvpair_free(nvpair_t *nvp)
case NV_TYPE_DESCRIPTOR:
close((int)nvp->nvp_data);
break;
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ for (i = 0; i < nvp->nvp_nitems; i++)
+ close(((int *)(intptr_t)nvp->nvp_data)[i]);
+ break;
#endif
case NV_TYPE_NVLIST:
nvlist_destroy((nvlist_t *)(intptr_t)nvp->nvp_data);
@@ -1072,6 +1929,23 @@ nvpair_free(nvpair_t *nvp)
case NV_TYPE_BINARY:
nv_free((void *)(intptr_t)nvp->nvp_data);
break;
+ case NV_TYPE_NVLIST_ARRAY:
+ for (i = 0; i < nvp->nvp_nitems; i++) {
+ nvlist_destroy(
+ ((nvlist_t **)(intptr_t)nvp->nvp_data)[i]);
+ }
+ nv_free(((nvlist_t **)(intptr_t)nvp->nvp_data));
+ break;
+ case NV_TYPE_NUMBER_ARRAY:
+ nv_free((uint64_t *)(intptr_t)nvp->nvp_data);
+ break;
+ case NV_TYPE_BOOL_ARRAY:
+ nv_free((bool *)(intptr_t)nvp->nvp_data);
+ break;
+ case NV_TYPE_STRING_ARRAY:
+ for (i = 0; i < nvp->nvp_nitems; i++)
+ nv_free(((char **)(intptr_t)nvp->nvp_data)[i]);
+ break;
}
nv_free(nvp);
}
@@ -1106,6 +1980,16 @@ nvpair_type_string(int type)
return ("DESCRIPTOR");
case NV_TYPE_BINARY:
return ("BINARY");
+ case NV_TYPE_BOOL_ARRAY:
+ return ("BOOL ARRAY");
+ case NV_TYPE_NUMBER_ARRAY:
+ return ("NUMBER ARRAY");
+ case NV_TYPE_STRING_ARRAY:
+ return ("STRING ARRAY");
+ case NV_TYPE_NVLIST_ARRAY:
+ return ("NVLIST ARRAY");
+ case NV_TYPE_DESCRIPTOR_ARRAY:
+ return ("DESCRIPTOR ARRAY");
default:
return ("<UNKNOWN>");
}
diff --git a/sys/contrib/libnv/nvpair_impl.h b/sys/contrib/libnv/nvpair_impl.h
index fed7725..0350b1c 100644
--- a/sys/contrib/libnv/nvpair_impl.h
+++ b/sys/contrib/libnv/nvpair_impl.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -71,6 +72,15 @@ unsigned char *nvpair_pack_descriptor(const nvpair_t *nvp, unsigned char *ptr,
unsigned char *nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr,
size_t *leftp);
unsigned char *nvpair_pack_nvlist_up(unsigned char *ptr, size_t *leftp);
+unsigned char *nvpair_pack_bool_array(const nvpair_t *nvp, unsigned char *ptr,
+ size_t *leftp);
+unsigned char *nvpair_pack_number_array(const nvpair_t *nvp, unsigned char *ptr,
+ size_t *leftp);
+unsigned char *nvpair_pack_string_array(const nvpair_t *nvp, unsigned char *ptr,
+ size_t *leftp);
+unsigned char *nvpair_pack_descriptor_array(const nvpair_t *nvp,
+ unsigned char *ptr, int64_t *fdidxp, size_t *leftp);
+unsigned char *nvpair_pack_nvlist_array_next(unsigned char *ptr, size_t *leftp);
/* Unpack data functions. */
const unsigned char *nvpair_unpack_header(bool isbe, nvpair_t *nvp,
@@ -89,5 +99,15 @@ const unsigned char *nvpair_unpack_descriptor(bool isbe, nvpair_t *nvp,
const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds);
const unsigned char *nvpair_unpack_binary(bool isbe, nvpair_t *nvp,
const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_bool_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_number_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_string_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp);
+const unsigned char *nvpair_unpack_descriptor_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, const int *fds, size_t nfds);
+const unsigned char *nvpair_unpack_nvlist_array(bool isbe, nvpair_t *nvp,
+ const unsigned char *ptr, size_t *leftp, nvlist_t **firstel);
#endif /* !_NVPAIR_IMPL_H_ */
diff --git a/sys/dev/ata/ata-all.c b/sys/dev/ata/ata-all.c
index 52db44d..118e38e 100644
--- a/sys/dev/ata/ata-all.c
+++ b/sys/dev/ata/ata-all.c
@@ -64,18 +64,15 @@ static void ata_cam_end_transaction(device_t dev, struct ata_request *request);
static void ata_cam_request_sense(device_t dev, struct ata_request *request);
static int ata_check_ids(device_t dev, union ccb *ccb);
static void ata_conn_event(void *context, int dummy);
-static void ata_init(void);
static void ata_interrupt_locked(void *data);
static int ata_module_event_handler(module_t mod, int what, void *arg);
static void ata_periodic_poll(void *data);
static int ata_str2mode(const char *str);
-static void ata_uninit(void);
/* global vars */
MALLOC_DEFINE(M_ATA, "ata_generic", "ATA driver generic layer");
int (*ata_raid_ioctl_func)(u_long cmd, caddr_t data) = NULL;
devclass_t ata_devclass;
-uma_zone_t ata_request_zone;
int ata_dma_check_80pin = 1;
/* sysctl vars */
@@ -650,12 +647,7 @@ ata_cam_begin_transaction(device_t dev, union ccb *ccb)
struct ata_channel *ch = device_get_softc(dev);
struct ata_request *request;
- if (!(request = ata_alloc_request())) {
- device_printf(dev, "FAILURE - out of memory in start\n");
- ccb->ccb_h.status = CAM_REQ_INVALID;
- xpt_done(ccb);
- return;
- }
+ request = &ch->request;
bzero(request, sizeof(*request));
/* setup request */
@@ -794,7 +786,6 @@ ata_cam_process_sense(device_t dev, struct ata_request *request)
ccb->ccb_h.status |= CAM_AUTOSENSE_FAIL;
}
- ata_free_request(request);
xpt_done(ccb);
/* Do error recovery if needed. */
if (fatalerr)
@@ -865,10 +856,8 @@ ata_cam_end_transaction(device_t dev, struct ata_request *request)
if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR &&
(ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)
ata_cam_request_sense(dev, request);
- else {
- ata_free_request(request);
+ else
xpt_done(ccb);
- }
/* Do error recovery if needed. */
if (fatalerr)
ata_reinit(dev);
@@ -1148,18 +1137,3 @@ static moduledata_t ata_moduledata = { "ata", ata_module_event_handler, NULL };
DECLARE_MODULE(ata, ata_moduledata, SI_SUB_CONFIGURE, SI_ORDER_SECOND);
MODULE_VERSION(ata, 1);
MODULE_DEPEND(ata, cam, 1, 1, 1);
-
-static void
-ata_init(void)
-{
- ata_request_zone = uma_zcreate("ata_request", sizeof(struct ata_request),
- NULL, NULL, NULL, NULL, 0, 0);
-}
-SYSINIT(ata_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_init, NULL);
-
-static void
-ata_uninit(void)
-{
- uma_zdestroy(ata_request_zone);
-}
-SYSUNINIT(ata_unregister, SI_SUB_DRIVERS, SI_ORDER_SECOND, ata_uninit, NULL);
diff --git a/sys/dev/ata/ata-all.h b/sys/dev/ata/ata-all.h
index 19cb7ef..cf8ed78 100644
--- a/sys/dev/ata/ata-all.h
+++ b/sys/dev/ata/ata-all.h
@@ -450,6 +450,7 @@ struct ata_channel {
struct ata_cam_device curr[16]; /* Current settings */
int requestsense; /* CCB waiting for SENSE. */
struct callout poll_callout; /* Periodic status poll. */
+ struct ata_request request;
};
/* disk bay/enclosure related */
@@ -507,14 +508,6 @@ int ata_sata_getrev(device_t dev, int target);
int ata_request2fis_h2d(struct ata_request *request, u_int8_t *fis);
void ata_pm_identify(device_t dev);
-/* macros for alloc/free of struct ata_request */
-extern uma_zone_t ata_request_zone;
-#define ata_alloc_request() uma_zalloc(ata_request_zone, M_NOWAIT | M_ZERO)
-#define ata_free_request(request) { \
- if (!(request->flags & ATA_R_DANGER2)) \
- uma_zfree(ata_request_zone, request); \
- }
-
MALLOC_DECLARE(M_ATA);
/* misc newbus defines */
diff --git a/sys/dev/ath/if_ath.c b/sys/dev/ath/if_ath.c
index 2c935a2b..26bf591 100644
--- a/sys/dev/ath/if_ath.c
+++ b/sys/dev/ath/if_ath.c
@@ -1473,7 +1473,7 @@ ath_vap_create(struct ieee80211com *ic, const char name[IFNAMSIZ], int unit,
const uint8_t bssid[IEEE80211_ADDR_LEN],
const uint8_t mac0[IEEE80211_ADDR_LEN])
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_vap *avp;
struct ieee80211vap *vap;
uint8_t mac[IEEE80211_ADDR_LEN];
@@ -1732,7 +1732,7 @@ ath_vap_delete(struct ieee80211vap *vap)
{
struct ieee80211com *ic = vap->iv_ic;
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
struct ath_vap *avp = ATH_VAP(vap);
@@ -2340,7 +2340,7 @@ ath_fatal_proc(void *arg, int pending)
static void
ath_bmiss_vap(struct ieee80211vap *vap)
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
/*
* Workaround phantom bmiss interrupts by sanity-checking
@@ -2361,8 +2361,6 @@ ath_bmiss_vap(struct ieee80211vap *vap)
ATH_UNLOCK(sc);
if ((vap->iv_flags_ext & IEEE80211_FEXT_SWBMISS) == 0) {
- struct ifnet *ifp = vap->iv_ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
u_int64_t lastrx = sc->sc_lastrx;
u_int64_t tsf = ath_hal_gettsf64(sc->sc_ah);
/* XXX should take a locked ref to iv_bss */
@@ -2851,8 +2849,8 @@ ath_stop(struct ifnet *ifp)
int
ath_reset(struct ifnet *ifp, ATH_RESET_TYPE reset_type)
{
- struct ath_softc *sc = ifp->if_softc;
struct ieee80211com *ic = ifp->if_l2com;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
HAL_STATUS status;
int i;
@@ -3045,7 +3043,7 @@ ath_reset_vap(struct ieee80211vap *vap, u_long cmd)
{
struct ieee80211com *ic = vap->iv_ic;
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
switch (cmd) {
@@ -3248,7 +3246,7 @@ static int
ath_transmit(struct ifnet *ifp, struct mbuf *m)
{
struct ieee80211com *ic = ifp->if_l2com;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ieee80211_node *ni;
struct mbuf *next;
struct ath_buf *bf;
@@ -3538,8 +3536,7 @@ ath_media_change(struct ifnet *ifp)
static void
ath_key_update_begin(struct ieee80211vap *vap)
{
- struct ifnet *ifp = vap->iv_ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_KEYCACHE, "%s:\n", __func__);
taskqueue_block(sc->sc_tq);
@@ -3548,8 +3545,7 @@ ath_key_update_begin(struct ieee80211vap *vap)
static void
ath_key_update_end(struct ieee80211vap *vap)
{
- struct ifnet *ifp = vap->iv_ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_KEYCACHE, "%s:\n", __func__);
taskqueue_unblock(sc->sc_tq);
@@ -4156,7 +4152,7 @@ static struct ieee80211_node *
ath_node_alloc(struct ieee80211vap *vap, const uint8_t mac[IEEE80211_ADDR_LEN])
{
struct ieee80211com *ic = vap->iv_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
const size_t space = sizeof(struct ath_node) + sc->sc_rc->arc_space;
struct ath_node *an;
@@ -4183,7 +4179,7 @@ static void
ath_node_cleanup(struct ieee80211_node *ni)
{
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_NODE, "%s: %6D: an %p\n", __func__,
ni->ni_macaddr, ":", ATH_NODE(ni));
@@ -4198,7 +4194,7 @@ static void
ath_node_free(struct ieee80211_node *ni)
{
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_NODE, "%s: %6D: an %p\n", __func__,
ni->ni_macaddr, ":", ATH_NODE(ni));
@@ -4210,7 +4206,7 @@ static void
ath_node_getsignal(const struct ieee80211_node *ni, int8_t *rssi, int8_t *noise)
{
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
*rssi = ic->ic_node_getrssi(ni);
@@ -4422,7 +4418,7 @@ ath_txq_update(struct ath_softc *sc, int ac)
int
ath_wme_update(struct ieee80211com *ic)
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
return !ath_txq_update(sc, WME_AC_BE) ||
!ath_txq_update(sc, WME_AC_BK) ||
@@ -5797,7 +5793,7 @@ static void
ath_scan_start(struct ieee80211com *ic)
{
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
u_int32_t rfilt;
@@ -5821,8 +5817,7 @@ ath_scan_start(struct ieee80211com *ic)
static void
ath_scan_end(struct ieee80211com *ic)
{
- struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
u_int32_t rfilt;
@@ -5862,8 +5857,7 @@ ath_scan_end(struct ieee80211com *ic)
static void
ath_update_chw(struct ieee80211com *ic)
{
- struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
DPRINTF(sc, ATH_DEBUG_STATE, "%s: called\n", __func__);
ath_set_channel(ic);
@@ -5873,8 +5867,7 @@ ath_update_chw(struct ieee80211com *ic)
static void
ath_set_channel(struct ieee80211com *ic)
{
- struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
ATH_LOCK(sc);
ath_power_set_power_state(sc, HAL_PM_AWAKE);
@@ -5916,7 +5909,7 @@ static int
ath_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
{
struct ieee80211com *ic = vap->iv_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_vap *avp = ATH_VAP(vap);
struct ath_hal *ah = sc->sc_ah;
struct ieee80211_node *ni = NULL;
@@ -6252,7 +6245,7 @@ static void
ath_setup_stationkey(struct ieee80211_node *ni)
{
struct ieee80211vap *vap = ni->ni_vap;
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
ieee80211_keyix keyix, rxkeyix;
/* XXX should take a locked ref to vap->iv_bss */
@@ -6285,7 +6278,7 @@ ath_newassoc(struct ieee80211_node *ni, int isnew)
{
struct ath_node *an = ATH_NODE(ni);
struct ieee80211vap *vap = ni->ni_vap;
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
const struct ieee80211_txparam *tp = ni->ni_txparms;
an->an_mcastrix = ath_tx_findrix(sc, tp->mcastrate);
@@ -6337,7 +6330,7 @@ static int
ath_setregdomain(struct ieee80211com *ic, struct ieee80211_regdomain *reg,
int nchans, struct ieee80211_channel chans[])
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
HAL_STATUS status;
@@ -6361,7 +6354,7 @@ static void
ath_getradiocaps(struct ieee80211com *ic,
int maxchans, int *nchans, struct ieee80211_channel chans[])
{
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
DPRINTF(sc, ATH_DEBUG_REGDOMAIN, "%s: use rd %u cc %d\n",
@@ -6693,8 +6686,8 @@ ath_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
#define IS_RUNNING(ifp) \
((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))
- struct ath_softc *sc = ifp->if_softc;
struct ieee80211com *ic = ifp->if_l2com;
+ struct ath_softc *sc = ic->ic_softc;
struct ifreq *ifr = (struct ifreq *)data;
const HAL_RATE_TABLE *rt;
int error = 0;
@@ -6864,7 +6857,7 @@ ath_node_powersave(struct ieee80211_node *ni, int enable)
#ifdef ATH_SW_PSQ
struct ath_node *an = ATH_NODE(ni);
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_vap *avp = ATH_VAP(ni->ni_vap);
/* XXX and no TXQ locks should be held here */
@@ -6931,7 +6924,7 @@ ath_node_set_tim(struct ieee80211_node *ni, int enable)
{
#ifdef ATH_SW_PSQ
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_node *an = ATH_NODE(ni);
struct ath_vap *avp = ATH_VAP(ni->ni_vap);
int changed = 0;
@@ -7136,7 +7129,7 @@ ath_node_recv_pspoll(struct ieee80211_node *ni, struct mbuf *m)
struct ath_node *an;
struct ath_vap *avp;
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
int tid;
/* Just paranoia */
diff --git a/sys/dev/ath/if_ath_keycache.c b/sys/dev/ath/if_ath_keycache.c
index fe99f10..b8a77e8 100644
--- a/sys/dev/ath/if_ath_keycache.c
+++ b/sys/dev/ath/if_ath_keycache.c
@@ -425,7 +425,7 @@ int
ath_key_alloc(struct ieee80211vap *vap, struct ieee80211_key *k,
ieee80211_keyix *keyix, ieee80211_keyix *rxkeyix)
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
/*
* Group key allocation must be handled specially for
@@ -493,7 +493,7 @@ ath_key_alloc(struct ieee80211vap *vap, struct ieee80211_key *k,
int
ath_key_delete(struct ieee80211vap *vap, const struct ieee80211_key *k)
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
const struct ieee80211_cipher *cip = k->wk_cipher;
u_int keyix = k->wk_keyix;
@@ -538,7 +538,7 @@ int
ath_key_set(struct ieee80211vap *vap, const struct ieee80211_key *k,
const u_int8_t mac[IEEE80211_ADDR_LEN])
{
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
return ath_keyset(sc, vap, k, vap->iv_bss);
}
diff --git a/sys/dev/ath/if_ath_rx.c b/sys/dev/ath/if_ath_rx.c
index 2779b7a..e391dd7 100644
--- a/sys/dev/ath/if_ath_rx.c
+++ b/sys/dev/ath/if_ath_rx.c
@@ -330,7 +330,7 @@ ath_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m,
int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf)
{
struct ieee80211vap *vap = ni->ni_vap;
- struct ath_softc *sc = vap->iv_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = vap->iv_ic->ic_softc;
uint64_t tsf_beacon_old, tsf_beacon;
uint64_t nexttbtt;
int64_t tsf_delta;
diff --git a/sys/dev/ath/if_ath_tdma.c b/sys/dev/ath/if_ath_tdma.c
index fd23db1..d4c9ccd 100644
--- a/sys/dev/ath/if_ath_tdma.c
+++ b/sys/dev/ath/if_ath_tdma.c
@@ -359,7 +359,7 @@ ath_tdma_update(struct ieee80211_node *ni,
#define TU_TO_TSF(_tu) (((u_int64_t)(_tu)) << 10)
struct ieee80211vap *vap = ni->ni_vap;
struct ieee80211com *ic = ni->ni_ic;
- struct ath_softc *sc = ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_hal *ah = sc->sc_ah;
const HAL_RATE_TABLE *rt = sc->sc_currates;
u_int64_t tsf, rstamp, nextslot, nexttbtt, nexttbtt_full;
diff --git a/sys/dev/ath/if_ath_tx.c b/sys/dev/ath/if_ath_tx.c
index c15b158..916d4cb 100644
--- a/sys/dev/ath/if_ath_tx.c
+++ b/sys/dev/ath/if_ath_tx.c
@@ -2341,7 +2341,7 @@ ath_raw_xmit(struct ieee80211_node *ni, struct mbuf *m,
{
struct ieee80211com *ic = ni->ni_ic;
struct ifnet *ifp = ic->ic_ifp;
- struct ath_softc *sc = ifp->if_softc;
+ struct ath_softc *sc = ic->ic_softc;
struct ath_buf *bf;
struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
int error = 0;
@@ -5731,7 +5731,7 @@ int
ath_addba_request(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
int dialogtoken, int baparamset, int batimeout)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -5809,7 +5809,7 @@ int
ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
int status, int code, int batimeout)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -5856,7 +5856,7 @@ ath_addba_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
void
ath_addba_stop(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -5991,7 +5991,7 @@ void
ath_bar_response(struct ieee80211_node *ni, struct ieee80211_tx_ampdu *tap,
int status)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
@@ -6064,7 +6064,7 @@ void
ath_addba_response_timeout(struct ieee80211_node *ni,
struct ieee80211_tx_ampdu *tap)
{
- struct ath_softc *sc = ni->ni_ic->ic_ifp->if_softc;
+ struct ath_softc *sc = ni->ni_ic->ic_softc;
int tid = tap->txa_tid;
struct ath_node *an = ATH_NODE(ni);
struct ath_tid *atid = &an->an_tid[tid];
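The if_ath hunks above all apply a single conversion: softc lookups that detoured through the ifnet (ic->ic_ifp->if_softc, or vap->iv_ic->ic_ifp->if_softc) now use the ieee80211com back-pointer directly. A minimal sketch of the pattern, using a hypothetical callback name that is not part of the driver:

static void
ath_example_cb(struct ieee80211vap *vap)	/* hypothetical, for illustration */
{
	struct ieee80211com *ic = vap->iv_ic;
	struct ath_softc *sc = ic->ic_softc;	/* was: ic->ic_ifp->if_softc */

	ATH_LOCK(sc);
	/* ... per-callback driver work ... */
	ATH_UNLOCK(sc);
}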
diff --git a/sys/dev/bxe/ecore_hsi.h b/sys/dev/bxe/ecore_hsi.h
index 005bb2e..f78f4ea 100644
--- a/sys/dev/bxe/ecore_hsi.h
+++ b/sys/dev/bxe/ecore_hsi.h
@@ -2536,9 +2536,9 @@ struct shmem2_region {
#define SHMEM_EEE_SUPPORTED_MASK 0x000f0000
#define SHMEM_EEE_SUPPORTED_SHIFT 16
#define SHMEM_EEE_ADV_STATUS_MASK 0x00f00000
- #define SHMEM_EEE_100M_ADV (1<<0)
- #define SHMEM_EEE_1G_ADV (1<<1)
- #define SHMEM_EEE_10G_ADV (1<<2)
+ #define SHMEM_EEE_100M_ADV (1U<<0)
+ #define SHMEM_EEE_1G_ADV (1U<<1)
+ #define SHMEM_EEE_10G_ADV (1U<<2)
#define SHMEM_EEE_ADV_STATUS_SHIFT 20
#define SHMEM_EEE_LP_ADV_STATUS_MASK 0x0f000000
#define SHMEM_EEE_LP_ADV_STATUS_SHIFT 24
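The SHMEM_EEE_* change above is small but worth spelling out: writing flag macros as 1U<<n keeps the shift in unsigned arithmetic, so building and OR-ing masks never depends on signed-shift behaviour. An illustrative (hypothetical) pair:

#define EXAMPLE_FLAG_LOW	(1U << 0)	/* unsigned shift: well-defined */
#define EXAMPLE_FLAG_HIGH	(1U << 31)	/* legal only because of the U */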
diff --git a/sys/dev/e1000/e1000_80003es2lan.c b/sys/dev/e1000/e1000_80003es2lan.c
index 076e02b..b948bb4 100644
--- a/sys/dev/e1000/e1000_80003es2lan.c
+++ b/sys/dev/e1000/e1000_80003es2lan.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_80003es2lan.h b/sys/dev/e1000/e1000_80003es2lan.h
index 3807e46..89b1551 100644
--- a/sys/dev/e1000/e1000_80003es2lan.h
+++ b/sys/dev/e1000/e1000_80003es2lan.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82540.c b/sys/dev/e1000/e1000_82540.c
index 141b92e..68f92c6 100644
--- a/sys/dev/e1000/e1000_82540.c
+++ b/sys/dev/e1000/e1000_82540.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82541.c b/sys/dev/e1000/e1000_82541.c
index 781aa93..69fcee4 100644
--- a/sys/dev/e1000/e1000_82541.c
+++ b/sys/dev/e1000/e1000_82541.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82541.h b/sys/dev/e1000/e1000_82541.h
index 3b6b961..1eebfad 100644
--- a/sys/dev/e1000/e1000_82541.h
+++ b/sys/dev/e1000/e1000_82541.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2008, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82542.c b/sys/dev/e1000/e1000_82542.c
index 19d5402..a6b3616 100644
--- a/sys/dev/e1000/e1000_82542.c
+++ b/sys/dev/e1000/e1000_82542.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82543.c b/sys/dev/e1000/e1000_82543.c
index 1c01658..3350f17 100644
--- a/sys/dev/e1000/e1000_82543.c
+++ b/sys/dev/e1000/e1000_82543.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82543.h b/sys/dev/e1000/e1000_82543.h
index 60e5c15..0fa813b 100644
--- a/sys/dev/e1000/e1000_82543.h
+++ b/sys/dev/e1000/e1000_82543.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2008, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82571.c b/sys/dev/e1000/e1000_82571.c
index e209d43..a64ef56 100644
--- a/sys/dev/e1000/e1000_82571.c
+++ b/sys/dev/e1000/e1000_82571.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82571.h b/sys/dev/e1000/e1000_82571.h
index c76f16f..cda87a2 100644
--- a/sys/dev/e1000/e1000_82571.h
+++ b/sys/dev/e1000/e1000_82571.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82575.c b/sys/dev/e1000/e1000_82575.c
index d79db67..8981ae3 100644
--- a/sys/dev/e1000/e1000_82575.c
+++ b/sys/dev/e1000/e1000_82575.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_82575.h b/sys/dev/e1000/e1000_82575.h
index 6569b98..503fdce 100644
--- a/sys/dev/e1000/e1000_82575.h
+++ b/sys/dev/e1000/e1000_82575.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_api.c b/sys/dev/e1000/e1000_api.c
index 374ffa6..5db22db 100644
--- a/sys/dev/e1000/e1000_api.c
+++ b/sys/dev/e1000/e1000_api.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_api.h b/sys/dev/e1000/e1000_api.h
index a2ffa16..e87acc8 100644
--- a/sys/dev/e1000/e1000_api.h
+++ b/sys/dev/e1000/e1000_api.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_defines.h b/sys/dev/e1000/e1000_defines.h
index 5deada2..9472ca4 100644
--- a/sys/dev/e1000/e1000_defines.h
+++ b/sys/dev/e1000/e1000_defines.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_hw.h b/sys/dev/e1000/e1000_hw.h
index faf64a3..3ec921e 100644
--- a/sys/dev/e1000/e1000_hw.h
+++ b/sys/dev/e1000/e1000_hw.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_i210.c b/sys/dev/e1000/e1000_i210.c
index f12c13f..563f11a 100644
--- a/sys/dev/e1000/e1000_i210.c
+++ b/sys/dev/e1000/e1000_i210.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_i210.h b/sys/dev/e1000/e1000_i210.h
index 2a20ca1..f940915 100644
--- a/sys/dev/e1000/e1000_i210.h
+++ b/sys/dev/e1000/e1000_i210.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_ich8lan.c b/sys/dev/e1000/e1000_ich8lan.c
index 204c39c..23e7b95 100644
--- a/sys/dev/e1000/e1000_ich8lan.c
+++ b/sys/dev/e1000/e1000_ich8lan.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_ich8lan.h b/sys/dev/e1000/e1000_ich8lan.h
index f045ebd..9cb79c0 100644
--- a/sys/dev/e1000/e1000_ich8lan.h
+++ b/sys/dev/e1000/e1000_ich8lan.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mac.c b/sys/dev/e1000/e1000_mac.c
index b888b34..1c86307 100644
--- a/sys/dev/e1000/e1000_mac.c
+++ b/sys/dev/e1000/e1000_mac.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mac.h b/sys/dev/e1000/e1000_mac.h
index 2c1bfe3..1daed9b 100644
--- a/sys/dev/e1000/e1000_mac.h
+++ b/sys/dev/e1000/e1000_mac.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_manage.c b/sys/dev/e1000/e1000_manage.c
index 8087e65..f319c8b 100644
--- a/sys/dev/e1000/e1000_manage.c
+++ b/sys/dev/e1000/e1000_manage.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_manage.h b/sys/dev/e1000/e1000_manage.h
index 51f17671..303e99e 100644
--- a/sys/dev/e1000/e1000_manage.h
+++ b/sys/dev/e1000/e1000_manage.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2012, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mbx.c b/sys/dev/e1000/e1000_mbx.c
index 55477b2..d9fb9ac 100644
--- a/sys/dev/e1000/e1000_mbx.c
+++ b/sys/dev/e1000/e1000_mbx.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_mbx.h b/sys/dev/e1000/e1000_mbx.h
index d2aea5c4..fadd849 100644
--- a/sys/dev/e1000/e1000_mbx.h
+++ b/sys/dev/e1000/e1000_mbx.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_nvm.c b/sys/dev/e1000/e1000_nvm.c
index f702f71..0a1a18d 100644
--- a/sys/dev/e1000/e1000_nvm.c
+++ b/sys/dev/e1000/e1000_nvm.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_nvm.h b/sys/dev/e1000/e1000_nvm.h
index 34077b2..31f2180 100644
--- a/sys/dev/e1000/e1000_nvm.h
+++ b/sys/dev/e1000/e1000_nvm.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_osdep.c b/sys/dev/e1000/e1000_osdep.c
index 75a7b79..2987cda 100644
--- a/sys/dev/e1000/e1000_osdep.c
+++ b/sys/dev/e1000/e1000_osdep.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_osdep.h b/sys/dev/e1000/e1000_osdep.h
index 1324110..fc46f48 100644
--- a/sys/dev/e1000/e1000_osdep.h
+++ b/sys/dev/e1000/e1000_osdep.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_phy.c b/sys/dev/e1000/e1000_phy.c
index f27889c..adb6732 100644
--- a/sys/dev/e1000/e1000_phy.c
+++ b/sys/dev/e1000/e1000_phy.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_phy.h b/sys/dev/e1000/e1000_phy.h
index 0e5b2e6..d3d563f 100644
--- a/sys/dev/e1000/e1000_phy.h
+++ b/sys/dev/e1000/e1000_phy.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_regs.h b/sys/dev/e1000/e1000_regs.h
index 952a7dc..da93d75 100644
--- a/sys/dev/e1000/e1000_regs.h
+++ b/sys/dev/e1000/e1000_regs.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_vf.c b/sys/dev/e1000/e1000_vf.c
index 2cabac9..4af985b 100644
--- a/sys/dev/e1000/e1000_vf.c
+++ b/sys/dev/e1000/e1000_vf.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/e1000_vf.h b/sys/dev/e1000/e1000_vf.h
index 2a780741..e6f834e 100644
--- a/sys/dev/e1000/e1000_vf.h
+++ b/sys/dev/e1000/e1000_vf.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2014, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index 830325b..e36a3d8 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -364,8 +364,14 @@ MODULE_DEPEND(em, netmap, 1, 1, 1);
#define CSUM_TSO 0
#endif
+#define TSO_WORKAROUND 4
+
static SYSCTL_NODE(_hw, OID_AUTO, em, CTLFLAG_RD, 0, "EM driver parameters");
+static int em_disable_crc_stripping = 0;
+SYSCTL_INT(_hw_em, OID_AUTO, disable_crc_stripping, CTLFLAG_RDTUN,
+ &em_disable_crc_stripping, 0, "Disable CRC Stripping");
+
static int em_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV);
static int em_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
SYSCTL_INT(_hw_em, OID_AUTO, tx_int_delay, CTLFLAG_RDTUN, &em_tx_int_delay_dflt,
@@ -1872,13 +1878,15 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp)
struct ether_header *eh;
struct ip *ip = NULL;
struct tcphdr *tp = NULL;
- u32 txd_upper = 0, txd_lower = 0, txd_used = 0;
+ u32 txd_upper = 0, txd_lower = 0;
int ip_off, poff;
int nsegs, i, j, first, last = 0;
- int error, do_tso, tso_desc = 0, remap = 1;
+ int error;
+ bool do_tso, tso_desc, remap = TRUE;
m_head = *m_headp;
- do_tso = ((m_head->m_pkthdr.csum_flags & CSUM_TSO) != 0);
+ do_tso = (m_head->m_pkthdr.csum_flags & CSUM_TSO);
+ tso_desc = FALSE;
ip_off = poff = 0;
/*
@@ -1914,74 +1922,82 @@ em_xmit(struct tx_ring *txr, struct mbuf **m_headp)
* for IPv6 yet.
*/
ip_off = sizeof(struct ether_header);
- m_head = m_pullup(m_head, ip_off);
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (m_head->m_len < ip_off) {
+ m_head = m_pullup(m_head, ip_off);
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
eh = mtod(m_head, struct ether_header *);
if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
ip_off = sizeof(struct ether_vlan_header);
- m_head = m_pullup(m_head, ip_off);
+ if (m_head->m_len < ip_off) {
+ m_head = m_pullup(m_head, ip_off);
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
+ }
+ }
+ if (m_head->m_len < ip_off + sizeof(struct ip)) {
+ m_head = m_pullup(m_head, ip_off + sizeof(struct ip));
if (m_head == NULL) {
*m_headp = NULL;
return (ENOBUFS);
}
}
- m_head = m_pullup(m_head, ip_off + sizeof(struct ip));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
- }
ip = (struct ip *)(mtod(m_head, char *) + ip_off);
poff = ip_off + (ip->ip_hl << 2);
- if (do_tso) {
- m_head = m_pullup(m_head, poff + sizeof(struct tcphdr));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+
+ if (do_tso || (m_head->m_pkthdr.csum_flags & CSUM_TCP)) {
+ if (m_head->m_len < poff + sizeof(struct tcphdr)) {
+ m_head = m_pullup(m_head, poff +
+ sizeof(struct tcphdr));
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
/*
* TSO workaround:
* pull 4 more bytes of data into it.
*/
- m_head = m_pullup(m_head, poff + (tp->th_off << 2) + 4);
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (m_head->m_len < poff + (tp->th_off << 2)) {
+ m_head = m_pullup(m_head, poff +
+ (tp->th_off << 2) +
+ TSO_WORKAROUND);
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
ip = (struct ip *)(mtod(m_head, char *) + ip_off);
- ip->ip_len = 0;
- ip->ip_sum = 0;
- /*
- * The pseudo TCP checksum does not include TCP payload
- * length so driver should recompute the checksum here
- * what hardware expect to see. This is adherence of
- * Microsoft's Large Send specification.
- */
tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
- tp->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr, htons(IPPROTO_TCP));
- } else if (m_head->m_pkthdr.csum_flags & CSUM_TCP) {
- m_head = m_pullup(m_head, poff + sizeof(struct tcphdr));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (do_tso) {
+ ip->ip_len = htons(m_head->m_pkthdr.tso_segsz +
+ (ip->ip_hl << 2) +
+ (tp->th_off << 2));
+ ip->ip_sum = 0;
+ /*
+			 * The pseudo TCP checksum does not include the
+			 * TCP payload length, so the driver must
+			 * recompute here the checksum that the hardware
+			 * expects to see, per Microsoft's Large Send
+			 * specification.
+ */
+ tp->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(IPPROTO_TCP));
}
- tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
- m_head = m_pullup(m_head, poff + (tp->th_off << 2));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
- }
- ip = (struct ip *)(mtod(m_head, char *) + ip_off);
- tp = (struct tcphdr *)(mtod(m_head, char *) + poff);
} else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) {
- m_head = m_pullup(m_head, poff + sizeof(struct udphdr));
- if (m_head == NULL) {
- *m_headp = NULL;
- return (ENOBUFS);
+ if (m_head->m_len < poff + sizeof(struct udphdr)) {
+ m_head = m_pullup(m_head, poff +
+ sizeof(struct udphdr));
+ if (m_head == NULL) {
+ *m_headp = NULL;
+ return (ENOBUFS);
+ }
}
ip = (struct ip *)(mtod(m_head, char *) + ip_off);
}
@@ -2027,7 +2043,7 @@ retry:
*m_headp = m;
/* Try it again, but only once */
- remap = 0;
+ remap = FALSE;
goto retry;
} else if (error != 0) {
adapter->no_tx_dma_setup++;
@@ -2042,13 +2058,13 @@ retry:
* it follows a TSO burst, then we need to add a
* sentinel descriptor to prevent premature writeback.
*/
- if ((do_tso == 0) && (txr->tx_tso == TRUE)) {
+ if ((!do_tso) && (txr->tx_tso == TRUE)) {
if (nsegs == 1)
tso_desc = TRUE;
txr->tx_tso = FALSE;
}
- if (nsegs > (txr->tx_avail - 2)) {
+ if (nsegs > (txr->tx_avail - EM_MAX_SCATTER)) {
txr->no_desc_avail++;
bus_dmamap_unload(txr->txtag, map);
return (ENOBUFS);
@@ -2088,23 +2104,23 @@ retry:
** If this is the last descriptor, we want to
** split it so we have a small final sentinel
*/
- if (tso_desc && (j == (nsegs -1)) && (seg_len > 8)) {
- seg_len -= 4;
+ if (tso_desc && (j == (nsegs - 1)) && (seg_len > 8)) {
+ seg_len -= TSO_WORKAROUND;
ctxd->buffer_addr = htole64(seg_addr);
ctxd->lower.data = htole32(
- adapter->txd_cmd | txd_lower | seg_len);
- ctxd->upper.data =
- htole32(txd_upper);
+ adapter->txd_cmd | txd_lower | seg_len);
+ ctxd->upper.data = htole32(txd_upper);
if (++i == adapter->num_tx_desc)
i = 0;
+
/* Now make the sentinel */
- ++txd_used; /* using an extra txd */
+ txr->tx_avail--;
ctxd = &txr->tx_base[i];
tx_buffer = &txr->tx_buffers[i];
ctxd->buffer_addr =
htole64(seg_addr + seg_len);
ctxd->lower.data = htole32(
- adapter->txd_cmd | txd_lower | 4);
+ adapter->txd_cmd | txd_lower | TSO_WORKAROUND);
ctxd->upper.data =
htole32(txd_upper);
last = i;
@@ -2114,8 +2130,7 @@ retry:
ctxd->buffer_addr = htole64(seg_addr);
ctxd->lower.data = htole32(
adapter->txd_cmd | txd_lower | seg_len);
- ctxd->upper.data =
- htole32(txd_upper);
+ ctxd->upper.data = htole32(txd_upper);
last = i;
if (++i == adapter->num_tx_desc)
i = 0;
@@ -2126,8 +2141,6 @@ retry:
txr->next_avail_desc = i;
txr->tx_avail -= nsegs;
- if (tso_desc) /* TSO used an extra for sentinel */
- txr->tx_avail -= txd_used;
tx_buffer->m_head = m_head;
/*
@@ -3030,6 +3043,11 @@ em_setup_interface(device_t dev, struct adapter *adapter)
if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
if_setioctlfn(ifp, em_ioctl);
if_setgetcounterfn(ifp, em_get_counter);
+ /* TSO parameters */
+ ifp->if_hw_tsomax = EM_TSO_SIZE;
+ ifp->if_hw_tsomaxsegcount = EM_MAX_SCATTER;
+ ifp->if_hw_tsomaxsegsize = EM_TSO_SEG_SIZE;
+
#ifdef EM_MULTIQUEUE
/* Multiqueue stack interface */
if_settransmitfn(ifp, em_mq_start);
@@ -4514,7 +4532,8 @@ em_initialize_receive_unit(struct adapter *adapter)
(hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
/* Strip the CRC */
- rctl |= E1000_RCTL_SECRC;
+ if (!em_disable_crc_stripping)
+ rctl |= E1000_RCTL_SECRC;
/* Make sure VLAN Filters are off */
rctl &= ~E1000_RCTL_VFE;
@@ -4888,8 +4907,8 @@ em_enable_intr(struct adapter *adapter)
u32 ims_mask = IMS_ENABLE_MASK;
if (hw->mac.type == e1000_82574) {
- E1000_WRITE_REG(hw, EM_EIAC, EM_MSIX_MASK);
- ims_mask |= EM_MSIX_MASK;
+ E1000_WRITE_REG(hw, EM_EIAC, adapter->ims);
+ ims_mask |= adapter->ims;
}
E1000_WRITE_REG(hw, E1000_IMS, ims_mask);
}
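Two techniques recur in the em_xmit() hunks above: m_pullup() is now guarded so already-contiguous headers are not churned, and in the TSO path the driver seeds th_sum with in_pseudo(ip_src, ip_dst, htons(IPPROTO_TCP)) after setting ip_len, since the hardware adds the per-segment payload length itself. A minimal sketch of the pull-up guard, with a hypothetical helper name:

static struct tcphdr *
example_tcp_hdr(struct mbuf **m_headp, int poff)	/* hypothetical helper */
{
	struct mbuf *m = *m_headp;

	/* Pull up only when the first mbuf is too short. */
	if (m->m_len < poff + sizeof(struct tcphdr)) {
		m = m_pullup(m, poff + sizeof(struct tcphdr));
		if (m == NULL) {
			*m_headp = NULL;	/* chain was freed by m_pullup */
			return (NULL);
		}
		*m_headp = m;
	}
	return ((struct tcphdr *)(mtod(m, char *) + poff));
}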
diff --git a/sys/dev/e1000/if_em.h b/sys/dev/e1000/if_em.h
index be18a6c..8725de3 100644
--- a/sys/dev/e1000/if_em.h
+++ b/sys/dev/e1000/if_em.h
@@ -266,7 +266,7 @@
#define HW_DEBUGOUT1(S, A) if (DEBUG_HW) printf(S "\n", A)
#define HW_DEBUGOUT2(S, A, B) if (DEBUG_HW) printf(S "\n", A, B)
-#define EM_MAX_SCATTER 32
+#define EM_MAX_SCATTER 64
#define EM_VFTA_SIZE 128
#define EM_TSO_SIZE (65535 + sizeof(struct ether_vlan_header))
#define EM_TSO_SEG_SIZE 4096 /* Max dma segment size */
diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 9eacc78..a3ea8d0 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/if_igb.h b/sys/dev/e1000/if_igb.h
index f2d0926..a4222e3 100644
--- a/sys/dev/e1000/if_igb.h
+++ b/sys/dev/e1000/if_igb.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2013, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c
index f34010e..7476be5 100644
--- a/sys/dev/e1000/if_lem.c
+++ b/sys/dev/e1000/if_lem.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2012, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -97,7 +97,7 @@
/*********************************************************************
* Legacy Em Driver version:
*********************************************************************/
-char lem_driver_version[] = "1.0.6";
+char lem_driver_version[] = "1.1.0";
/*********************************************************************
* PCI Device ID Table
@@ -2913,10 +2913,6 @@ lem_free_transmit_structures(struct adapter *adapter)
bus_dma_tag_destroy(adapter->txtag);
adapter->txtag = NULL;
}
-#if __FreeBSD_version >= 800000
- if (adapter->br != NULL)
- buf_ring_free(adapter->br, M_DEVBUF);
-#endif
}
/*********************************************************************
diff --git a/sys/dev/e1000/if_lem.h b/sys/dev/e1000/if_lem.h
index 41447d1..4c43bdd 100644
--- a/sys/dev/e1000/if_lem.h
+++ b/sys/dev/e1000/if_lem.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2011, Intel Corporation
+ Copyright (c) 2001-2015, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -296,9 +296,6 @@ struct em_int_delay_info {
/* Our adapter structure */
struct adapter {
if_t ifp;
-#if __FreeBSD_version >= 800000
- struct buf_ring *br;
-#endif
struct e1000_hw hw;
/* FreeBSD operating-system-specific structures. */
diff --git a/sys/dev/gpio/gpiobus.c b/sys/dev/gpio/gpiobus.c
index e741d28..6cafdaf 100644
--- a/sys/dev/gpio/gpiobus.c
+++ b/sys/dev/gpio/gpiobus.c
@@ -155,12 +155,16 @@ gpiobus_attach_bus(device_t dev)
int
gpiobus_detach_bus(device_t dev)
{
+ int err;
#ifdef FDT
ofw_gpiobus_unregister_provider(dev);
#endif
+ err = bus_generic_detach(dev);
+ if (err != 0)
+ return (err);
- return (bus_generic_detach(dev));
+ return (device_delete_children(dev));
}
int
@@ -338,11 +342,14 @@ gpiobus_detach(device_t dev)
if ((err = device_get_children(dev, &devlist, &ndevs)) != 0)
return (err);
for (i = 0; i < ndevs; i++) {
- device_delete_child(dev, devlist[i]);
devi = GPIOBUS_IVAR(devlist[i]);
gpiobus_free_ivars(devi);
+ resource_list_free(&devi->rl);
+ free(devi, M_DEVBUF);
+ device_delete_child(dev, devlist[i]);
}
free(devlist, M_TEMP);
+ rman_fini(&sc->sc_intr_rman);
if (sc->sc_pins) {
for (i = 0; i < sc->sc_npins; i++) {
if (sc->sc_pins[i].name != NULL)
@@ -442,7 +449,7 @@ gpiobus_add_child(device_t dev, u_int order, const char *name, int unit)
devi = malloc(sizeof(struct gpiobus_ivar), M_DEVBUF, M_NOWAIT | M_ZERO);
if (devi == NULL) {
device_delete_child(dev, child);
- return (0);
+ return (NULL);
}
resource_list_init(&devi->rl);
device_set_ivars(child, devi);
@@ -461,8 +468,11 @@ gpiobus_hinted_child(device_t bus, const char *dname, int dunit)
child = BUS_ADD_CHILD(bus, 0, dname, dunit);
devi = GPIOBUS_IVAR(child);
resource_int_value(dname, dunit, "pins", &pins);
- if (gpiobus_parse_pins(sc, child, pins))
+ if (gpiobus_parse_pins(sc, child, pins)) {
+ resource_list_free(&devi->rl);
+ free(devi, M_DEVBUF);
device_delete_child(bus, child);
+ }
if (resource_int_value(dname, dunit, "irq", &irq) == 0) {
if (bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1) != 0)
device_printf(bus,
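The gpiobus detach hunks above settle on one teardown order: release each child's ivars and resource list while the child is still attached, delete the child, and only then tear down bus-level state such as the interrupt rman. A hedged sketch with a hypothetical method name:

static int
example_bus_detach(device_t dev)
{
	struct gpiobus_ivar *devi;
	device_t *devlist;
	int error, i, ndevs;

	if ((error = device_get_children(dev, &devlist, &ndevs)) != 0)
		return (error);
	for (i = 0; i < ndevs; i++) {
		devi = GPIOBUS_IVAR(devlist[i]);
		gpiobus_free_ivars(devi);		/* per-child state first */
		resource_list_free(&devi->rl);
		free(devi, M_DEVBUF);
		device_delete_child(dev, devlist[i]);	/* child last */
	}
	free(devlist, M_TEMP);
	return (0);
}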
diff --git a/sys/dev/gpio/gpioled.c b/sys/dev/gpio/gpioled.c
index 01710c2..e699128 100644
--- a/sys/dev/gpio/gpioled.c
+++ b/sys/dev/gpio/gpioled.c
@@ -255,3 +255,4 @@ static driver_t gpioled_driver = {
};
DRIVER_MODULE(gpioled, gpiobus, gpioled_driver, gpioled_devclass, 0, 0);
+MODULE_DEPEND(gpioled, gpiobus, 1, 1, 1);
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
index a82d81d..c19f7fe 100644
--- a/sys/dev/md/md.c
+++ b/sys/dev/md/md.c
@@ -89,6 +89,7 @@
#include <sys/vnode.h>
#include <geom/geom.h>
+#include <geom/geom_int.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -121,9 +122,12 @@ SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
#define MD_ROOT_FSTYPE "ufs"
#endif
-#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
+#if defined(MD_ROOT)
/*
* Preloaded image gets put here.
+ */
+#if defined(MD_ROOT_SIZE)
+/*
* Applications that patch the object with the image can determine
* the size looking at the start and end markers (strings),
* so we want them contiguous.
@@ -135,6 +139,14 @@ static struct {
.start = "MFS Filesystem goes here",
.end = "MFS Filesystem had better STOP here",
};
+const int mfs_root_size = sizeof(mfs_root.start);
+#else
+extern volatile u_char __weak_symbol mfs_root;
+extern volatile u_char __weak_symbol mfs_root_end;
+__GLOBL(mfs_root);
+__GLOBL(mfs_root_end);
+#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
+#endif
#endif
static g_init_t g_md_init;
@@ -1552,6 +1564,9 @@ md_preloaded(u_char *image, size_t length, const char *name)
if (name != NULL) {
printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
MD_NAME, sc->unit, name, length, image);
+ } else {
+ printf("%s%d: Embedded image %zd bytes at %p\n",
+ MD_NAME, sc->unit, length, image);
}
}
@@ -1571,10 +1586,13 @@ g_md_init(struct g_class *mp __unused)
sx_init(&md_sx, "MD config lock");
g_topology_unlock();
md_uh = new_unrhdr(0, INT_MAX, NULL);
-#ifdef MD_ROOT_SIZE
- sx_xlock(&md_sx);
- md_preloaded(mfs_root.start, sizeof(mfs_root.start), NULL);
- sx_xunlock(&md_sx);
+#ifdef MD_ROOT
+ if (mfs_root_size != 0) {
+ sx_xlock(&md_sx);
+ md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
+ NULL);
+ sx_xunlock(&md_sx);
+ }
#endif
/* XXX: are preload_* static or do they need Giant ? */
while ((mod = preload_search_next_name(mod)) != NULL) {
@@ -1660,9 +1678,11 @@ g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
"read-only");
sbuf_printf(sb, "%s<type>%s</type>\n", indent,
type);
- if (mp->type == MD_VNODE && mp->vnode != NULL)
- sbuf_printf(sb, "%s<file>%s</file>\n",
- indent, mp->file);
+ if (mp->type == MD_VNODE && mp->vnode != NULL) {
+ sbuf_printf(sb, "%s<file>", indent);
+ g_conf_printf_escaped(sb, "%s", mp->file);
+ sbuf_printf(sb, "</file>\n");
+ }
}
}
}
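The md.c hunks above replace the compile-time MD_ROOT_SIZE requirement with a link-time one: two weak symbols bracket an optionally embedded image, and when nothing is linked in they resolve to the same address, so the computed size is zero and the preload is skipped at runtime rather than compiled out. A sketch under hypothetical symbol names:

/* Both resolve to the same address when no image is embedded. */
extern volatile u_char __weak_symbol example_img;
extern volatile u_char __weak_symbol example_img_end;
__GLOBL(example_img);
__GLOBL(example_img_end);

static void
example_maybe_preload(void)
{
	size_t size = (uintptr_t)(&example_img_end - &example_img);

	if (size != 0)
		printf("embedded image: %zu bytes at %p\n",
		    size, __DEVOLATILE(u_char *, &example_img));
}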
diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index 2aafba4..0b03931 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <dev/random/fortuna.h>
#else /* !_KERNEL */
#include <inttypes.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -124,9 +125,7 @@ static uint8_t zero_region[RANDOM_ZERO_BLOCKSIZE];
static void random_fortuna_pre_read(void);
static void random_fortuna_read(uint8_t *, u_int);
-static void random_fortuna_write(uint8_t *, u_int);
-static void random_fortuna_reseed(void);
-static int random_fortuna_seeded(void);
+static bool random_fortuna_seeded(void);
static void random_fortuna_process_event(struct harvest_event *);
static void random_fortuna_init_alg(void *);
static void random_fortuna_deinit_alg(void *);
@@ -139,8 +138,6 @@ struct random_algorithm random_alg_context = {
.ra_deinit_alg = random_fortuna_deinit_alg,
.ra_pre_read = random_fortuna_pre_read,
.ra_read = random_fortuna_read,
- .ra_write = random_fortuna_write,
- .ra_reseed = random_fortuna_reseed,
.ra_seeded = random_fortuna_seeded,
.ra_event_processor = random_fortuna_process_event,
.ra_poolcount = RANDOM_FORTUNA_NPOOLS,
@@ -420,43 +417,7 @@ random_fortuna_read(uint8_t *buf, u_int bytecount)
RANDOM_RESEED_UNLOCK();
}
-/* Internal function to hand external entropy to the PRNG. */
-void
-random_fortuna_write(uint8_t *buf, u_int count)
-{
- static u_int destination = 0;
- struct harvest_event event;
- struct randomdev_hash hash;
- uint32_t entropy_data[RANDOM_KEYSIZE_WORDS], timestamp;
- int i;
-
- /* Extra timing here is helpful to scrape scheduler timing entropy */
- randomdev_hash_init(&hash);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_iterate(&hash, buf, count);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_finish(&hash, entropy_data);
- explicit_bzero(&hash, sizeof(hash));
- for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
- event.he_somecounter = (uint32_t)get_cyclecount();
- event.he_size = sizeof(event.he_entropy);
- event.he_bits = event.he_size/8;
- event.he_source = RANDOM_CACHED;
- event.he_destination = destination++; /* Harmless cheating */
- memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
- random_fortuna_process_event(&event);
- }
- explicit_bzero(entropy_data, sizeof(entropy_data));
-}
-
-void
-random_fortuna_reseed(void)
-{
-}
-
-int
+bool
random_fortuna_seeded(void)
{
diff --git a/sys/dev/random/other_algorithm.c b/sys/dev/random/other_algorithm.c
new file mode 100644
index 0000000..740e879
--- /dev/null
+++ b/sys/dev/random/other_algorithm.c
@@ -0,0 +1,209 @@
+/*-
+ * Copyright (c) 2015 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*-
+ * This is a skeleton for folks who wish to build a loadable module
+ * containing an alternative entropy-processing algorithm for random(4).
+ *
+ * The functions below should be completed with the appropriate code,
+ * and the nearby yarrow.c and fortuna.c may be consulted for examples
+ * of working code.
+ *
+ * The author is willing to provide reasonable help to those wishing to
+ * write such a module for themselves. Please use the markm@ FreeBSD
+ * email address, and ensure that you are developing this on a suitably
+ * supported branch (This is currently 11-CURRENT, and will be no
+ * older than 11-STABLE in the future).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/random.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>
+
+#include <crypto/rijndael/rijndael-api-fst.h>
+#include <crypto/sha2/sha2.h>
+
+#include <dev/random/hash.h>
+#include <dev/random/randomdev.h>
+#include <dev/random/random_harvestq.h>
+#include <dev/random/uint128.h>
+#include <dev/random/other_algorithm.h>
+
+static void random_other_pre_read(void);
+static void random_other_read(uint8_t *, u_int);
+static bool random_other_seeded(void);
+static void random_other_process_event(struct harvest_event *);
+static void random_other_init_alg(void *);
+static void random_other_deinit_alg(void *);
+
+/*
+ * RANDOM_OTHER_NPOOLS is used when reading hardware random
+ * number sources to ensure that each pool gets one read sample
+ * per loop iteration. Yarrow has 2 such pools (FAST and SLOW),
+ * and fortuna has 32 (0-31). The RNG used prior to Yarrow and
+ * ported from Linux had just 1 pool.
+ */
+#define RANDOM_OTHER_NPOOLS 1
+
+struct random_algorithm random_alg_context = {
+ .ra_ident = "other",
+ .ra_init_alg = random_other_init_alg,
+ .ra_deinit_alg = random_other_deinit_alg,
+ .ra_pre_read = random_other_pre_read,
+ .ra_read = random_other_read,
+ .ra_seeded = random_other_seeded,
+ .ra_event_processor = random_other_process_event,
+ .ra_poolcount = RANDOM_OTHER_NPOOLS,
+};
+
+/* Use a mutex to protect your reseed variables? */
+static mtx_t other_mtx;
+
+/*
+ * void random_other_init_alg(void *unused __unused)
+ *
+ * Do algorithm-specific initialisation here.
+ */
+void
+random_other_init_alg(void *unused __unused)
+{
+
+ RANDOM_RESEED_INIT_LOCK();
+ /*
+ * Do set-up work here!
+ */
+}
+
+/*
+ * void random_other_deinit_alg(void *unused __unused)
+ *
+ * Do algorithm-specific deinitialisation here.
+ */
+static void
+random_other_deinit_alg(void *unused __unused)
+{
+
+ /*
+ * Do tear-down work here!
+ */
+ RANDOM_RESEED_DEINIT_LOCK();
+}
+
+/*
+ * void random_other_pre_read(void)
+ *
+ * Do any pre-read preparation you need to. This will be called
+ * before >=1 calls to random_other_read() corresponding to one
+ * read(2).
+ *
+ * This routine will be called periodically while the generator is
+ * still blocked and a read is being attempted, giving you an
+ * opportunity to unblock.
+ */
+static void
+random_other_pre_read(void)
+{
+
+ RANDOM_RESEED_LOCK();
+ /*
+ * Do pre-read housekeeping work here!
+ * You may use this as a chance to unblock the generator.
+ */
+ RANDOM_RESEED_UNLOCK();
+}
+
+/*
+ * void random_other_read(uint8_t *buf, u_int count)
+ *
+ * Generate <count> bytes of output into <*buf>.
+ * You may use the fact that <count> will be a multiple of
+ * RANDOM_BLOCKSIZE for optimization purposes.
+ *
+ * This function will always be called with your generator
+ * unblocked and ready. If you are not ready to generate
+ * output here, then feel free to KASSERT() or panic().
+ */
+static void
+random_other_read(uint8_t *buf, u_int count)
+{
+
+ RANDOM_RESEED_LOCK();
+ /*
+ * Do random-number generation work here!
+ */
+ RANDOM_RESEED_UNLOCK();
+}
+
+/*
+ * bool random_other_seeded(void)
+ *
+ * Return true if your generator is ready to generate
+ * output, and false otherwise.
+ */
+static bool
+random_other_seeded(void)
+{
+ bool seeded = false;
+
+ /*
+ * Find out if your generator is seeded here!
+ */
+ return (seeded);
+}
+
+/*
+ * void random_other_process_event(struct harvest_event *event)
+ *
+ * Process one stochastic event <*event> into your entropy
+ * processor.
+ *
+ * The structure of the event may change, so it is easier to
+ * just grab the whole thing into your accumulation system.
+ * You may pick-and-choose bits, but please don't complain
+ * when/if these change.
+ */
+static void
+random_other_process_event(struct harvest_event *event)
+{
+
+ RANDOM_RESEED_LOCK();
+ /*
+ * Do entropy accumulation work here!
+ * You may use this as a chance to unblock the generator.
+ */
+ RANDOM_RESEED_UNLOCK();
+}
diff --git a/sys/dev/random/other_algorithm.h b/sys/dev/random/other_algorithm.h
new file mode 100644
index 0000000..8ca2bb8
--- /dev/null
+++ b/sys/dev/random/other_algorithm.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2015 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * This is a skeleton for folks who wish to build a loadable module
+ * containing an alternative entropy-processing algorithm for random(4).
+ *
+ * The functions below should be completed with the appropriate code,
+ * and the nearby yarrow.c and fortuna.c may be consulted for examples
+ * of working code.
+ *
+ * The author is willing to provide reasonable help to those wishing to
+ * write such a module for themselves. Please use the markm@ FreeBSD
+ * email address, and ensure that you are developing this on a suitably
+ * supported branch (This is currently 11-CURRENT, and will be no
+ * older than 11-STABLE in the future).
+ */
+
+#ifndef SYS_DEV_RANDOM_OTHER_H_INCLUDED
+#define SYS_DEV_RANDOM_OTHER_H_INCLUDED
+
+#ifdef _KERNEL
+typedef struct mtx mtx_t;
+#define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&other_mtx, "reseed mutex", NULL, MTX_DEF)
+#define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&other_mtx)
+#define RANDOM_RESEED_LOCK(x) mtx_lock(&other_mtx)
+#define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&other_mtx)
+#define RANDOM_RESEED_ASSERT_LOCK_OWNED(x) mtx_assert(&other_mtx, MA_OWNED)
+#else
+#define RANDOM_RESEED_INIT_LOCK(x) mtx_init(&other_mtx, mtx_plain)
+#define RANDOM_RESEED_DEINIT_LOCK(x) mtx_destroy(&other_mtx)
+#define RANDOM_RESEED_LOCK(x) mtx_lock(&other_mtx)
+#define RANDOM_RESEED_UNLOCK(x) mtx_unlock(&other_mtx)
+#define RANDOM_RESEED_ASSERT_LOCK_OWNED(x)
+#endif
+
+#endif /* SYS_DEV_RANDOM_OTHER_H_INCLUDED */
diff --git a/sys/dev/random/random_harvestq.c b/sys/dev/random/random_harvestq.c
index 34a809b..255136c 100644
--- a/sys/dev/random/random_harvestq.c
+++ b/sys/dev/random/random_harvestq.c
@@ -47,12 +47,21 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/unistd.h>
+#if defined(RANDOM_LOADABLE)
+#include <sys/lock.h>
+#include <sys/sx.h>
+#endif
+
+#include <machine/atomic.h>
#include <machine/cpu.h>
#include <dev/random/randomdev.h>
#include <dev/random/random_harvestq.h>
static void random_kthread(void);
+static void random_sources_feed(void);
+
+static u_int read_rate;
/* List for the dynamic sysctls */
static struct sysctl_ctx_list random_clist;
@@ -66,7 +75,7 @@ static struct sysctl_ctx_list random_clist;
#define RANDOM_RING_MAX 1024
#define RANDOM_ACCUM_MAX 8
-/* 1 to let the kernel thread run, 0 to terminate */
+/* 1 to let the kernel thread run, 0 to terminate, -1 to mark completion */
volatile int random_kthread_control;
/*
@@ -123,13 +132,18 @@ static struct kproc_desc random_proc_kp = {
&harvest_context.hc_kthread_proc,
};
-
/* Pass the given event straight through to Fortuna/Yarrow/Whatever. */
static __inline void
random_harvestq_fast_process_event(struct harvest_event *event)
{
- if (random_alg_context.ra_event_processor)
- random_alg_context.ra_event_processor(event);
+#if defined(RANDOM_LOADABLE)
+ RANDOM_CONFIG_S_LOCK();
+ if (p_random_alg_context)
+#endif
+ p_random_alg_context->ra_event_processor(event);
+#if defined(RANDOM_LOADABLE)
+ RANDOM_CONFIG_S_UNLOCK();
+#endif
}
static void
@@ -163,12 +177,58 @@ random_kthread(void)
/* XXX: FIX!! This is a *great* place to pass hardware/live entropy to random(9) */
tsleep_sbt(&harvest_context.hc_kthread_proc, 0, "-", SBT_1S/10, 0, C_PREL(1));
}
+ random_kthread_control = -1;
wakeup(&harvest_context.hc_kthread_proc);
kproc_exit(0);
/* NOTREACHED */
}
+/* This happens well after SI_SUB_RANDOM */
SYSINIT(random_device_h_proc, SI_SUB_CREATE_INIT, SI_ORDER_ANY, kproc_start, &random_proc_kp);
+/*
+ * Run through all fast sources reading entropy for the given
+ * number of rounds, which should be a multiple of the number
+ * of entropy accumulation pools in use; 2 for Yarrow and 32
+ * for Fortuna.
+ */
+static void
+random_sources_feed(void)
+{
+ uint32_t entropy[HARVESTSIZE];
+ struct random_sources *rrs;
+ u_int i, n, local_read_rate;
+
+ /*
+ * Step over all of live entropy sources, and feed their output
+ * to the system-wide RNG.
+ */
+#if defined(RANDOM_LOADABLE)
+ RANDOM_CONFIG_S_LOCK();
+ if (p_random_alg_context) {
+ /* It's an indenting error. Yeah, Yeah. */
+#endif
+ local_read_rate = atomic_readandclear_32(&read_rate);
+ LIST_FOREACH(rrs, &source_list, rrs_entries) {
+ for (i = 0; i < p_random_alg_context->ra_poolcount*(local_read_rate + 1); i++) {
+ n = rrs->rrs_source->rs_read(entropy, sizeof(entropy));
+ KASSERT((n > 0 && n <= sizeof(entropy)), ("very bad return from rs_read (= %d) in %s", n, __func__));
+ random_harvest_direct(entropy, n, (n*8)/2, rrs->rrs_source->rs_source);
+ }
+ }
+ explicit_bzero(entropy, sizeof(entropy));
+#if defined(RANDOM_LOADABLE)
+ }
+ RANDOM_CONFIG_S_UNLOCK();
+#endif
+}
+
+void
+read_rate_increment(u_int chunk)
+{
+
+ atomic_add_32(&read_rate, chunk);
+}
+
/* ARGSUSED */
RANDOM_CHECK_UINT(harvestmask, 0, RANDOM_HARVEST_EVERYTHING_MASK);
@@ -317,7 +377,8 @@ random_harvestq_deinit(void *unused __unused)
/* Command the hash/reseed thread to end and wait for it to finish */
random_kthread_control = 0;
- tsleep(&harvest_context.hc_kthread_proc, 0, "harvqterm", 0);
+ while (random_kthread_control >= 0)
+ tsleep(&harvest_context.hc_kthread_proc, 0, "harvqterm", hz/5);
sysctl_ctx_free(&random_clist);
}
SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_SECOND, random_harvestq_deinit, NULL);
@@ -412,3 +473,5 @@ random_harvest_direct(const void *entropy, u_int size, u_int bits, enum random_e
random_harvestq_fast_process_event(&event);
explicit_bzero(&event, sizeof(event));
}
+
+MODULE_VERSION(random_harvestq, 1);
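The random_harvestq.c hunks above also close a shutdown race with a three-state handshake: the controller writes 0, the kthread acknowledges with -1 before exiting, and the controller polls with a timeout instead of sleeping once and possibly missing the wakeup. A standalone sketch of the protocol, with hypothetical names:

static volatile int example_control = 1;	/* 1 run, 0 stop, -1 done */

static void
example_kthread(void)
{
	while (example_control > 0) {
		/* ... periodic harvesting work ... */
		tsleep_sbt(&example_control, 0, "-", SBT_1S / 10, 0,
		    C_PREL(1));
	}
	example_control = -1;		/* acknowledge termination */
	wakeup(&example_control);
	kproc_exit(0);
}

static void
example_shutdown(void)
{
	example_control = 0;
	while (example_control >= 0)	/* poll until acknowledged */
		tsleep(&example_control, 0, "exmterm", hz / 5);
}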
diff --git a/sys/dev/random/random_harvestq.h b/sys/dev/random/random_harvestq.h
index f1de86f..421b592 100644
--- a/sys/dev/random/random_harvestq.h
+++ b/sys/dev/random/random_harvestq.h
@@ -43,6 +43,8 @@ struct harvest_event {
uint8_t he_source; /* origin of the entropy */
} __packed;
+void read_rate_increment(u_int);
+
#define RANDOM_HARVESTQ_BOOT_ENTROPY_FILE "/boot/entropy"
#define RANDOM_HARVEST_INIT_LOCK(x) mtx_init(&harvest_context.hc_mtx, "entropy harvest mutex", NULL, MTX_SPIN)
diff --git a/sys/dev/random/random_infra.c b/sys/dev/random/random_infra.c
new file mode 100644
index 0000000..d31b84b
--- /dev/null
+++ b/sys/dev/random/random_infra.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2015 Mark R V Murray
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/random.h>
+#include <sys/sysctl.h>
+
+#if defined(RANDOM_LOADABLE)
+#include <sys/lock.h>
+#include <sys/sx.h>
+#endif
+
+#include <dev/random/randomdev.h>
+
+/* Set up the sysctl root node for the entropy device */
+SYSCTL_NODE(_kern, OID_AUTO, random, CTLFLAG_RW, 0, "Cryptographically Secure Random Number Generator");
+
+MALLOC_DEFINE(M_ENTROPY, "entropy", "Entropy harvesting buffers and data structures");
+
+struct sources_head source_list = LIST_HEAD_INITIALIZER(source_list);
+
+#if defined(RANDOM_LOADABLE)
+struct random_algorithm *p_random_alg_context = NULL;
+#else /* !defined(RANDOM_LOADABLE) */
+struct random_algorithm *p_random_alg_context = &random_alg_context;
+#endif /* defined(RANDOM_LOADABLE) */
+
+#if defined(RANDOM_LOADABLE)
+
+struct random_readers {
+ int (*read_random_uio)(struct uio *, bool);
+ u_int (*read_random)(void *, u_int);
+} random_reader_context = {
+ (int (*)(struct uio *, bool))nullop,
+ (u_int (*)(void *, u_int))nullop,
+};
+
+struct sx randomdev_config_lock;
+
+static void
+random_infra_sysinit(void *dummy __unused)
+{
+
+ RANDOM_CONFIG_INIT_LOCK();
+}
+SYSINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_FIRST, random_infra_sysinit, NULL);
+
+void
+random_infra_init(int (*p_random_read_uio)(struct uio *, bool), u_int (*p_random_read)(void *, u_int))
+{
+
+ RANDOM_CONFIG_X_LOCK();
+ random_reader_context.read_random_uio = p_random_read_uio;
+ random_reader_context.read_random = p_random_read;
+ RANDOM_CONFIG_X_UNLOCK();
+}
+
+void
+random_infra_uninit(void)
+{
+
+ RANDOM_CONFIG_X_LOCK();
+ random_reader_context.read_random_uio = (int (*)(struct uio *, bool))nullop;
+ random_reader_context.read_random = (u_int (*)(void *, u_int))nullop;
+ RANDOM_CONFIG_X_UNLOCK();
+}
+
+static void
+random_infra_sysuninit(void *dummy __unused)
+{
+
+ RANDOM_CONFIG_DEINIT_LOCK();
+}
+SYSUNINIT(random_device_h_init, SI_SUB_RANDOM, SI_ORDER_FIRST, random_infra_sysuninit, NULL);
+
+int
+read_random_uio(struct uio *uio, bool nonblock)
+{
+ int retval;
+
+ RANDOM_CONFIG_S_LOCK();
+ retval = random_reader_context.read_random_uio(uio, nonblock);
+ RANDOM_CONFIG_S_UNLOCK();
+ return (retval);
+}
+
+u_int
+read_random(void *buf, u_int len)
+{
+ u_int retval;
+
+ RANDOM_CONFIG_S_LOCK();
+ retval = random_reader_context.read_random(buf, len);
+ RANDOM_CONFIG_S_UNLOCK();
+ return (retval);
+}
+
+#endif /* defined(RANDOM_LOADABLE) */
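For illustration only, a minimal sketch of how a loadable algorithm would use these hooks; the example_* names are invented, not part of this patch (the in-tree caller is random_alg_context_ra_init_alg()/..._deinit_alg() in randomdev.c below).

    /* Hypothetical loadable-algorithm glue; example_* names are invented. */
    static int example_read_uio(struct uio *, bool);
    static u_int example_read(void *, u_int);

    static void
    example_alg_init(void *unused __unused)
    {
        /* Publish our readers; consumers keep calling read_random*(). */
        random_infra_init(example_read_uio, example_read);
    }

    static void
    example_alg_deinit(void *unused __unused)
    {
        /* Revert the readers to the nullop stubs. */
        random_infra_uninit();
    }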
diff --git a/sys/dev/random/randomdev.c b/sys/dev/random/randomdev.c
index 5c20c5d..f20a462 100644
--- a/sys/dev/random/randomdev.c
+++ b/sys/dev/random/randomdev.c
@@ -56,14 +56,18 @@ __FBSDID("$FreeBSD$");
#include <dev/random/randomdev.h>
#include <dev/random/random_harvestq.h>
-#include "opt_random.h"
+#define RANDOM_UNIT 0
-#if defined(RANDOM_DUMMY) && defined(RANDOM_YARROW)
-#error "Cannot define both RANDOM_DUMMY and RANDOM_YARROW"
+#if defined(RANDOM_LOADABLE)
+#define READ_RANDOM_UIO _read_random_uio
+#define READ_RANDOM _read_random
+static int READ_RANDOM_UIO(struct uio *, bool);
+static u_int READ_RANDOM(void *, u_int);
+#else
+#define READ_RANDOM_UIO read_random_uio
+#define READ_RANDOM read_random
#endif
-#define RANDOM_UNIT 0
-
/* Return the smallest number >= x that is a multiple of m */
#define CEIL_TO_MULTIPLE(x, m) ((((x) + (m) - 1)/(m))*(m))
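For example, CEIL_TO_MULTIPLE(5, 4) = ((5 + 3)/4)*4 = 8, while CEIL_TO_MULTIPLE(8, 4) = 8 already.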
@@ -84,68 +88,31 @@ static struct cdevsw random_cdevsw = {
/* For use with make_dev(9)/destroy_dev(9). */
static struct cdev *random_dev;
-/* Set up the sysctl root node for the entropy device */
-SYSCTL_NODE(_kern, OID_AUTO, random, CTLFLAG_RW, 0, "Cryptographically Secure Random Number Generator");
-
-MALLOC_DEFINE(M_ENTROPY, "entropy", "Entropy harvesting buffers and data structures");
-
-#if defined(RANDOM_DUMMY)
-
-/*-
- * Dummy "always block" pseudo algorithm, used when there is no real
- * random(4) driver to provide a CSPRNG.
- */
-
-static u_int
-dummy_random_zero(void)
-{
-
- return (0);
-}
-
-static void
-dummy_random(void)
-{
-}
-
-struct random_algorithm random_alg_context = {
- .ra_ident = "Dummy",
- .ra_init_alg = NULL,
- .ra_deinit_alg = NULL,
- .ra_pre_read = dummy_random,
- .ra_read = (random_alg_read_t *)dummy_random_zero,
- .ra_write = (random_alg_write_t *)dummy_random_zero,
- .ra_reseed = dummy_random,
- .ra_seeded = (random_alg_seeded_t *)dummy_random_zero,
- .ra_event_processor = NULL,
- .ra_poolcount = 0,
-};
-
-#else /* !defined(RANDOM_DUMMY) */
-
-LIST_HEAD(sources_head, random_sources);
-static struct sources_head source_list = LIST_HEAD_INITIALIZER(source_list);
-static u_int read_rate;
-
static void
random_alg_context_ra_init_alg(void *data)
{
- random_alg_context.ra_init_alg(data);
+ p_random_alg_context = &random_alg_context;
+ p_random_alg_context->ra_init_alg(data);
+#if defined(RANDOM_LOADABLE)
+ random_infra_init(READ_RANDOM_UIO, READ_RANDOM);
+#endif
}
static void
random_alg_context_ra_deinit_alg(void *data)
{
- random_alg_context.ra_deinit_alg(data);
+#if defined(RANDOM_LOADABLE)
+ random_infra_uninit();
+#endif
+ p_random_alg_context->ra_deinit_alg(data);
+ p_random_alg_context = NULL;
}
SYSINIT(random_device, SI_SUB_RANDOM, SI_ORDER_THIRD, random_alg_context_ra_init_alg, NULL);
SYSUNINIT(random_device, SI_SUB_RANDOM, SI_ORDER_THIRD, random_alg_context_ra_deinit_alg, NULL);
-#endif /* defined(RANDOM_DUMMY) */
-
static struct selinfo rsel;
/*
@@ -156,28 +123,28 @@ static int
randomdev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
- return (read_random_uio(uio, (flags & O_NONBLOCK) != 0));
+ return (READ_RANDOM_UIO(uio, (flags & O_NONBLOCK) != 0));
}
int
-read_random_uio(struct uio *uio, bool nonblock)
+READ_RANDOM_UIO(struct uio *uio, bool nonblock)
{
uint8_t *random_buf;
int error, spamcount;
ssize_t read_len, total_read, c;
random_buf = malloc(PAGE_SIZE, M_ENTROPY, M_WAITOK);
- random_alg_context.ra_pre_read();
+ p_random_alg_context->ra_pre_read();
error = 0;
spamcount = 0;
/* (Un)Blocking logic */
- while (!random_alg_context.ra_seeded()) {
+ while (!p_random_alg_context->ra_seeded()) {
if (nonblock) {
error = EWOULDBLOCK;
break;
}
/* keep tapping away at the pre-read until we seed/unblock. */
- random_alg_context.ra_pre_read();
+ p_random_alg_context->ra_pre_read();
/* Only bother the console every 10 seconds or so */
if (spamcount == 0)
printf("random: %s unblock wait\n", __func__);
@@ -187,10 +154,7 @@ read_random_uio(struct uio *uio, bool nonblock)
break;
}
if (error == 0) {
-#if !defined(RANDOM_DUMMY)
- /* XXX: FIX!! Next line as an atomic operation? */
- read_rate += (uio->uio_resid + sizeof(uint32_t))/sizeof(uint32_t);
-#endif
+ read_rate_increment((uio->uio_resid + sizeof(uint32_t))/sizeof(uint32_t));
total_read = 0;
while (uio->uio_resid && !error) {
read_len = uio->uio_resid;
@@ -203,7 +167,7 @@ read_random_uio(struct uio *uio, bool nonblock)
read_len = CEIL_TO_MULTIPLE(read_len, RANDOM_BLOCKSIZE);
/* Work in chunks page-sized or less */
read_len = MIN(read_len, PAGE_SIZE);
- random_alg_context.ra_read(random_buf, read_len);
+ p_random_alg_context->ra_read(random_buf, read_len);
c = MIN(uio->uio_resid, read_len);
error = uiomove(random_buf, c, uio);
total_read += c;
@@ -224,19 +188,16 @@ read_random_uio(struct uio *uio, bool nonblock)
* RANDOM_BLOCKSIZE bytes.
*/
u_int
-read_random(void *random_buf, u_int len)
+READ_RANDOM(void *random_buf, u_int len)
{
u_int read_len;
uint8_t local_buf[len + RANDOM_BLOCKSIZE];
KASSERT(random_buf != NULL, ("No suitable random buffer in %s", __func__));
- random_alg_context.ra_pre_read();
+ p_random_alg_context->ra_pre_read();
/* (Un)Blocking logic; if not seeded, return nothing. */
- if (random_alg_context.ra_seeded()) {
-#if !defined(RANDOM_DUMMY)
- /* XXX: FIX!! Next line as an atomic operation? */
- read_rate += (len + sizeof(uint32_t))/sizeof(uint32_t);
-#endif
+ if (p_random_alg_context->ra_seeded()) {
+ read_rate_increment((len + sizeof(uint32_t))/sizeof(uint32_t));
if (len > 0) {
/*
* Belt-and-braces.
@@ -244,7 +205,7 @@ read_random(void *random_buf, u_int len)
* which is what the underlying generator is expecting.
*/
read_len = CEIL_TO_MULTIPLE(len, RANDOM_BLOCKSIZE);
- random_alg_context.ra_read(local_buf, read_len);
+ p_random_alg_context->ra_read(local_buf, read_len);
memcpy(random_buf, local_buf, len);
}
} else
@@ -252,6 +213,37 @@ read_random(void *random_buf, u_int len)
return (len);
}
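A hypothetical caller, as a minimal sketch (the buffer name and size are illustrative): per the comment above, an unseeded generator returns nothing, so the return value must be checked.

    uint8_t key[32];

    if (read_random(key, sizeof(key)) != sizeof(key)) {
        /* Generator not seeded yet; handle the shortfall. */
    }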
+static __inline void
+randomdev_accumulate(uint8_t *buf, u_int count)
+{
+ static u_int destination = 0;
+ static struct harvest_event event;
+ static struct randomdev_hash hash;
+ static uint32_t entropy_data[RANDOM_KEYSIZE_WORDS];
+ uint32_t timestamp;
+ int i;
+
+ /* Extra timing here is helpful to scrape scheduler jitter entropy */
+ randomdev_hash_init(&hash);
+ timestamp = (uint32_t)get_cyclecount();
+ randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
+ randomdev_hash_iterate(&hash, buf, count);
+ timestamp = (uint32_t)get_cyclecount();
+ randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
+ randomdev_hash_finish(&hash, entropy_data);
+ explicit_bzero(&hash, sizeof(hash));
+ for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
+ event.he_somecounter = (uint32_t)get_cyclecount();
+ event.he_size = sizeof(event.he_entropy);
+ event.he_bits = event.he_size/8;
+ event.he_source = RANDOM_CACHED;
+ event.he_destination = destination++; /* Harmless cheating */
+ memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
+ p_random_alg_context->ra_event_processor(&event);
+ }
+ explicit_bzero(entropy_data, sizeof(entropy_data));
+}
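Design note: with ra_write and ra_reseed dropped from struct random_algorithm (see the randomdev.h hunk below), writes to /dev/random are no longer handed straight to the algorithm. randomdev_accumulate() hashes the user data bracketed by cyclecounter timestamps and injects the digest as ordinary RANDOM_CACHED harvest events, so untrusted input is accumulated like any other source rather than trusted directly.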
+
/* ARGSUSED */
static int
randomdev_write(struct cdev *dev __unused, struct uio *uio, int flags __unused)
@@ -267,7 +259,7 @@ randomdev_write(struct cdev *dev __unused, struct uio *uio, int flags __unused)
error = uiomove(random_buf, c, uio);
if (error)
break;
- random_alg_context.ra_write(random_buf, c);
+ randomdev_accumulate(random_buf, c);
tsleep(&random_alg_context, 0, "randwr", hz/10);
}
if (nbytes != uio->uio_resid && (error == ERESTART || error == EINTR))
@@ -283,7 +275,7 @@ randomdev_poll(struct cdev *dev __unused, int events, struct thread *td __unused
{
if (events & (POLLIN | POLLRDNORM)) {
- if (random_alg_context.ra_seeded())
+ if (p_random_alg_context->ra_seeded())
events &= (POLLIN | POLLRDNORM);
else
selrecord(td, &rsel);
@@ -325,9 +317,6 @@ randomdev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr __unused,
void
random_source_register(struct random_source *rsource)
{
-#if defined(RANDOM_DUMMY)
- (void)rsource;
-#else /* !defined(RANDOM_DUMMY) */
struct random_sources *rrs;
KASSERT(rsource != NULL, ("invalid input to %s", __func__));
@@ -337,15 +326,11 @@ random_source_register(struct random_source *rsource)
printf("random: registering fast source %s\n", rsource->rs_ident);
LIST_INSERT_HEAD(&source_list, rrs, rrs_entries);
-#endif /* defined(RANDOM_DUMMY) */
}
void
random_source_deregister(struct random_source *rsource)
{
-#if defined(RANDOM_DUMMY)
- (void)rsource;
-#else /* !defined(RANDOM_DUMMY) */
struct random_sources *rrs = NULL;
KASSERT(rsource != NULL, ("invalid input to %s", __func__));
@@ -356,41 +341,6 @@ random_source_deregister(struct random_source *rsource)
}
if (rrs != NULL)
free(rrs, M_ENTROPY);
-#endif /* defined(RANDOM_DUMMY) */
-}
-
-#if !defined(RANDOM_DUMMY)
-/*
- * Run through all fast sources reading entropy for the given
- * number of rounds, which should be a multiple of the number
- * of entropy accumulation pools in use; 2 for Yarrow and 32
- * for Fortuna.
- *
- * BEWARE!!!
- * This function runs inside the RNG thread! Don't do anything silly!
- */
-void
-random_sources_feed(void)
-{
- uint32_t entropy[HARVESTSIZE];
- struct random_sources *rrs;
- u_int i, n, local_read_rate;
-
- /*
- * Step over all of live entropy sources, and feed their output
- * to the system-wide RNG.
- */
- /* XXX: FIX!! Next lines as an atomic operation? */
- local_read_rate = read_rate;
- read_rate = RANDOM_ALG_READ_RATE_MINIMUM;
- LIST_FOREACH(rrs, &source_list, rrs_entries) {
- for (i = 0; i < random_alg_context.ra_poolcount*local_read_rate; i++) {
- n = rrs->rrs_source->rs_read(entropy, sizeof(entropy));
- KASSERT((n > 0 && n <= sizeof(entropy)), ("very bad return from rs_read (= %d) in %s", n, __func__));
- random_harvest_direct(entropy, n, (n*8)/2, rrs->rrs_source->rs_source);
- }
- }
- explicit_bzero(entropy, sizeof(entropy));
}
static int
@@ -414,7 +364,6 @@ random_source_handler(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern_random, OID_AUTO, random_sources, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, random_source_handler, "A",
"List of active fast entropy sources.");
-#endif /* !defined(RANDOM_DUMMY) */
/* ARGSUSED */
static int
@@ -449,3 +398,5 @@ static moduledata_t randomdev_mod = {
DECLARE_MODULE(random_device, randomdev_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_VERSION(random_device, 1);
+MODULE_DEPEND(random_device, crypto, 1, 1, 1);
+MODULE_DEPEND(random_device, random_harvestq, 1, 1, 1);
diff --git a/sys/dev/random/randomdev.h b/sys/dev/random/randomdev.h
index 799efb1..0f3b359 100644
--- a/sys/dev/random/randomdev.h
+++ b/sys/dev/random/randomdev.h
@@ -55,16 +55,15 @@ random_check_uint_##name(SYSCTL_HANDLER_ARGS) \
MALLOC_DECLARE(M_ENTROPY);
-#define RANDOM_ALG_READ_RATE_MINIMUM 32
-
#endif /* _KERNEL */
struct harvest_event;
+typedef void random_alg_init_t(void *);
+typedef void random_alg_deinit_t(void *);
typedef void random_alg_pre_read_t(void);
typedef void random_alg_read_t(uint8_t *, u_int);
-typedef void random_alg_write_t(uint8_t *, u_int);
-typedef int random_alg_seeded_t(void);
+typedef bool random_alg_seeded_t(void);
typedef void random_alg_reseed_t(void);
typedef void random_alg_eventprocessor_t(struct harvest_event *);
@@ -81,13 +80,11 @@ struct random_algorithm {
void (*ra_deinit_alg)(void *);
random_alg_pre_read_t *ra_pre_read;
random_alg_read_t *ra_read;
- random_alg_write_t *ra_write;
- random_alg_reseed_t *ra_reseed;
random_alg_seeded_t *ra_seeded;
random_alg_eventprocessor_t *ra_event_processor;
};
-extern struct random_algorithm random_alg_context;
+extern struct random_algorithm random_alg_context, *p_random_alg_context;
#ifdef _KERNEL
@@ -97,22 +94,33 @@ extern struct random_algorithm random_alg_context;
* upon request.
*/
struct random_source {
- const char *rs_ident;
- enum random_entropy_source rs_source;
- random_source_read_t *rs_read;
+ const char *rs_ident;
+ enum random_entropy_source rs_source;
+ random_source_read_t *rs_read;
};
-#if !defined(RANDOM_DUMMY)
struct random_sources {
- LIST_ENTRY(random_sources) rrs_entries;
- struct random_source *rrs_source;
+ LIST_ENTRY(random_sources) rrs_entries;
+ struct random_source *rrs_source;
};
-#endif /* !defined(RANDOM_DUMMY) */
+
+LIST_HEAD(sources_head, random_sources);
+extern struct sources_head source_list;
void random_source_register(struct random_source *);
void random_source_deregister(struct random_source *);
-void random_sources_feed(void);
+#if defined(RANDOM_LOADABLE)
+extern struct sx randomdev_config_lock;
+#define RANDOM_CONFIG_INIT_LOCK(x) sx_init(&randomdev_config_lock, "configuration change lock")
+#define RANDOM_CONFIG_X_LOCK(x) sx_xlock(&randomdev_config_lock)
+#define RANDOM_CONFIG_X_UNLOCK(x) sx_xunlock(&randomdev_config_lock)
+#define RANDOM_CONFIG_S_LOCK(x) sx_slock(&randomdev_config_lock)
+#define RANDOM_CONFIG_S_UNLOCK(x) sx_sunlock(&randomdev_config_lock)
+#define RANDOM_CONFIG_DEINIT_LOCK(x) sx_destroy(&randomdev_config_lock)
+void random_infra_init(int (*)(struct uio *, bool), u_int (*)(void *, u_int));
+void random_infra_uninit(void);
+#endif
#endif /* _KERNEL */
diff --git a/sys/dev/random/randomdev_none.c b/sys/dev/random/randomdev_none.c
deleted file mode 100644
index ee5cbf2..0000000
--- a/sys/dev/random/randomdev_none.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*-
- * Copyright (c) 2015 Mark R V Murray
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer
- * in this position and unchanged.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <sys/param.h>
-#include <sys/malloc.h>
-#include <sys/random.h>
-#include <sys/systm.h>
-
-#include <dev/random/randomdev.h>
-
-#include "opt_random.h"
-
-#if defined(RANDOM_DUMMY) || defined(RANDOM_YARROW)
-#error "Cannot define any of RANDOM_DUMMY and RANDOM_YARROW without 'device random'"
-#endif
-
-/*-
- * Dummy "not even here" device. Stub out all routines that the kernel would need.
- */
-
-/* ARGSUSED */
-u_int
-read_random(void *random_buf __unused, u_int len __unused)
-{
-
- return (0);
-}
-
-/* ARGSUSED */
-void
-random_harvest_direct(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused)
-{
-}
-
-/* ARGSUSED */
-void
-random_harvest_queue(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused)
-{
-}
-
-/* ARGSUSED */
-void
-random_harvest_fast(const void *entropy __unused, u_int count __unused, u_int bits __unused, enum random_entropy_source origin __unused)
-{
-}
diff --git a/sys/dev/random/unit_test.c b/sys/dev/random/unit_test.c
index 7ae5716..fac4c8d 100644
--- a/sys/dev/random/unit_test.c
+++ b/sys/dev/random/unit_test.c
@@ -46,6 +46,7 @@ Where <alg> is YARROW or FORTUNA.
#include <sys/types.h>
#include <inttypes.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <threads.h>
@@ -172,35 +173,6 @@ RunHarvester(void *arg __unused)
}
static int
-WriteCSPRNG(void *threadid)
-{
- uint8_t *buf;
- int i;
-
- printf("Thread #1 starts\n");
-
- for (i = 0; ; i++) {
- if (stopseeding)
- break;
- buf = malloc(4096);
- if (i % 1000 == 0)
- printf("Thread write 1 - %d\n", i);
- if (buf != NULL) {
- printf("Thread 1 writing.\n");
- random_alg_context.ra_write(buf, i);
- free(buf);
- }
- usleep(1000000);
- }
-
- printf("Thread #1 ends\n");
-
- thrd_exit(0);
-
- return (0);
-}
-
-static int
ReadCSPRNG(void *threadid)
{
size_t tid, zsize;
@@ -271,7 +243,7 @@ main(int argc, char *argv[])
for (t = 0; t < NUM_THREADS; t++) {
printf("In main: creating thread %ld\n", t);
- rc = thrd_create(&threads[t], (t == 0 ? RunHarvester : (t == 1 ? WriteCSPRNG : ReadCSPRNG)), NULL);
+ rc = thrd_create(&threads[t], (t == 0 ? RunHarvester : ReadCSPRNG), NULL);
if (rc != thrd_success) {
printf("ERROR; return code from thrd_create() is %d\n", rc);
exit(-1);
diff --git a/sys/dev/random/yarrow.c b/sys/dev/random/yarrow.c
index d6ebd46..2ef15a4 100644
--- a/sys/dev/random/yarrow.c
+++ b/sys/dev/random/yarrow.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <dev/random/yarrow.h>
#else /* !_KERNEL */
#include <inttypes.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
@@ -92,7 +93,7 @@ static struct yarrow_state {
u_int ysp_thresh; /* pool reseed threshold */
struct randomdev_hash ysp_hash; /* accumulated entropy */
} ys_pool[RANDOM_YARROW_NPOOLS];/* pool[0] is fast, pool[1] is slow */
- int ys_seeded;
+ bool ys_seeded;
/* Reseed lock */
mtx_t ys_mtx;
} yarrow_state;
@@ -108,9 +109,7 @@ RANDOM_CHECK_UINT(slowoverthresh, 1, 5);
static void random_yarrow_pre_read(void);
static void random_yarrow_read(uint8_t *, u_int);
-static void random_yarrow_write(uint8_t *, u_int);
-static void random_yarrow_reseed(void);
-static int random_yarrow_seeded(void);
+static bool random_yarrow_seeded(void);
static void random_yarrow_process_event(struct harvest_event *);
static void random_yarrow_init_alg(void *);
static void random_yarrow_deinit_alg(void *);
@@ -123,8 +122,6 @@ struct random_algorithm random_alg_context = {
.ra_deinit_alg = random_yarrow_deinit_alg,
.ra_pre_read = random_yarrow_pre_read,
.ra_read = random_yarrow_read,
- .ra_write = random_yarrow_write,
- .ra_reseed = random_yarrow_reseed,
.ra_seeded = random_yarrow_seeded,
.ra_event_processor = random_yarrow_process_event,
.ra_poolcount = RANDOM_YARROW_NPOOLS,
@@ -141,7 +138,7 @@ random_yarrow_init_alg(void *unused __unused)
RANDOM_RESEED_INIT_LOCK();
/* Start unseeded, therefore blocked. */
- yarrow_state.ys_seeded = 0;
+ yarrow_state.ys_seeded = false;
#ifdef _KERNEL
/*
* Yarrow parameters. Do not adjust these unless you have
@@ -266,12 +263,14 @@ random_yarrow_reseed_internal(u_int fastslow)
RANDOM_RESEED_ASSERT_LOCK_OWNED();
#ifdef RANDOM_DEBUG
/* WARNING! This is dangerously tedious to do with mutexes held! */
- printf("random: %s %s seeded = %d\n", __func__, (fastslow == RANDOM_YARROW_FAST ? "RANDOM_YARROW_FAST" : "RANDOM_YARROW_SLOW"), yarrow_state.ys_seeded);
- printf("random: %s - fast - thresh %d,1 - ", __func__, yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_thresh);
+ printf("random: %s ", __func__);
+ printf("type/pool = %s ", fastslow == RANDOM_YARROW_FAST ? "RANDOM_YARROW_FAST" : "RANDOM_YARROW_SLOW");
+ printf("seeded = %s\n", yarrow_state.ys_seeded ? "true" : "false");
+ printf("random: fast - thresh %d,1 - ", yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_thresh);
for (i = RANDOM_START; i < ENTROPYSOURCE; i++)
printf(" %d", yarrow_state.ys_pool[RANDOM_YARROW_FAST].ysp_source_bits[i]);
printf("\n");
- printf("random: %s - slow - thresh %d,%d - ", __func__, yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_thresh, yarrow_state.ys_slowoverthresh);
+ printf("random: slow - thresh %d,%d - ", yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_thresh, yarrow_state.ys_slowoverthresh);
for (i = RANDOM_START; i < ENTROPYSOURCE; i++)
printf(" %d", yarrow_state.ys_pool[RANDOM_YARROW_SLOW].ysp_source_bits[i]);
printf("\n");
@@ -338,7 +337,7 @@ random_yarrow_reseed_internal(u_int fastslow)
#endif
/* Unblock the device if it was blocked due to being unseeded */
if (!yarrow_state.ys_seeded) {
- yarrow_state.ys_seeded = 1;
+ yarrow_state.ys_seeded = true;
randomdev_unblock();
}
}
@@ -395,47 +394,7 @@ random_yarrow_read(uint8_t *buf, u_int bytecount)
RANDOM_RESEED_UNLOCK();
}
-/* Internal function to hand external entropy to the PRNG. */
-void
-random_yarrow_write(uint8_t *buf, u_int count)
-{
- static u_int destination = 0;
- static struct harvest_event event;
- struct randomdev_hash hash;
- uint32_t entropy_data[RANDOM_KEYSIZE_WORDS], timestamp;
- int i;
-
- /* Extra timing here is helpful to scrape scheduler timing entropy */
- randomdev_hash_init(&hash);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_iterate(&hash, buf, count);
- timestamp = (uint32_t)get_cyclecount();
- randomdev_hash_iterate(&hash, &timestamp, sizeof(timestamp));
- randomdev_hash_finish(&hash, entropy_data);
- explicit_bzero(&hash, sizeof(hash));
- for (i = 0; i < RANDOM_KEYSIZE_WORDS; i += sizeof(event.he_entropy)/sizeof(event.he_entropy[0])) {
- event.he_somecounter = (uint32_t)get_cyclecount();
- event.he_size = sizeof(event.he_entropy);
- event.he_bits = event.he_size/8;
- event.he_source = RANDOM_CACHED;
- event.he_destination = destination++; /* Harmless cheating */
- memcpy(event.he_entropy, entropy_data + i, sizeof(event.he_entropy));
- random_yarrow_process_event(&event);
- }
- explicit_bzero(entropy_data, sizeof(entropy_data));
-}
-
-void
-random_yarrow_reseed(void)
-{
-
- RANDOM_RESEED_LOCK();
- random_yarrow_reseed_internal(RANDOM_YARROW_SLOW);
- RANDOM_RESEED_UNLOCK();
-}
-
-int
+bool
random_yarrow_seeded(void)
{
diff --git a/sys/dev/usb/controller/dwc_otg.c b/sys/dev/usb/controller/dwc_otg.c
index bd3e51b..e018ab5 100644
--- a/sys/dev/usb/controller/dwc_otg.c
+++ b/sys/dev/usb/controller/dwc_otg.c
@@ -1,7 +1,7 @@
/* $FreeBSD$ */
/*-
* Copyright (c) 2015 Daisuke Aoyama. All rights reserved.
- * Copyright (c) 2012 Hans Petter Selasky. All rights reserved.
+ * Copyright (c) 2012-2015 Hans Petter Selasky. All rights reserved.
* Copyright (c) 2010-2011 Aleksandr Rybalko. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -597,14 +597,18 @@ dwc_otg_clear_hcint(struct dwc_otg_softc *sc, uint8_t x)
}
static uint8_t
-dwc_otg_host_check_fifo_empty(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+dwc_otg_host_check_tx_fifo_empty(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
uint32_t temp;
temp = DWC_OTG_READ_4(sc, DOTG_GINTSTS);
- if (td->ep_type == UE_INTERRUPT ||
- td->ep_type == UE_ISOCHRONOUS) {
+ if (td->ep_type == UE_ISOCHRONOUS) {
+ /*
+ * NOTE: USB INTERRUPT transactions are executed like
+ * USB CONTROL transactions! See the setup standard
+ * chain function for more information.
+ */
if (!(temp & GINTSTS_PTXFEMP)) {
DPRINTF("Periodic TX FIFO is not empty\n");
if (!(sc->sc_irq_mask & GINTMSK_PTXFEMPMSK)) {
@@ -631,8 +635,10 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc,
struct dwc_otg_td *td, uint8_t is_out)
{
uint8_t x;
+ uint8_t y;
+ uint8_t z;
- if (td->channel < DWC_OTG_MAX_CHANNELS)
+ if (td->channel[0] < DWC_OTG_MAX_CHANNELS)
return (0); /* already allocated */
/* check if device is suspended */
@@ -641,20 +647,42 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc,
/* compute needed TX FIFO size */
if (is_out != 0) {
- if (dwc_otg_host_check_fifo_empty(sc, td) != 0)
+ if (dwc_otg_host_check_tx_fifo_empty(sc, td) != 0)
return (1); /* busy - cannot transfer data */
}
-
- for (x = 0; x != sc->sc_host_ch_max; x++) {
+ z = td->max_packet_count;
+ for (x = y = 0; x != sc->sc_host_ch_max; x++) {
/* check if channel is allocated */
if (sc->sc_chan_state[x].allocated != 0)
continue;
/* check if channel is still enabled */
if (sc->sc_chan_state[x].wait_halted != 0)
continue;
+ /* store channel number */
+ td->channel[y++] = x;
+ /* check if we got all channels */
+ if (y == z)
+ break;
+ }
+ if (y != z) {
+ /* reset channel variable */
+ td->channel[0] = DWC_OTG_MAX_CHANNELS;
+ td->channel[1] = DWC_OTG_MAX_CHANNELS;
+ td->channel[2] = DWC_OTG_MAX_CHANNELS;
+ /* wait a bit */
+ dwc_otg_enable_sof_irq(sc);
+ return (1); /* busy - not enough channels */
+ }
+
+ for (y = 0; y != z; y++) {
+ x = td->channel[y];
+ /* set allocated */
sc->sc_chan_state[x].allocated = 1;
+ /* set wait halted */
+ sc->sc_chan_state[x].wait_halted = 1;
+
/* clear interrupts */
dwc_otg_clear_hcint(sc, x);
@@ -663,29 +691,22 @@ dwc_otg_host_channel_alloc(struct dwc_otg_softc *sc,
/* set active channel */
sc->sc_active_rx_ep |= (1 << x);
-
- /* set channel */
- td->channel = x;
-
- return (0); /* allocated */
}
- /* wait a bit */
- dwc_otg_enable_sof_irq(sc);
- return (1); /* busy */
+ return (0); /* allocated */
}
static void
-dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+dwc_otg_host_channel_free_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td, uint8_t index)
{
uint32_t hcchar;
uint8_t x;
- if (td->channel >= DWC_OTG_MAX_CHANNELS)
+ if (td->channel[index] >= DWC_OTG_MAX_CHANNELS)
return; /* already freed */
/* free channel */
- x = td->channel;
- td->channel = DWC_OTG_MAX_CHANNELS;
+ x = td->channel[index];
+ td->channel[index] = DWC_OTG_MAX_CHANNELS;
DPRINTF("CH=%d\n", x);
@@ -704,26 +725,42 @@ dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
/* clear active channel */
sc->sc_active_rx_ep &= ~(1 << x);
+ /* check if already halted */
+ if (sc->sc_chan_state[x].wait_halted == 0)
+ return;
+
/* disable host channel */
hcchar = DWC_OTG_READ_4(sc, DOTG_HCCHAR(x));
if (hcchar & HCCHAR_CHENA) {
DPRINTF("Halting channel %d\n", x);
DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(x),
hcchar | HCCHAR_CHDIS);
- sc->sc_chan_state[x].wait_halted = 1;
/* don't write HCCHAR until the channel is halted */
+ } else {
+ sc->sc_chan_state[x].wait_halted = 0;
}
}
static void
+dwc_otg_host_channel_free(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+{
+ uint8_t x;
+ for (x = 0; x != td->max_packet_count; x++)
+ dwc_otg_host_channel_free_sub(sc, td, x);
+}
+
+static void
dwc_otg_host_dump_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
+ uint8_t x;
/* dump any pending messages */
- if (sc->sc_last_rx_status != 0) {
- if (td->channel < DWC_OTG_MAX_CHANNELS &&
- td->channel == GRXSTSRD_CHNUM_GET(sc->sc_last_rx_status)) {
- dwc_otg_common_rx_ack(sc);
- }
+ if (sc->sc_last_rx_status == 0)
+ return;
+ for (x = 0; x != td->max_packet_count; x++) {
+ if (td->channel[x] >= DWC_OTG_MAX_CHANNELS ||
+ td->channel[x] != GRXSTSRD_CHNUM_GET(sc->sc_last_rx_status))
+ continue;
+ dwc_otg_common_rx_ack(sc);
}
}
@@ -737,13 +774,13 @@ dwc_otg_host_setup_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
dwc_otg_host_dump_rx(sc, td);
- if (td->channel < DWC_OTG_MAX_CHANNELS) {
- hcint = sc->sc_chan_state[td->channel].hcint;
+ if (td->channel[0] < DWC_OTG_MAX_CHANNELS) {
+ hcint = sc->sc_chan_state[td->channel[0]].hcint;
DPRINTF("CH=%d ST=%d HCINT=0x%08x HCCHAR=0x%08x HCTSIZ=0x%08x\n",
- td->channel, td->state, hcint,
- DWC_OTG_READ_4(sc, DOTG_HCCHAR(td->channel)),
- DWC_OTG_READ_4(sc, DOTG_HCTSIZ(td->channel)));
+ td->channel[0], td->state, hcint,
+ DWC_OTG_READ_4(sc, DOTG_HCCHAR(td->channel[0])),
+ DWC_OTG_READ_4(sc, DOTG_HCTSIZ(td->channel[0])));
} else {
hcint = 0;
goto check_state;
@@ -753,12 +790,12 @@ dwc_otg_host_setup_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
HCINT_ACK | HCINT_NYET)) {
/* give success bits priority over failure bits */
} else if (hcint & HCINT_STALL) {
- DPRINTF("CH=%d STALL\n", td->channel);
+ DPRINTF("CH=%d STALL\n", td->channel[0]);
td->error_stall = 1;
td->error_any = 1;
goto complete;
} else if (hcint & HCINT_ERRORS) {
- DPRINTF("CH=%d ERROR\n", td->channel);
+ DPRINTF("CH=%d ERROR\n", td->channel[0]);
td->errcnt++;
if (td->hcsplt != 0 || td->errcnt >= 3) {
td->error_any = 1;
@@ -863,23 +900,23 @@ send_pkt:
usbd_copy_out(td->pc, 0, &req, sizeof(req));
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel),
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel[0]),
(sizeof(req) << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_SETUP << HCTSIZ_PID_SHIFT));
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel), td->hcsplt);
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel[0]), td->hcsplt);
hcchar = td->hcchar;
hcchar &= ~(HCCHAR_EPDIR_IN | HCCHAR_EPTYPE_MASK);
hcchar |= UE_CONTROL << HCCHAR_EPTYPE_SHIFT;
/* must enable channel before writing data to FIFO */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel), hcchar);
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel[0]), hcchar);
/* transfer data into FIFO */
bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl,
- DOTG_DFIFO(td->channel), (uint32_t *)&req, sizeof(req) / 4);
+ DOTG_DFIFO(td->channel[0]), (uint32_t *)&req, sizeof(req) / 4);
/* wait until next slot before trying complete split */
td->tt_complete_slot = sc->sc_last_frame_num + 1;
@@ -916,17 +953,17 @@ send_cpkt:
td->hcsplt |= HCSPLT_COMPSPLT;
td->state = DWC_CHAN_ST_WAIT_C_ANE;
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel),
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(td->channel[0]),
(HCTSIZ_PID_SETUP << HCTSIZ_PID_SHIFT));
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel), td->hcsplt);
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(td->channel[0]), td->hcsplt);
hcchar = td->hcchar;
hcchar &= ~(HCCHAR_EPDIR_IN | HCCHAR_EPTYPE_MASK);
hcchar |= UE_CONTROL << HCCHAR_EPTYPE_SHIFT;
/* must enable channel before writing data to FIFO */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel), hcchar);
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(td->channel[0]), hcchar);
busy:
return (1); /* busy */
@@ -1060,50 +1097,51 @@ dwc_otg_host_rate_check_interrupt(struct dwc_otg_softc *sc, struct dwc_otg_td *t
static uint8_t
dwc_otg_host_rate_check(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
+ uint8_t frame_num = (uint8_t)sc->sc_last_frame_num;
+
if (td->ep_type == UE_ISOCHRONOUS) {
/* non TT isochronous traffic */
- if ((td->tmr_val != 0) ||
- (sc->sc_last_frame_num & (td->tmr_res - 1))) {
+ if (frame_num & (td->tmr_res - 1))
goto busy;
- }
- td->tmr_val = 1; /* executed */
+ if ((frame_num ^ td->tmr_val) & td->tmr_res)
+ goto busy;
+ td->tmr_val = td->tmr_res + sc->sc_last_frame_num;
td->toggle = 0;
-
+ return (0);
} else if (td->ep_type == UE_INTERRUPT) {
if (!td->tt_scheduled)
goto busy;
td->tt_scheduled = 0;
+ return (0);
} else if (td->did_nak != 0) {
- uint8_t frame_num = (uint8_t)sc->sc_last_frame_num;
/* check if we should pause sending queries for 125us */
if (td->tmr_res == frame_num) {
/* wait a bit */
dwc_otg_enable_sof_irq(sc);
goto busy;
}
- /* query for data one more time */
- td->tmr_res = frame_num;
- td->did_nak = 0;
} else if (td->set_toggle) {
td->set_toggle = 0;
td->toggle = 1;
}
+ /* query for data one more time */
+ td->tmr_res = frame_num;
+ td->did_nak = 0;
return (0);
busy:
return (1);
}
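The check frame_num & (td->tmr_res - 1) relies on td->tmr_res being a power of two for isochronous endpoints; that holds because the setup code below assigns td->tmr_res = 1 in one branch and 1 << usbd_xfer_get_fps_shift(xfer) in the other, both powers of two.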
static uint8_t
-dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
+dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td,
+ uint8_t channel)
{
uint32_t count;
- uint8_t channel;
/* check endpoint status */
if (sc->sc_last_rx_status == 0)
goto busy;
- channel = td->channel;
if (channel >= DWC_OTG_MAX_CHANNELS)
goto busy;
@@ -1128,21 +1166,22 @@ dwc_otg_host_data_rx_sub(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
/* get the packet byte count */
count = GRXSTSRD_BCNT_GET(sc->sc_last_rx_status);
- /* check for isochronous transfer or high-speed bandwidth endpoint */
- if (td->ep_type == UE_ISOCHRONOUS || td->max_packet_count > 1) {
- if ((sc->sc_last_rx_status & GRXSTSRD_DPID_MASK) != GRXSTSRD_DPID_DATA0) {
+ /* check for ISOCHRONOUS endpoint */
+ if (td->ep_type == UE_ISOCHRONOUS) {
+ if ((sc->sc_last_rx_status & GRXSTSRD_DPID_MASK) !=
+ GRXSTSRD_DPID_DATA0) {
+ /* more data to be received */
td->tt_xactpos = HCSPLT_XACTPOS_MIDDLE;
} else {
+ /* all data received */
td->tt_xactpos = HCSPLT_XACTPOS_BEGIN;
-
/* verify the packet byte count */
- if (count < td->max_packet_size) {
+ if (count != td->remainder) {
/* we have a short packet */
td->short_pkt = 1;
td->got_short = 1;
}
}
- td->toggle = 0;
} else {
/* verify the packet byte count */
if (count != td->max_packet_size) {
@@ -1194,15 +1233,17 @@ complete:
static uint8_t
dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
{
- uint32_t hcint;
+ uint32_t hcint = 0;
uint32_t hcchar;
uint8_t delta;
uint8_t channel;
+ uint8_t x;
- channel = td->channel;
-
- if (channel < DWC_OTG_MAX_CHANNELS) {
- hcint = sc->sc_chan_state[channel].hcint;
+ for (x = 0; x != td->max_packet_count; x++) {
+ channel = td->channel[x];
+ if (channel >= DWC_OTG_MAX_CHANNELS)
+ continue;
+ hcint |= sc->sc_chan_state[channel].hcint;
DPRINTF("CH=%d ST=%d HCINT=0x%08x HCCHAR=0x%08x HCTSIZ=0x%08x\n",
channel, td->state, hcint,
@@ -1230,19 +1271,17 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
}
/* check channels for data, if any */
- if (dwc_otg_host_data_rx_sub(sc, td))
+ if (dwc_otg_host_data_rx_sub(sc, td, channel))
goto complete;
/* refresh interrupt status */
- hcint = sc->sc_chan_state[channel].hcint;
+ hcint |= sc->sc_chan_state[channel].hcint;
if (hcint & (HCINT_ERRORS | HCINT_RETRY |
HCINT_ACK | HCINT_NYET)) {
if (!(hcint & HCINT_ERRORS))
td->errcnt = 0;
}
- } else {
- hcint = 0;
}
switch (td->state) {
@@ -1269,6 +1308,8 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
td->toggle ^= 1;
goto receive_pkt;
}
+ } else if (td->ep_type == UE_ISOCHRONOUS) {
+ goto complete;
}
td->did_nak = 1;
td->tt_scheduled = 0;
@@ -1292,12 +1333,12 @@ dwc_otg_host_data_rx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
if (td->ep_type == UE_ISOCHRONOUS) {
/* check if we are complete */
- if ((td->remainder == 0) ||
- (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN)) {
+ if (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN) {
goto complete;
+ } else {
+ /* get more packets */
+ goto busy;
}
- /* get another packet */
- goto receive_pkt;
} else {
/* check if we are complete */
if ((td->remainder == 0) || (td->got_short != 0)) {
@@ -1365,8 +1406,7 @@ receive_pkt:
}
/* complete split */
td->hcsplt |= HCSPLT_COMPSPLT;
- } else if (td->tt_xactpos == HCSPLT_XACTPOS_BEGIN &&
- dwc_otg_host_rate_check(sc, td)) {
+ } else if (dwc_otg_host_rate_check(sc, td)) {
td->state = DWC_CHAN_ST_WAIT_C_PKT;
goto busy;
}
@@ -1377,8 +1417,6 @@ receive_pkt:
goto busy;
}
- channel = td->channel;
-
/* set toggle, if any */
if (td->set_toggle) {
td->set_toggle = 0;
@@ -1387,28 +1425,31 @@ receive_pkt:
td->state = DWC_CHAN_ST_WAIT_ANE;
- /* receive one packet */
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
- (td->max_packet_size << HCTSIZ_XFERSIZE_SHIFT) |
- (1 << HCTSIZ_PKTCNT_SHIFT) |
- (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
- (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
+ for (x = 0; x != td->max_packet_count; x++) {
+ channel = td->channel[x];
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
+ /* receive one packet */
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
+ (td->max_packet_size << HCTSIZ_XFERSIZE_SHIFT) |
+ (1 << HCTSIZ_PKTCNT_SHIFT) |
+ (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
+ (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
- hcchar = td->hcchar;
- hcchar |= HCCHAR_EPDIR_IN;
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
- /* receive complete split ASAP */
- if ((sc->sc_last_frame_num & 1) != 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
- hcchar |= HCCHAR_ODDFRM;
- else
- hcchar &= ~HCCHAR_ODDFRM;
+ hcchar = td->hcchar;
+ hcchar |= HCCHAR_EPDIR_IN;
- /* must enable channel before data can be received */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
+ /* receive complete split ASAP */
+ if ((sc->sc_last_frame_num & 1) != 0 &&
+ td->ep_type == UE_ISOCHRONOUS)
+ hcchar |= HCCHAR_ODDFRM;
+ else
+ hcchar &= ~HCCHAR_ODDFRM;
+ /* must enable channel before data can be received */
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
+ }
/* wait until next slot before trying complete split */
td->tt_complete_slot = sc->sc_last_frame_num + 1;
@@ -1437,7 +1478,7 @@ receive_spkt:
goto busy;
}
- channel = td->channel;
+ channel = td->channel[0];
td->hcsplt &= ~HCSPLT_COMPSPLT;
td->state = DWC_CHAN_ST_WAIT_S_ANE;
@@ -1450,7 +1491,7 @@ receive_spkt:
/* send after next SOF event */
if ((sc->sc_last_frame_num & 1) == 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
+ td->ep_type == UE_ISOCHRONOUS)
td->hcchar |= HCCHAR_ODDFRM;
else
td->hcchar &= ~HCCHAR_ODDFRM;
@@ -1605,10 +1646,12 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
uint32_t hcchar;
uint8_t delta;
uint8_t channel;
+ uint8_t x;
dwc_otg_host_dump_rx(sc, td);
- channel = td->channel;
+ /* check that last channel is complete */
+ channel = td->channel[td->npkt];
if (channel < DWC_OTG_MAX_CHANNELS) {
hcint = sc->sc_chan_state[channel].hcint;
@@ -1658,7 +1701,11 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
td->offset += td->tx_bytes;
td->remainder -= td->tx_bytes;
td->toggle ^= 1;
- td->did_nak = 0;
+ /* check if next response will be a NAK */
+ if (hcint & HCINT_NYET)
+ td->did_nak = 1;
+ else
+ td->did_nak = 0;
td->tt_scheduled = 0;
/* check remainder */
@@ -1715,33 +1762,13 @@ dwc_otg_host_data_tx(struct dwc_otg_softc *sc, struct dwc_otg_td *td)
goto send_cpkt;
case DWC_CHAN_ST_TX_WAIT_ISOC:
-
- /* Check if isochronous OUT traffic is complete */
+ /* Check if ISOCHRONOUS OUT traffic is complete */
if ((hcint & HCINT_HCH_DONE_MASK) == 0)
break;
td->offset += td->tx_bytes;
td->remainder -= td->tx_bytes;
-
- if (td->hcsplt != 0 || td->remainder == 0)
- goto complete;
-
- /* check for next packet */
- if (td->max_packet_count > 1)
- td->tt_xactpos++;
-
- /* free existing channel, if any */
- dwc_otg_host_channel_free(sc, td);
-
- td->state = DWC_CHAN_ST_TX_PKT_ISOC;
-
- /* FALLTHROUGH */
-
- case DWC_CHAN_ST_TX_PKT_ISOC:
- if (dwc_otg_host_channel_alloc(sc, td, 1))
- break;
- channel = td->channel;
- goto send_isoc_pkt;
+ goto complete;
default:
break;
}
@@ -1775,8 +1802,6 @@ send_pkt:
goto busy;
}
- channel = td->channel;
-
/* set toggle, if any */
if (td->set_toggle) {
td->set_toggle = 0;
@@ -1784,8 +1809,7 @@ send_pkt:
}
if (td->ep_type == UE_ISOCHRONOUS) {
-send_isoc_pkt:
- /* Isochronous OUT transfers don't have any ACKs */
+ /* ISOCHRONOUS OUT transfers don't have any ACKs */
td->state = DWC_CHAN_ST_TX_WAIT_ISOC;
td->hcsplt &= ~HCSPLT_COMPSPLT;
if (td->hcsplt != 0) {
@@ -1799,123 +1823,110 @@ send_isoc_pkt:
/* Update transaction position */
td->hcsplt &= ~HCSPLT_XACTPOS_MASK;
td->hcsplt |= (HCSPLT_XACTPOS_ALL << HCSPLT_XACTPOS_SHIFT);
- } else {
- /* send one packet at a time */
- count = td->max_packet_size;
- if (td->remainder < count) {
- /* we have a short packet */
- td->short_pkt = 1;
- count = td->remainder;
- }
}
} else if (td->hcsplt != 0) {
-
td->hcsplt &= ~HCSPLT_COMPSPLT;
-
/* Wait for ACK/NAK/ERR from TT */
td->state = DWC_CHAN_ST_WAIT_S_ANE;
-
- /* send one packet at a time */
- count = td->max_packet_size;
- if (td->remainder < count) {
- /* we have a short packet */
- td->short_pkt = 1;
- count = td->remainder;
- }
} else {
/* Wait for ACK/NAK/STALL from device */
td->state = DWC_CHAN_ST_WAIT_ANE;
+ }
+
+ td->tx_bytes = 0;
+
+ for (x = 0; x != td->max_packet_count; x++) {
+ uint32_t rem_bytes;
+
+ channel = td->channel[x];
/* send one packet at a time */
count = td->max_packet_size;
- if (td->remainder < count) {
+ rem_bytes = td->remainder - td->tx_bytes;
+ if (rem_bytes < count) {
/* we have a short packet */
td->short_pkt = 1;
- count = td->remainder;
- }
- }
-
- /* check for High-Speed multi-packets */
- if ((td->hcsplt == 0) && (td->max_packet_count > 1)) {
- if (td->npkt == 0) {
- if (td->remainder >= (3 * td->max_packet_size))
- td->npkt = 3;
- else if (td->remainder >= (2 * td->max_packet_size))
- td->npkt = 2;
- else
- td->npkt = 1;
-
- if (td->npkt > td->max_packet_count)
- td->npkt = td->max_packet_count;
-
- td->tt_xactpos = 1; /* overload */
+ count = rem_bytes;
}
- if (td->tt_xactpos == td->npkt) {
- if (td->npkt == 1) {
+ if (count == rem_bytes) {
+ /* last packet */
+ switch (x) {
+ case 0:
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
- (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT));
- } else if (td->npkt == 2) {
+ (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
+ (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
+ break;
+ case 1:
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT));
- } else {
+ break;
+ default:
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_DATA2 << HCTSIZ_PID_SHIFT));
+ break;
}
- td->npkt = 0;
- } else {
+ } else if (td->ep_type == UE_ISOCHRONOUS &&
+ td->max_packet_count > 1) {
+ /* ISOCHRONOUS multi packet */
DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
(count << HCTSIZ_XFERSIZE_SHIFT) |
(1 << HCTSIZ_PKTCNT_SHIFT) |
(HCTSIZ_PID_MDATA << HCTSIZ_PID_SHIFT));
+ } else {
+ /* TODO: HCTSIZ_DOPNG */
+ /* standard BULK/INTERRUPT/CONTROL packet */
+ DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
+ (count << HCTSIZ_XFERSIZE_SHIFT) |
+ (1 << HCTSIZ_PKTCNT_SHIFT) |
+ (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
+ (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
}
- } else {
- /* TODO: HCTSIZ_DOPNG */
- DWC_OTG_WRITE_4(sc, DOTG_HCTSIZ(channel),
- (count << HCTSIZ_XFERSIZE_SHIFT) |
- (1 << HCTSIZ_PKTCNT_SHIFT) |
- (td->toggle ? (HCTSIZ_PID_DATA1 << HCTSIZ_PID_SHIFT) :
- (HCTSIZ_PID_DATA0 << HCTSIZ_PID_SHIFT)));
- }
+ DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
- DWC_OTG_WRITE_4(sc, DOTG_HCSPLT(channel), td->hcsplt);
+ hcchar = td->hcchar;
+ hcchar &= ~HCCHAR_EPDIR_IN;
- hcchar = td->hcchar;
- hcchar &= ~HCCHAR_EPDIR_IN;
+ /* send after next SOF event */
+ if ((sc->sc_last_frame_num & 1) == 0 &&
+ td->ep_type == UE_ISOCHRONOUS)
+ hcchar |= HCCHAR_ODDFRM;
+ else
+ hcchar &= ~HCCHAR_ODDFRM;
- /* send after next SOF event */
- if ((sc->sc_last_frame_num & 1) == 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
- hcchar |= HCCHAR_ODDFRM;
- else
- hcchar &= ~HCCHAR_ODDFRM;
+ /* must enable before writing data to FIFO */
+ DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
- /* must enable before writing data to FIFO */
- DWC_OTG_WRITE_4(sc, DOTG_HCCHAR(channel), hcchar);
+ if (count != 0) {
+ /* clear topmost word before copy */
+ sc->sc_tx_bounce_buffer[(count - 1) / 4] = 0;
- if (count != 0) {
+ /* copy out data */
+ usbd_copy_out(td->pc, td->offset + td->tx_bytes,
+ sc->sc_tx_bounce_buffer, count);
- /* clear topmost word before copy */
- sc->sc_tx_bounce_buffer[(count - 1) / 4] = 0;
+ /* transfer data into FIFO */
+ bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl,
+ DOTG_DFIFO(channel),
+ sc->sc_tx_bounce_buffer, (count + 3) / 4);
+ }
- /* copy out data */
- usbd_copy_out(td->pc, td->offset,
- sc->sc_tx_bounce_buffer, count);
+ /* store number of bytes transmitted */
+ td->tx_bytes += count;
- /* transfer data into FIFO */
- bus_space_write_region_4(sc->sc_io_tag, sc->sc_io_hdl,
- DOTG_DFIFO(channel),
- sc->sc_tx_bounce_buffer, (count + 3) / 4);
+ /* store last packet index */
+ td->npkt = x;
+
+ /* check for last packet */
+ if (count == rem_bytes)
+ break;
}
-
- /* store number of bytes transmitted */
- td->tx_bytes = count;
goto busy;
send_cpkt:
@@ -1941,7 +1952,7 @@ send_cpkt:
goto busy;
}
- channel = td->channel;
+ channel = td->channel[0];
td->hcsplt |= HCSPLT_COMPSPLT;
td->state = DWC_CHAN_ST_WAIT_C_ANE;
@@ -1956,7 +1967,7 @@ send_cpkt:
/* receive complete split ASAP */
if ((sc->sc_last_frame_num & 1) != 0 &&
- (td->ep_type == UE_INTERRUPT || td->ep_type == UE_ISOCHRONOUS))
+ td->ep_type == UE_ISOCHRONOUS)
hcchar |= HCCHAR_ODDFRM;
else
hcchar &= ~HCCHAR_ODDFRM;
@@ -2383,9 +2394,6 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc)
if ((td->hcchar & HCCHAR_EPDIR_IN) != 0)
continue;
- /* execute more frames */
- td->tmr_val = 0;
-
sc->sc_needsof = 1;
if (td->hcsplt == 0 || td->tt_scheduled != 0)
@@ -2417,9 +2425,6 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc)
if ((td->hcchar & HCCHAR_EPDIR_IN) == 0)
continue;
- /* execute more frames */
- td->tmr_val = 0;
-
sc->sc_needsof = 1;
if (td->hcsplt == 0 || td->tt_scheduled != 0)
@@ -2513,10 +2518,10 @@ dwc_otg_update_host_transfer_schedule_locked(struct dwc_otg_softc *sc)
TAILQ_CONCAT(&head, &sc->sc_bus.intr_q.head, wait_entry);
TAILQ_CONCAT(&sc->sc_bus.intr_q.head, &head, wait_entry);
- /* put non-TT BULK transfers last */
+ /* put non-TT non-ISOCHRONOUS transfers last */
TAILQ_FOREACH_SAFE(xfer, &sc->sc_bus.intr_q.head, wait_entry, xfer_next) {
td = xfer->td_transfer_cache;
- if (td == NULL || td->hcsplt != 0 || td->ep_type != UE_BULK)
+ if (td == NULL || td->hcsplt != 0 || td->ep_type == UE_ISOCHRONOUS)
continue;
TAILQ_REMOVE(&sc->sc_bus.intr_q.head, xfer, wait_entry);
TAILQ_INSERT_TAIL(&head, xfer, wait_entry);
@@ -2551,11 +2556,19 @@ static void
dwc_otg_interrupt_poll_locked(struct dwc_otg_softc *sc)
{
struct usb_xfer *xfer;
- uint32_t count = 0;
+ uint32_t count;
uint32_t temp;
uint8_t got_rx_status;
uint8_t x;
+ if (sc->sc_flags.status_device_mode == 0) {
+ /*
+ * Update host transfer schedule, so that new
+ * transfers can be issued:
+ */
+ dwc_otg_update_host_transfer_schedule_locked(sc);
+ }
+ count = 0;
repeat:
if (++count == 16) {
/* give other interrupts a chance */
@@ -2659,12 +2672,6 @@ repeat:
sc->sc_irq_mask &= ~GINTMSK_RXFLVLMSK;
DWC_OTG_WRITE_4(sc, DOTG_GINTMSK, sc->sc_irq_mask);
}
-
- if (sc->sc_flags.status_device_mode == 0 && sc->sc_xfer_complete == 0) {
- /* update host transfer schedule, so that new transfers can be issued */
- if (dwc_otg_update_host_transfer_schedule_locked(sc))
- goto repeat;
- }
}
static void
@@ -2944,12 +2951,6 @@ dwc_otg_interrupt(void *arg)
/* complete FIFOs, if any */
dwc_otg_interrupt_complete_locked(sc);
-
- if (sc->sc_flags.status_device_mode == 0) {
- /* update host transfer schedule, so that new transfers can be issued */
- if (dwc_otg_update_host_transfer_schedule_locked(sc))
- dwc_otg_interrupt_poll_locked(sc);
- }
}
USB_BUS_SPIN_UNLOCK(&sc->sc_bus);
USB_BUS_UNLOCK(&sc->sc_bus);
@@ -2982,7 +2983,9 @@ dwc_otg_setup_standard_chain_sub(struct dwc_otg_std_temp *temp)
td->set_toggle = 0;
td->got_short = 0;
td->did_nak = 0;
- td->channel = DWC_OTG_MAX_CHANNELS;
+ td->channel[0] = DWC_OTG_MAX_CHANNELS;
+ td->channel[1] = DWC_OTG_MAX_CHANNELS;
+ td->channel[2] = DWC_OTG_MAX_CHANNELS;
td->state = 0;
td->errcnt = 0;
td->tt_scheduled = 0;
@@ -3247,8 +3250,10 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer)
td->tmr_val = sc->sc_tmr_val + ival;
td->tmr_res = ival;
} else if (td->ep_type == UE_ISOCHRONOUS) {
- td->tmr_val = 0;
td->tmr_res = 1;
+ td->tmr_val = sc->sc_last_frame_num;
+ if (td->hcchar & HCCHAR_EPDIR_IN)
+ td->tmr_val++;
} else {
td->tmr_val = 0;
td->tmr_res = (uint8_t)sc->sc_last_frame_num;
@@ -3258,10 +3263,8 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer)
hcsplt = 0;
if (td->ep_type == UE_INTERRUPT) {
uint32_t ival;
-#if 0
hcchar |= ((xfer->max_packet_count & 3)
<< HCCHAR_MC_SHIFT);
-#endif
ival = xfer->interval / DWC_OTG_HOST_TIMER_RATE;
if (ival == 0)
ival = 1;
@@ -3272,8 +3275,11 @@ dwc_otg_setup_standard_chain(struct usb_xfer *xfer)
} else if (td->ep_type == UE_ISOCHRONOUS) {
hcchar |= ((xfer->max_packet_count & 3)
<< HCCHAR_MC_SHIFT);
- td->tmr_val = 0;
td->tmr_res = 1 << usbd_xfer_get_fps_shift(xfer);
+ td->tmr_val = sc->sc_last_frame_num;
+ if (td->hcchar & HCCHAR_EPDIR_IN)
+ td->tmr_val += td->tmr_res;
+
} else {
td->tmr_val = 0;
td->tmr_res = (uint8_t)sc->sc_last_frame_num;
@@ -3330,6 +3336,19 @@ dwc_otg_start_standard_chain(struct usb_xfer *xfer)
dwc_otg_xfer_do_fifo(sc, xfer);
if (dwc_otg_xfer_do_complete_locked(sc, xfer))
goto done;
+ } else {
+ struct dwc_otg_td *td = xfer->td_transfer_cache;
+ if (td->ep_type == UE_ISOCHRONOUS &&
+ (td->hcchar & HCCHAR_EPDIR_IN) == 0) {
+ /*
+ * Need to start ISOCHRONOUS OUT transfer ASAP
+ * because execution is delayed by one 125us
+ * microframe:
+ */
+ dwc_otg_xfer_do_fifo(sc, xfer);
+ if (dwc_otg_xfer_do_complete_locked(sc, xfer))
+ goto done;
+ }
}
/* put transfer on interrupt queue */
@@ -3950,11 +3969,6 @@ dwc_otg_do_poll(struct usb_bus *bus)
USB_BUS_SPIN_LOCK(&sc->sc_bus);
dwc_otg_interrupt_poll_locked(sc);
dwc_otg_interrupt_complete_locked(sc);
- if (sc->sc_flags.status_device_mode == 0) {
- /* update host transfer schedule, so that new transfers can be issued */
- if (dwc_otg_update_host_transfer_schedule_locked(sc))
- dwc_otg_interrupt_poll_locked(sc);
- }
USB_BUS_SPIN_UNLOCK(&sc->sc_bus);
USB_BUS_UNLOCK(&sc->sc_bus);
}
@@ -4728,6 +4742,9 @@ dwc_otg_xfer_setup(struct usb_setup_params *parm)
/* init TD */
td->max_packet_size = xfer->max_packet_size;
td->max_packet_count = xfer->max_packet_count;
+ /* range check */
+ if (td->max_packet_count == 0 || td->max_packet_count > 3)
+ td->max_packet_count = 1;
td->ep_no = ep_no;
td->ep_type = ep_type;
td->obj_next = last_obj;
@@ -4766,12 +4783,13 @@ dwc_otg_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc,
return;
}
} else {
- if (udev->speed == USB_SPEED_HIGH) {
- if ((UGETW(edesc->wMaxPacketSize) >> 11) & 3) {
- /* high bandwidth endpoint - not tested */
- DPRINTF("High Bandwidth Endpoint - not tested\n");
- return;
- }
+ if (udev->speed == USB_SPEED_HIGH &&
+ (edesc->wMaxPacketSize[1] & 0x18) != 0 &&
+ (edesc->bmAttributes & UE_XFERTYPE) != UE_ISOCHRONOUS) {
+ /* not supported */
+ DPRINTFN(-1, "Non-isochronous high bandwidth "
+ "endpoint not supported\n");
+ return;
}
}
if ((edesc->bmAttributes & UE_XFERTYPE) == UE_ISOCHRONOUS)
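A worked example of the new high-bandwidth test: in the USB 2.0 wMaxPacketSize layout, bits 12:11 encode the number of additional transactions per microframe, and (wMaxPacketSize[1] & 0x18) selects exactly those bits. For wMaxPacketSize = 0x1400, wMaxPacketSize[1] = 0x14 and 0x14 & 0x18 = 0x10, i.e. two additional transactions: three packets of 0x400 bytes per microframe, matching the 1..3 range enforced on td->max_packet_count in dwc_otg_xfer_setup() above.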
diff --git a/sys/dev/usb/controller/dwc_otg.h b/sys/dev/usb/controller/dwc_otg.h
index 39c9529..f5e9887 100644
--- a/sys/dev/usb/controller/dwc_otg.h
+++ b/sys/dev/usb/controller/dwc_otg.h
@@ -69,7 +69,7 @@ struct dwc_otg_td {
uint8_t tmr_val;
uint8_t ep_no;
uint8_t ep_type;
- uint8_t channel;
+ uint8_t channel[3];
uint8_t tt_index; /* TT data */
uint8_t tt_start_slot; /* TT data */
uint8_t tt_complete_slot; /* TT data */
@@ -80,8 +80,7 @@ struct dwc_otg_td {
#define DWC_CHAN_ST_WAIT_S_ANE 2
#define DWC_CHAN_ST_WAIT_C_ANE 3
#define DWC_CHAN_ST_WAIT_C_PKT 4
-#define DWC_CHAN_ST_TX_PKT_ISOC 5
-#define DWC_CHAN_ST_TX_WAIT_ISOC 6
+#define DWC_CHAN_ST_TX_WAIT_ISOC 5
uint8_t error_any:1;
uint8_t error_stall:1;
uint8_t alt_next:1;
diff --git a/sys/dev/usb/controller/usb_controller.c b/sys/dev/usb/controller/usb_controller.c
index 92ea6c5..9f7ce24 100644
--- a/sys/dev/usb/controller/usb_controller.c
+++ b/sys/dev/usb/controller/usb_controller.c
@@ -231,7 +231,8 @@ usb_detach(device_t dev)
/* Get rid of USB callback processes */
usb_proc_free(USB_BUS_GIANT_PROC(bus));
- usb_proc_free(USB_BUS_NON_GIANT_PROC(bus));
+ usb_proc_free(USB_BUS_NON_GIANT_ISOC_PROC(bus));
+ usb_proc_free(USB_BUS_NON_GIANT_BULK_PROC(bus));
/* Get rid of USB explore process */
@@ -395,7 +396,8 @@ usb_bus_explore(struct usb_proc_msg *pm)
*/
usb_proc_rewakeup(USB_BUS_CONTROL_XFER_PROC(bus));
usb_proc_rewakeup(USB_BUS_GIANT_PROC(bus));
- usb_proc_rewakeup(USB_BUS_NON_GIANT_PROC(bus));
+ usb_proc_rewakeup(USB_BUS_NON_GIANT_ISOC_PROC(bus));
+ usb_proc_rewakeup(USB_BUS_NON_GIANT_BULK_PROC(bus));
#endif
USB_BUS_UNLOCK(bus);
@@ -860,9 +862,13 @@ usb_attach_sub(device_t dev, struct usb_bus *bus)
&bus->bus_mtx, device_get_nameunit(dev), USB_PRI_MED)) {
device_printf(dev, "WARNING: Creation of USB Giant "
"callback process failed.\n");
- } else if (usb_proc_create(USB_BUS_NON_GIANT_PROC(bus),
+ } else if (usb_proc_create(USB_BUS_NON_GIANT_ISOC_PROC(bus),
+ &bus->bus_mtx, device_get_nameunit(dev), USB_PRI_HIGHEST)) {
+ device_printf(dev, "WARNING: Creation of USB non-Giant ISOC "
+ "callback process failed.\n");
+ } else if (usb_proc_create(USB_BUS_NON_GIANT_BULK_PROC(bus),
&bus->bus_mtx, device_get_nameunit(dev), USB_PRI_HIGH)) {
- device_printf(dev, "WARNING: Creation of USB non-Giant "
+ device_printf(dev, "WARNING: Creation of USB non-Giant BULK "
"callback process failed.\n");
} else if (usb_proc_create(USB_BUS_EXPLORE_PROC(bus),
&bus->bus_mtx, device_get_nameunit(dev), USB_PRI_MED)) {
diff --git a/sys/dev/usb/usb_bus.h b/sys/dev/usb/usb_bus.h
index bdd1681..3ceeb1e 100644
--- a/sys/dev/usb/usb_bus.h
+++ b/sys/dev/usb/usb_bus.h
@@ -57,19 +57,26 @@ struct usb_bus {
struct root_hold_token *bus_roothold;
#endif
+/* convenience macros */
+#define USB_BUS_TT_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus)
+#define USB_BUS_CS_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus)
+
#if USB_HAVE_PER_BUS_PROCESS
#define USB_BUS_GIANT_PROC(bus) (&(bus)->giant_callback_proc)
-#define USB_BUS_NON_GIANT_PROC(bus) (&(bus)->non_giant_callback_proc)
+#define USB_BUS_NON_GIANT_ISOC_PROC(bus) (&(bus)->non_giant_isoc_callback_proc)
+#define USB_BUS_NON_GIANT_BULK_PROC(bus) (&(bus)->non_giant_bulk_callback_proc)
#define USB_BUS_EXPLORE_PROC(bus) (&(bus)->explore_proc)
#define USB_BUS_CONTROL_XFER_PROC(bus) (&(bus)->control_xfer_proc)
-
/*
- * There are two callback processes. One for Giant locked
- * callbacks. One for non-Giant locked callbacks. This should
- * avoid congestion and reduce response time in most cases.
+ * There are three callback processes. One for Giant locked
+ * callbacks. One for non-Giant locked non-periodic callbacks
+ * and one for non-Giant locked periodic callbacks. This
+ * should avoid congestion and reduce response time in most
+ * cases.
*/
struct usb_process giant_callback_proc;
- struct usb_process non_giant_callback_proc;
+ struct usb_process non_giant_isoc_callback_proc;
+ struct usb_process non_giant_bulk_callback_proc;
/* Explore process */
struct usb_process explore_proc;
diff --git a/sys/dev/usb/usb_device.c b/sys/dev/usb/usb_device.c
index 5ffc07f..13e2c14 100644
--- a/sys/dev/usb/usb_device.c
+++ b/sys/dev/usb/usb_device.c
@@ -2181,7 +2181,7 @@ usb_free_device(struct usb_device *udev, uint8_t flag)
* anywhere:
*/
USB_BUS_LOCK(udev->bus);
- usb_proc_mwait(USB_BUS_NON_GIANT_PROC(udev->bus),
+ usb_proc_mwait(USB_BUS_CS_PROC(udev->bus),
&udev->cs_msg[0], &udev->cs_msg[1]);
USB_BUS_UNLOCK(udev->bus);
diff --git a/sys/dev/usb/usb_hub.c b/sys/dev/usb/usb_hub.c
index 2f1459c..a54fa2e 100644
--- a/sys/dev/usb/usb_hub.c
+++ b/sys/dev/usb/usb_hub.c
@@ -346,7 +346,7 @@ uhub_tt_buffer_reset_async_locked(struct usb_device *child, struct usb_endpoint
}
up->req_reset_tt = req;
/* get reset transfer started */
- usb_proc_msignal(USB_BUS_NON_GIANT_PROC(udev->bus),
+ usb_proc_msignal(USB_BUS_TT_PROC(udev->bus),
&hub->tt_msg[0], &hub->tt_msg[1]);
}
#endif
@@ -1579,7 +1579,7 @@ uhub_detach(device_t dev)
#if USB_HAVE_TT_SUPPORT
/* Make sure our TT messages are not queued anywhere */
USB_BUS_LOCK(bus);
- usb_proc_mwait(USB_BUS_NON_GIANT_PROC(bus),
+ usb_proc_mwait(USB_BUS_TT_PROC(bus),
&hub->tt_msg[0], &hub->tt_msg[1]);
USB_BUS_UNLOCK(bus);
#endif
diff --git a/sys/dev/usb/usb_pf.c b/sys/dev/usb/usb_pf.c
index 468eafb..82ad8e4 100644
--- a/sys/dev/usb/usb_pf.c
+++ b/sys/dev/usb/usb_pf.c
@@ -221,7 +221,13 @@ usbpf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
ubus = ifp->if_softc;
unit = ifp->if_dunit;
+ /*
+ * Lock USB before clearing the "ifp" pointer, to avoid
+ * clearing the pointer in the middle of a TAP operation:
+ */
+ USB_BUS_LOCK(ubus);
ubus->ifp = NULL;
+ USB_BUS_UNLOCK(ubus);
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
diff --git a/sys/dev/usb/usb_process.h b/sys/dev/usb/usb_process.h
index c12cdc4..dd20afd 100644
--- a/sys/dev/usb/usb_process.h
+++ b/sys/dev/usb/usb_process.h
@@ -34,6 +34,7 @@
#endif
/* defines */
+#define USB_PRI_HIGHEST PI_SWI(SWI_TTY)
#define USB_PRI_HIGH PI_SWI(SWI_NET)
#define USB_PRI_MED PI_SWI(SWI_CAMBIO)
diff --git a/sys/dev/usb/usb_transfer.c b/sys/dev/usb/usb_transfer.c
index 5650790..783a96c 100644
--- a/sys/dev/usb/usb_transfer.c
+++ b/sys/dev/usb/usb_transfer.c
@@ -872,6 +872,19 @@ done:
}
}
+static uint8_t
+usbd_transfer_setup_has_bulk(const struct usb_config *setup_start,
+ uint16_t n_setup)
+{
+ while (n_setup--) {
+ uint8_t type = setup_start[n_setup].type;
+ if (type == UE_BULK || type == UE_BULK_INTR ||
+ type == UE_TYPE_ANY)
+ return (1);
+ }
+ return (0);
+}
+
/*------------------------------------------------------------------------*
* usbd_transfer_setup - setup an array of USB transfers
*
@@ -1013,9 +1026,12 @@ usbd_transfer_setup(struct usb_device *udev,
else if (xfer_mtx == &Giant)
info->done_p =
USB_BUS_GIANT_PROC(udev->bus);
+ else if (usbd_transfer_setup_has_bulk(setup_start, n_setup))
+ info->done_p =
+ USB_BUS_NON_GIANT_BULK_PROC(udev->bus);
else
info->done_p =
- USB_BUS_NON_GIANT_PROC(udev->bus);
+ USB_BUS_NON_GIANT_ISOC_PROC(udev->bus);
}
/* reset sizes */
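
[Illustrative sketch, not part of the patch.] The hunk above picks the
completion ("done") process for a whole transfer array: if any entry is
bulk-like (UE_BULK, UE_BULK_INTR, or the UE_TYPE_ANY wildcard), callbacks
run in the new non-Giant BULK process at USB_PRI_HIGH; purely periodic
arrays keep the ISOC process at USB_PRI_HIGHEST. Assuming a hypothetical
driver configuration:

	/* "example_config" is made up; the macros are from this patch. */
	static const struct usb_config example_config[2] = {
		[0] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY,
			.direction = UE_DIR_IN },
		[1] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY,
			.direction = UE_DIR_OUT },
	};
	/* usbd_transfer_setup_has_bulk(example_config, 2) returns 1, so
	 * info->done_p = USB_BUS_NON_GIANT_BULK_PROC(udev->bus). An
	 * array with only isochronous/interrupt entries would instead
	 * be completed in the USB_PRI_HIGHEST ISOC process. */
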
@@ -2280,10 +2296,8 @@ usbd_callback_ss_done_defer(struct usb_xfer *xfer)
* will have a Lock Order Reversal, LOR, if we try to
* proceed !
*/
- if (usb_proc_msignal(info->done_p,
- &info->done_m[0], &info->done_m[1])) {
- /* ignore */
- }
+ (void) usb_proc_msignal(info->done_p,
+ &info->done_m[0], &info->done_m[1]);
} else {
/* clear second recurse flag */
pq->recurse_2 = 0;
@@ -2307,23 +2321,26 @@ usbd_callback_wrapper(struct usb_xfer_queue *pq)
struct usb_xfer_root *info = xfer->xroot;
USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED);
- if (!mtx_owned(info->xfer_mtx) && !SCHEDULER_STOPPED()) {
+ if ((pq->recurse_3 != 0 || mtx_owned(info->xfer_mtx) == 0) &&
+ SCHEDULER_STOPPED() == 0) {
/*
* Cases that end up here:
*
* 5) HW interrupt done callback or other source.
+ * 6) HW completed transfer during callback
*/
- DPRINTFN(3, "case 5\n");
+ DPRINTFN(3, "case 5 and 6\n");
/*
* We have to postpone the callback due to the fact we
* will have a Lock Order Reversal, LOR, if we try to
- * proceed !
+ * proceed!
+ *
+ * Postponing the callback also ensures that other USB
+ * transfer queues get a chance.
*/
- if (usb_proc_msignal(info->done_p,
- &info->done_m[0], &info->done_m[1])) {
- /* ignore */
- }
+ (void) usb_proc_msignal(info->done_p,
+ &info->done_m[0], &info->done_m[1]);
return;
}
/*
@@ -2381,8 +2398,11 @@ usbd_callback_wrapper(struct usb_xfer_queue *pq)
}
#if USB_HAVE_PF
- if (xfer->usb_state != USB_ST_SETUP)
+ if (xfer->usb_state != USB_ST_SETUP) {
+ USB_BUS_LOCK(info->bus);
usbpf_xfertap(xfer, USBPF_XFERTAP_DONE);
+ USB_BUS_UNLOCK(info->bus);
+ }
#endif
/* call processing routine */
(xfer->callback) (xfer, xfer->error);
@@ -2694,7 +2714,7 @@ usbd_pipe_start(struct usb_xfer_queue *pq)
} else if (udev->ctrl_xfer[1]) {
info = udev->ctrl_xfer[1]->xroot;
usb_proc_msignal(
- USB_BUS_NON_GIANT_PROC(info->bus),
+ USB_BUS_CS_PROC(info->bus),
&udev->cs_msg[0], &udev->cs_msg[1]);
} else {
/* should not happen */
@@ -3019,9 +3039,11 @@ usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer)
if (!pq->recurse_1) {
- do {
+ /* clear third recurse flag */
+ pq->recurse_3 = 0;
- /* set both recurse flags */
+ do {
+ /* set two first recurse flags */
pq->recurse_1 = 1;
pq->recurse_2 = 1;
@@ -3040,6 +3062,12 @@ usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer)
(pq->command) (pq);
DPRINTFN(6, "cb %p (leave)\n", pq->curr);
+ /*
+ * Set third recurse flag to indicate
+ * recursion happened:
+ */
+ pq->recurse_3 = 1;
+
} while (!pq->recurse_2);
/* clear first recurse flag */
@@ -3315,7 +3343,8 @@ usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max)
USB_BUS_CONTROL_XFER_PROC(udev->bus)->up_msleep = 0;
USB_BUS_EXPLORE_PROC(udev->bus)->up_msleep = 0;
USB_BUS_GIANT_PROC(udev->bus)->up_msleep = 0;
- USB_BUS_NON_GIANT_PROC(udev->bus)->up_msleep = 0;
+ USB_BUS_NON_GIANT_ISOC_PROC(udev->bus)->up_msleep = 0;
+ USB_BUS_NON_GIANT_BULK_PROC(udev->bus)->up_msleep = 0;
/* poll USB hardware */
(udev->bus->methods->xfer_poll) (udev->bus);
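
A note on the new recurse_3 bit: usb_command_wrapper() clears it on the
outermost entry and sets it after every queued command runs, so
usbd_callback_wrapper() can detect that the hardware completed another
transfer while a callback was still executing and postpone that
completion to the done process instead of nesting it. Paraphrased
control flow (not verbatim from the patch):

	if (!pq->recurse_1) {			/* outermost entry */
		pq->recurse_3 = 0;		/* no recursion seen yet */
		do {
			pq->recurse_1 = 1;
			pq->recurse_2 = 1;
			(pq->command)(pq);	/* may complete more xfers */
			pq->recurse_3 = 1;	/* a command ran */
		} while (!pq->recurse_2);	/* cleared => go again */
		pq->recurse_1 = 0;
	}
	/* usbd_callback_wrapper() then defers via usb_proc_msignal()
	 * when (pq->recurse_3 != 0 || !mtx_owned(info->xfer_mtx)). */
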
diff --git a/sys/dev/usb/usbdi.h b/sys/dev/usb/usbdi.h
index 09b0ca7..ecd5a81 100644
--- a/sys/dev/usb/usbdi.h
+++ b/sys/dev/usb/usbdi.h
@@ -128,6 +128,8 @@ struct usb_xfer_queue {
void (*command) (struct usb_xfer_queue *pq);
uint8_t recurse_1:1;
uint8_t recurse_2:1;
+ uint8_t recurse_3:1;
+ uint8_t reserved:5;
};
/*
diff --git a/sys/dev/vt/hw/efifb/efifb.c b/sys/dev/vt/hw/efifb/efifb.c
index ec029c8..4184f77 100644
--- a/sys/dev/vt/hw/efifb/efifb.c
+++ b/sys/dev/vt/hw/efifb/efifb.c
@@ -96,7 +96,6 @@ vt_efifb_probe(struct vt_device *vd)
static int
vt_efifb_init(struct vt_device *vd)
{
- int depth, d;
struct fb_info *info;
struct efi_fb *efifb;
caddr_t kmdp;
@@ -116,16 +115,13 @@ vt_efifb_init(struct vt_device *vd)
info->fb_height = efifb->fb_height;
info->fb_width = efifb->fb_width;
- depth = fls(efifb->fb_mask_red);
- d = fls(efifb->fb_mask_green);
- depth = d > depth ? d : depth;
- d = fls(efifb->fb_mask_blue);
- depth = d > depth ? d : depth;
- d = fls(efifb->fb_mask_reserved);
- depth = d > depth ? d : depth;
- info->fb_depth = depth;
+ info->fb_depth = fls(efifb->fb_mask_red | efifb->fb_mask_green |
+ efifb->fb_mask_blue | efifb->fb_mask_reserved);
+ /* Round to a multiple of the bits in a byte. */
+ info->fb_bpp = (info->fb_depth + NBBY - 1) & ~(NBBY - 1);
- info->fb_stride = efifb->fb_stride * (depth / 8);
+ /* Stride in bytes, not pixels */
+ info->fb_stride = efifb->fb_stride * (info->fb_bpp / NBBY);
vt_generate_cons_palette(info->fb_cmap, COLOR_FORMAT_RGB,
efifb->fb_mask_red, ffs(efifb->fb_mask_red) - 1,
@@ -137,16 +133,6 @@ vt_efifb_init(struct vt_device *vd)
info->fb_vbase = (intptr_t)pmap_mapdev_attr(info->fb_pbase,
info->fb_size, VM_MEMATTR_WRITE_COMBINING);
- /* Get pixel storage size. */
- info->fb_bpp = info->fb_stride / info->fb_width * 8;
-
- /*
- * Early FB driver work with static window buffer, so reduce to minimal
- * size, buffer or screen.
- */
- info->fb_width = MIN(info->fb_width, VT_FB_DEFAULT_WIDTH);
- info->fb_height = MIN(info->fb_height, VT_FB_DEFAULT_HEIGHT);
-
vt_fb_init(vd);
return (CN_INTERNAL);
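
Worked example (mask values assumed, not from the patch): for a typical
8:8:8:8 EFI mode the masks are 0x00ff0000 (red), 0x0000ff00 (green),
0x000000ff (blue) and 0xff000000 (reserved); OR-ing them gives
0xffffffff, fls() returns 32, so fb_depth = 32, fb_bpp rounds to 32 and
the stride becomes fb_stride * 4 bytes. A 5:6:5 mode ORs to 0x0000ffff,
giving depth 16, bpp 16 and 2 bytes per pixel:

	uint32_t all = mask_red | mask_green | mask_blue | mask_reserved;
	info->fb_depth = fls(all);			/* e.g. 32 */
	info->fb_bpp = (info->fb_depth + NBBY - 1) & ~(NBBY - 1);
	info->fb_stride = efifb->fb_stride * (info->fb_bpp / NBBY);
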
diff --git a/sys/dev/vt/hw/vga/vt_vga.c b/sys/dev/vt/hw/vga/vt_vga.c
index 0b7ebe4..4661f35 100644
--- a/sys/dev/vt/hw/vga/vt_vga.c
+++ b/sys/dev/vt/hw/vga/vt_vga.c
@@ -883,9 +883,9 @@ vga_bitblt_text_txtmode(struct vt_device *vd, const struct vt_window *vw,
/* Convert colors to VGA attributes. */
attr = bg << 4 | fg;
- MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 0,
+ MEM_WRITE1(sc, (row * 80 + col) * 2 + 0,
ch);
- MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 1,
+ MEM_WRITE1(sc, (row * 80 + col) * 2 + 1,
attr);
}
}
@@ -1226,8 +1226,6 @@ vga_init(struct vt_device *vd)
# error "Architecture not yet supported!"
#endif
- bus_space_map(sc->vga_fb_tag, VGA_MEM_BASE, VGA_MEM_SIZE, 0,
- &sc->vga_fb_handle);
bus_space_map(sc->vga_reg_tag, VGA_REG_BASE, VGA_REG_SIZE, 0,
&sc->vga_reg_handle);
@@ -1236,9 +1234,13 @@ vga_init(struct vt_device *vd)
vd->vd_flags |= VDF_TEXTMODE;
vd->vd_width = 80;
vd->vd_height = 25;
+ bus_space_map(sc->vga_fb_tag, VGA_TXT_BASE, VGA_TXT_SIZE, 0,
+ &sc->vga_fb_handle);
} else {
vd->vd_width = VT_VGA_WIDTH;
vd->vd_height = VT_VGA_HEIGHT;
+ bus_space_map(sc->vga_fb_tag, VGA_MEM_BASE, VGA_MEM_SIZE, 0,
+ &sc->vga_fb_handle);
}
if (vga_initialize(vd, textmode) != 0)
return (CN_DEAD);
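
With the text framebuffer now mapped at VGA_TXT_BASE (0xB8000), a
character cell is addressed relative to the start of that mapping
rather than at offset 0x18000 into the 0xA0000 graphics window. For
80x25 text mode the cell for (row, col) lives at:

	offset = (row * 80 + col) * 2;
	MEM_WRITE1(sc, offset + 0, ch);			/* code point */
	MEM_WRITE1(sc, offset + 1, bg << 4 | fg);	/* attribute */
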
diff --git a/sys/dev/vt/hw/vga/vt_vga_reg.h b/sys/dev/vt/hw/vga/vt_vga_reg.h
index 5bfb8ce..cf33a37 100644
--- a/sys/dev/vt/hw/vga/vt_vga_reg.h
+++ b/sys/dev/vt/hw/vga/vt_vga_reg.h
@@ -49,6 +49,8 @@
#define VGA_MEM_BASE 0xA0000
#define VGA_MEM_SIZE 0x10000
+#define VGA_TXT_BASE 0xB8000
+#define VGA_TXT_SIZE 0x08000
#define VGA_REG_BASE 0x3c0
#define VGA_REG_SIZE 0x10+0x0c
diff --git a/sys/dev/vt/vt_core.c b/sys/dev/vt/vt_core.c
index 99da892..702df42 100644
--- a/sys/dev/vt/vt_core.c
+++ b/sys/dev/vt/vt_core.c
@@ -264,8 +264,9 @@ vt_update_static(void *dummy)
if (!vty_enabled(VTY_VT))
return;
if (main_vd->vd_driver != NULL)
- printf("VT: running with driver \"%s\".\n",
- main_vd->vd_driver->vd_name);
+ printf("VT(%s): %s %ux%u\n", main_vd->vd_driver->vd_name,
+ (main_vd->vd_flags & VDF_TEXTMODE) ? "text" : "resolution",
+ main_vd->vd_width, main_vd->vd_height);
else
printf("VT: init without driver.\n");
diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index 2f972b8..302c017 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -280,8 +280,6 @@ struct netfront_info {
struct callout xn_stat_ch;
u_long rx_pfn_array[NET_RX_RING_SIZE];
- multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
- mmu_update_t rx_mmu[NET_RX_RING_SIZE];
struct ifmedia sc_media;
bool xn_resume;
@@ -882,13 +880,6 @@ refill:
gnttab_grant_foreign_transfer_ref(ref,
otherend_id, pfn);
sc->rx_pfn_array[nr_flips] = pfn;
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* Remove this page before passing
- * back to Xen.
- */
- MULTI_update_va_mapping(&sc->rx_mcl[i],
- vaddr, 0, 0);
- }
nr_flips++;
} else {
gnttab_grant_foreign_access_ref(ref,
@@ -918,25 +909,6 @@ refill:
reservation.extent_order = 0;
reservation.address_bits = 0;
reservation.domid = DOMID_SELF;
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /* After all PTEs have been zapped, flush the TLB. */
- sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
- UVMF_TLB_FLUSH|UVMF_ALL;
-
- /* Give away a batch of pages. */
- sc->rx_mcl[i].op = __HYPERVISOR_memory_op;
- sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
- sc->rx_mcl[i].args[1] = (u_long)&reservation;
- /* Zap PTEs and give away pages in one big multicall. */
- (void)HYPERVISOR_multicall(sc->rx_mcl, i+1);
-
- if (__predict_false(sc->rx_mcl[i].result != i ||
- HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation) != i))
- panic("%s: unable to reduce memory "
- "reservation\n", __func__);
- }
} else {
wmb();
}
@@ -961,7 +933,6 @@ xn_rxeof(struct netfront_info *np)
struct netif_rx_response *rx = &rinfo.rx;
struct netif_extra_info *extras = rinfo.extras;
RING_IDX i, rp;
- multicall_entry_t *mcl;
struct mbuf *m;
struct mbufq rxq, errq;
int err, pages_flipped = 0, work_to_do;
@@ -1022,19 +993,6 @@ xn_rxeof(struct netfront_info *np)
#ifdef notyet
balloon_update_driver_allowance(-pages_flipped);
#endif
- /* Do all the remapping work, and M->P updates, in one big
- * hypercall.
- */
- if (!!xen_feature(XENFEAT_auto_translated_physmap)) {
- mcl = np->rx_mcl + pages_flipped;
- mcl->op = __HYPERVISOR_mmu_update;
- mcl->args[0] = (u_long)np->rx_mmu;
- mcl->args[1] = pages_flipped;
- mcl->args[2] = 0;
- mcl->args[3] = DOMID_SELF;
- (void)HYPERVISOR_multicall(np->rx_mcl,
- pages_flipped + 1);
- }
}
mbufq_drain(&errq);
@@ -1273,8 +1231,6 @@ xennet_get_responses(struct netfront_info *np,
int *pages_flipped_p)
{
int pages_flipped = *pages_flipped_p;
- struct mmu_update *mmu;
- struct multicall_entry *mcl;
struct netif_rx_response *rx = &rinfo->rx;
struct netif_extra_info *extras = rinfo->extras;
struct mbuf *m, *m0, *m_prev;
@@ -1346,22 +1302,6 @@ xennet_get_responses(struct netfront_info *np,
goto next;
}
- if (!xen_feature( XENFEAT_auto_translated_physmap)) {
- /* Remap the page. */
- void *vaddr = mtod(m, void *);
- uint32_t pfn;
-
- mcl = np->rx_mcl + pages_flipped;
- mmu = np->rx_mmu + pages_flipped;
-
- MULTI_update_va_mapping(mcl, (u_long)vaddr,
- (((vm_paddr_t)mfn) << PAGE_SHIFT) | PG_RW |
- PG_V | PG_M | PG_A, 0);
- pfn = (uintptr_t)m->m_ext.ext_arg1;
- mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) |
- MMU_MACHPHYS_UPDATE;
- mmu->val = pfn;
- }
pages_flipped++;
} else {
ret = gnttab_end_foreign_access_ref(ref);
diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c
index d1ade4a..c0e05b9 100644
--- a/sys/fs/nfsserver/nfs_nfsdstate.c
+++ b/sys/fs/nfsserver/nfs_nfsdstate.c
@@ -401,9 +401,12 @@ nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp,
}
/* For NFSv4.1, mark that we found a confirmed clientid. */
- if ((nd->nd_flag & ND_NFSV41) != 0)
+ if ((nd->nd_flag & ND_NFSV41) != 0) {
+ clientidp->lval[0] = clp->lc_clientid.lval[0];
+ clientidp->lval[1] = clp->lc_clientid.lval[1];
+ confirmp->lval[0] = 0; /* Ignored by client */
confirmp->lval[1] = 1;
- else {
+ } else {
/*
* id and verifier match, so update the net address info
* and get rid of any existing callback authentication
diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh
index 1cbc32b..521c7a2 100644
--- a/sys/kern/genassym.sh
+++ b/sys/kern/genassym.sh
@@ -10,7 +10,7 @@ usage()
work()
{
- ${NM:='nm'} "$1" | ${AWK:='awk'} '
+ ${NM:='nm'} ${NMFLAGS} "$1" | ${AWK:='awk'} '
/ C .*sign$/ {
sign = substr($1, length($1) - 3, 4)
sub("^0*", "", sign)
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 3310d1d..d84c26f 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -981,6 +981,10 @@ proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
switch (idtype) {
case P_ALL:
+ if (p->p_procdesc != NULL) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
break;
case P_PID:
if (p->p_pid != (pid_t)id) {
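
This makes wildcard waits skip children that have a process descriptor;
such children are managed through the descriptor instead. A minimal
sketch, assuming the usual procdesc(4) interfaces and omitting error
handling:

	#include <sys/procdesc.h>
	#include <poll.h>
	#include <unistd.h>

	int pd;
	pid_t pid = pdfork(&pd, 0);
	if (pid == 0)
		_exit(0);			/* child */
	/* A wait(-1)/P_ALL in the parent no longer reaps this child;
	 * poll the descriptor for exit and close it to release it. */
	struct pollfd pfd = { .fd = pd, .events = POLLHUP };
	poll(&pfd, 1, INFTIM);
	close(pd);
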
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
index d09f0b6..432e38a 100644
--- a/sys/kern/kern_tc.c
+++ b/sys/kern/kern_tc.c
@@ -133,6 +133,8 @@ SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
sysctl_kern_timecounter_adjprecision, "I",
"Allowed time interval deviation in percents");
+static int tc_chosen; /* Non-zero if a specific tc was chosen via sysctl. */
+
static void tc_windup(void);
static void cpu_tick_calibrate(int);
@@ -1197,10 +1199,13 @@ tc_init(struct timecounter *tc)
"quality", CTLFLAG_RD, &(tc->tc_quality), 0,
"goodness of time counter");
/*
- * Never automatically use a timecounter with negative quality.
+ * Do not automatically switch if the current tc was specifically
+ * chosen. Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
- * worse since this timecounter may not be monotonous.
+ * worse since this timecounter may not be monotonic.
*/
+ if (tc_chosen)
+ return;
if (tc->tc_quality < 0)
return;
if (tc->tc_quality < timecounter->tc_quality)
@@ -1433,9 +1438,12 @@ sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
strlcpy(newname, tc->tc_name, sizeof(newname));
error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
- if (error != 0 || req->newptr == NULL ||
- strcmp(newname, tc->tc_name) == 0)
+ if (error != 0 || req->newptr == NULL)
return (error);
+ /* Record that the tc in use now was specifically chosen. */
+ tc_chosen = 1;
+ if (strcmp(newname, tc->tc_name) == 0)
+ return (0);
for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
if (strcmp(newname, newtc->tc_name) != 0)
continue;
@@ -1464,7 +1472,7 @@ SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
"Timecounter hardware selected");
-/* Report or change the active timecounter hardware. */
+/* Report the available timecounter hardware. */
static int
sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
{
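
Example of the new behaviour (counter names vary by machine): once an
operator writes the sysctl, tc_chosen is latched and a later tc_init()
of a higher-quality timecounter no longer displaces the choice;
re-writing the name already in use now also latches it and returns
success instead of falling through:

	# sysctl kern.timecounter.hardware=HPET
	kern.timecounter.hardware: TSC-low -> HPET
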
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index bd52356..21009a9 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -296,6 +296,9 @@ SUBDIR= \
${_qlxgbe} \
ral \
${_ralfw} \
+ ${_random_fortuna} \
+ ${_random_yarrow} \
+ ${_random_other} \
rc4 \
${_rdma} \
${_rdrand_rng} \
@@ -398,6 +401,9 @@ _autofs= autofs
.if exists(${.CURDIR}/../opencrypto)
_crypto= crypto
_cryptodev= cryptodev
+_random_fortuna=random_fortuna
+_random_yarrow= random_yarrow
+_random_other= random_other
.endif
.endif
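
With the three algorithm modules in place, a kernel built with
"options RANDOM_LOADABLE" (matching the -DRANDOM_LOADABLE in their
Makefiles below) selects its random(4) algorithm at run time, e.g.:

	# kldload random_fortuna
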
diff --git a/sys/modules/am335x_dmtpps/Makefile b/sys/modules/am335x_dmtpps/Makefile
new file mode 100644
index 0000000..4d9deac
--- /dev/null
+++ b/sys/modules/am335x_dmtpps/Makefile
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../arm/ti/am335x
+
+KMOD= am335x_dmtpps
+SRCS= am335x_dmtpps.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/ctl/Makefile b/sys/modules/ctl/Makefile
index e97ec38..c74f000 100644
--- a/sys/modules/ctl/Makefile
+++ b/sys/modules/ctl/Makefile
@@ -11,7 +11,7 @@ SRCS+= ctl_backend_ramdisk.c
SRCS+= ctl_cmd_table.c
SRCS+= ctl_frontend.c
SRCS+= ctl_frontend_cam_sim.c
-SRCS+= ctl_frontend_internal.c
+SRCS+= ctl_frontend_ioctl.c
SRCS+= ctl_frontend_iscsi.c
SRCS+= ctl_scsi_all.c
SRCS+= ctl_tpc.c
diff --git a/sys/modules/gpio/gpiobus/Makefile b/sys/modules/gpio/gpiobus/Makefile
index e868cba..2a3f86d 100644
--- a/sys/modules/gpio/gpiobus/Makefile
+++ b/sys/modules/gpio/gpiobus/Makefile
@@ -32,8 +32,9 @@
.PATH: ${.CURDIR}/../../../dev/gpio/
KMOD= gpiobus
-SRCS= gpiobus.c
-SRCS+= device_if.h bus_if.h gpio_if.h gpiobus_if.h opt_platform.h
+SRCS= gpiobus.c gpioc.c
+SRCS+= gpio_if.c gpio_if.h gpiobus_if.c gpiobus_if.h
+SRCS+= device_if.h bus_if.h opt_platform.h
CFLAGS+= -I. -I${.CURDIR}/../../../dev/gpio/
diff --git a/sys/modules/random_fortuna/Makefile b/sys/modules/random_fortuna/Makefile
new file mode 100644
index 0000000..d28ae4d
--- /dev/null
+++ b/sys/modules/random_fortuna/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/random
+
+KMOD = random_fortuna
+SRCS = randomdev.c hash.c fortuna.c
+SRCS += opt_param.h bus_if.h device_if.h
+SRCS += opt_ddb.h
+CFLAGS += -DRANDOM_LOADABLE
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/random_other/Makefile b/sys/modules/random_other/Makefile
new file mode 100644
index 0000000..6ce586b
--- /dev/null
+++ b/sys/modules/random_other/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/random
+
+KMOD = random_other
+SRCS = randomdev.c hash.c other_algorithm.c
+SRCS += opt_param.h bus_if.h device_if.h
+SRCS += opt_ddb.h
+CFLAGS += -DRANDOM_LOADABLE
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/random_yarrow/Makefile b/sys/modules/random_yarrow/Makefile
new file mode 100644
index 0000000..1750af4
--- /dev/null
+++ b/sys/modules/random_yarrow/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/random
+
+KMOD = random_yarrow
+SRCS = randomdev.c hash.c yarrow.c
+SRCS += opt_param.h bus_if.h device_if.h
+SRCS += opt_ddb.h
+CFLAGS += -DRANDOM_LOADABLE
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/ieee8023ad_lacp.c b/sys/net/ieee8023ad_lacp.c
index 64aafb1..1af4ffc 100644
--- a/sys/net/ieee8023ad_lacp.c
+++ b/sys/net/ieee8023ad_lacp.c
@@ -522,7 +522,7 @@ lacp_port_create(struct lagg_port *lgp)
int error;
boolean_t active = TRUE; /* XXX should be configurable */
- boolean_t fast = FALSE; /* XXX should be configurable */
+ boolean_t fast = FALSE; /* Configurable via ioctl */
link_init_sdl(ifp, (struct sockaddr *)&sdl, IFT_ETHER);
sdl.sdl_alen = ETHER_ADDR_LEN;
diff --git a/sys/net/ieee8023ad_lacp.h b/sys/net/ieee8023ad_lacp.h
index e814f83..8f0f51a 100644
--- a/sys/net/ieee8023ad_lacp.h
+++ b/sys/net/ieee8023ad_lacp.h
@@ -251,6 +251,7 @@ struct lacp_softc {
u_int32_t lsc_tx_test;
} lsc_debug;
u_int32_t lsc_strict_mode;
+ boolean_t lsc_fast_timeout; /* if set, fast timeout */
};
#define LACP_TYPE_ACTORINFO 1
diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c
index dcd005a..b623493 100644
--- a/sys/net/if_lagg.c
+++ b/sys/net/if_lagg.c
@@ -1257,6 +1257,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
if (lsc->lsc_strict_mode != 0)
ro->ro_opts |= LAGG_OPT_LACP_STRICT;
+ if (lsc->lsc_fast_timeout != 0)
+ ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT;
ro->ro_active = sc->sc_active;
} else {
@@ -1292,6 +1294,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
case -LAGG_OPT_LACP_RXTEST:
case LAGG_OPT_LACP_STRICT:
case -LAGG_OPT_LACP_STRICT:
+ case LAGG_OPT_LACP_TIMEOUT:
+ case -LAGG_OPT_LACP_TIMEOUT:
valid = lacp = 1;
break;
default:
@@ -1320,6 +1324,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
sc->sc_opts &= ~ro->ro_opts;
} else {
struct lacp_softc *lsc;
+ struct lacp_port *lp;
lsc = (struct lacp_softc *)sc->sc_psc;
@@ -1342,6 +1347,20 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
case -LAGG_OPT_LACP_STRICT:
lsc->lsc_strict_mode = 0;
break;
+ case LAGG_OPT_LACP_TIMEOUT:
+ LACP_LOCK(lsc);
+ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
+ lp->lp_state |= LACP_STATE_TIMEOUT;
+ LACP_UNLOCK(lsc);
+ lsc->lsc_fast_timeout = 1;
+ break;
+ case -LAGG_OPT_LACP_TIMEOUT:
+ LACP_LOCK(lsc);
+ LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
+ lp->lp_state &= ~LACP_STATE_TIMEOUT;
+ LACP_UNLOCK(lsc);
+ lsc->lsc_fast_timeout = 0;
+ break;
}
}
LAGG_WUNLOCK(sc);
diff --git a/sys/net/if_lagg.h b/sys/net/if_lagg.h
index a45fa16..bb5ea23 100644
--- a/sys/net/if_lagg.h
+++ b/sys/net/if_lagg.h
@@ -150,6 +150,7 @@ struct lagg_reqopts {
#define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */
#define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */
#define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */
+#define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */
u_int ro_count; /* number of ports */
u_int ro_active; /* active port count */
u_int ro_flapping; /* number of flapping */
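
LAGG_OPT_LACP_TIMEOUT drives the new lsc_fast_timeout handling in
if_lagg.c above: setting it turns on LACP_STATE_TIMEOUT on every port
(fast, 1-second LACPDU interval), and passing the negated constant
clears it again (slow, 30-second interval). A hedged sketch of flipping
it from userland with the existing lagg options ioctl; the socket "s"
is assumed and the field names are taken from if_lagg.h:

	struct lagg_reqopts ro;

	memset(&ro, 0, sizeof(ro));
	strlcpy(ro.ro_ifname, "lagg0", sizeof(ro.ro_ifname));
	ro.ro_opts = LAGG_OPT_LACP_TIMEOUT;	/* or -LAGG_OPT_LACP_TIMEOUT */
	if (ioctl(s, SIOCSLAGGOPTS, &ro) == -1)
		err(1, "SIOCSLAGGOPTS");
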
diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c
index f90925a..263c197 100644
--- a/sys/netinet/if_ether.c
+++ b/sys/netinet/if_ether.c
@@ -130,6 +130,13 @@ static void arptimer(void *);
static void in_arpinput(struct mbuf *);
#endif
+static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
+ struct ifnet *ifp, int bridged, struct llentry *la);
+static void arp_update_lle(struct arphdr *ah, struct ifnet *ifp,
+ struct llentry *la);
+static void arp_mark_lle_reachable(struct llentry *la);
+
+
static const struct netisr_handler arp_nh = {
.nh_name = "arp",
.nh_handler = arpintr,
@@ -302,57 +309,37 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip,
}
/*
- * Resolve an IP address into an ethernet address.
- * On input:
- * ifp is the interface we use
- * is_gw != if @dst represents gateway to some destination
- * m is the mbuf. May be NULL if we don't have a packet.
- * dst is the next hop,
- * desten is where we want the address.
- * flags returns lle entry flags.
+ * Resolve an IP address into an ethernet address - heavy version.
+ * Used internally by arpresolve().
+ * We have already checked that we can't use the existing lle without
+ * modification, so we have to acquire the LLE_EXCLUSIVE lle lock.
*
* On success, desten and flags are filled in and the function returns 0;
* If the packet must be held pending resolution, we return EWOULDBLOCK
* On other errors, we return the corresponding error code.
* Note that m_freem() handles NULL.
*/
-int
-arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
+static int
+arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m,
const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
{
- struct llentry *la = 0;
- u_int flags = 0;
+ struct llentry *la = NULL;
struct mbuf *curr = NULL;
struct mbuf *next = NULL;
- int create, error, renew;
+ int error, renew;
if (pflags != NULL)
*pflags = 0;
- create = 0;
- if (m != NULL) {
- if (m->m_flags & M_BCAST) {
- /* broadcast */
- (void)memcpy(desten,
- ifp->if_broadcastaddr, ifp->if_addrlen);
- return (0);
- }
- if (m->m_flags & M_MCAST) {
- /* multicast */
- ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
- return (0);
- }
+ if (create == 0) {
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
+ IF_AFDATA_RUNLOCK(ifp);
}
-retry:
- IF_AFDATA_RLOCK(ifp);
- la = lla_lookup(LLTABLE(ifp), flags, dst);
- IF_AFDATA_RUNLOCK(ifp);
- if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
- && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
+ if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
create = 1;
- flags |= LLE_EXCLUSIVE;
IF_AFDATA_WLOCK(ifp);
- la = lla_create(LLTABLE(ifp), flags, dst);
+ la = lla_create(LLTABLE(ifp), 0, dst);
IF_AFDATA_WUNLOCK(ifp);
}
if (la == NULL) {
@@ -382,10 +369,7 @@ retry:
if (pflags != NULL)
*pflags = la->la_flags;
- if (flags & LLE_EXCLUSIVE)
- LLE_WUNLOCK(la);
- else
- LLE_RUNLOCK(la);
+ LLE_WUNLOCK(la);
if (renew == 1)
arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
@@ -393,20 +377,7 @@ retry:
return (0);
}
- if (la->la_flags & LLE_STATIC) { /* should not happen! */
- log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
- inet_ntoa(SIN(dst)->sin_addr));
- m_freem(m);
- error = EINVAL;
- goto done;
- }
-
renew = (la->la_asked == 0 || la->la_expire != time_uptime);
- if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
- flags |= LLE_EXCLUSIVE;
- LLE_RUNLOCK(la);
- goto retry;
- }
/*
* There is an arptab entry, but no ethernet address
* response yet. Add the mbuf to the list, dropping
@@ -431,11 +402,6 @@ retry:
} else
la->la_hold = m;
la->la_numheld++;
- if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
- flags &= ~LLE_EXCLUSIVE;
- LLE_DOWNGRADE(la);
- }
-
}
/*
* Return EWOULDBLOCK if we have tried less than arp_maxtries. It
@@ -462,15 +428,88 @@ retry:
arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
return (error);
}
-done:
- if (flags & LLE_EXCLUSIVE)
- LLE_WUNLOCK(la);
- else
- LLE_RUNLOCK(la);
+
+ LLE_WUNLOCK(la);
return (error);
}
/*
+ * Resolve an IP address into an ethernet address.
+ * On input:
+ * ifp is the interface we use
+ * is_gw != 0 if @dst represents a gateway to some destination
+ * m is the mbuf. May be NULL if we don't have a packet.
+ * dst is the next hop,
+ * desten is the storage for the LL address.
+ * flags returns lle entry flags.
+ *
+ * On success, desten and flags are filled in and the function returns 0;
+ * If the packet must be held pending resolution, we return EWOULDBLOCK
+ * On other errors, we return the corresponding error code.
+ * Note that m_freem() handles NULL.
+ */
+int
+arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
+ const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
+{
+ struct llentry *la = 0;
+ int renew;
+
+ if (pflags != NULL)
+ *pflags = 0;
+
+ if (m != NULL) {
+ if (m->m_flags & M_BCAST) {
+ /* broadcast */
+ (void)memcpy(desten,
+ ifp->if_broadcastaddr, ifp->if_addrlen);
+ return (0);
+ }
+ if (m->m_flags & M_MCAST) {
+ /* multicast */
+ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
+ return (0);
+ }
+ }
+
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), 0, dst);
+ IF_AFDATA_RUNLOCK(ifp);
+
+ if (la == NULL)
+ return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags));
+
+ if ((la->la_flags & LLE_VALID) &&
+ ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
+ bcopy(&la->ll_addr, desten, ifp->if_addrlen);
+ renew = 0;
+ /*
+ * If entry has an expiry time and it is approaching,
+ * see if we need to send an ARP request within this
+ * arpt_down interval.
+ */
+ if (!(la->la_flags & LLE_STATIC) &&
+ time_uptime + la->la_preempt > la->la_expire) {
+ renew = 1;
+ la->la_preempt--;
+ }
+
+ if (pflags != NULL)
+ *pflags = la->la_flags;
+
+ LLE_RUNLOCK(la);
+
+ if (renew == 1)
+ arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
+
+ return (0);
+ }
+ LLE_RUNLOCK(la);
+
+ return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags));
+}
+
+/*
* Common length and type checks are done here,
* then the protocol-specific routine is called.
*/
@@ -576,10 +615,10 @@ in_arpinput(struct mbuf *m)
struct sockaddr sa;
struct in_addr isaddr, itaddr, myaddr;
u_int8_t *enaddr = NULL;
- int op, flags;
+ int op;
int req_len;
int bridged = 0, is_bridge = 0;
- int carped, create;
+ int carped;
struct sockaddr_in sin;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
@@ -708,6 +747,16 @@ match:
"%s!\n", inet_ntoa(isaddr));
goto drop;
}
+
+ if (ifp->if_addrlen != ah->ar_hln) {
+ LLE_WUNLOCK(la);
+ ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
+ "i/f %d (ignored)\n", ifp->if_addrlen,
+ (u_char *) ar_sha(ah), ":", ah->ar_hln,
+ ifp->if_addrlen);
+ goto drop;
+ }
+
/*
* Warn if another host is using the same IP address, but only if the
* IP address isn't 0.0.0.0, which is used for DHCP only, in which
@@ -730,100 +779,22 @@ match:
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
sin.sin_addr = isaddr;
- create = (itaddr.s_addr == myaddr.s_addr) ? 1 : 0;
- flags = LLE_EXCLUSIVE;
- IF_AFDATA_LOCK(ifp);
- if (create != 0)
- la = lla_create(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
- else
- la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
- IF_AFDATA_UNLOCK(ifp);
- if (la != NULL) {
- /* the following is not an error when doing bridging */
- if (!bridged && la->lle_tbl->llt_ifp != ifp) {
- if (log_arp_wrong_iface)
- ARP_LOG(LOG_WARNING, "%s is on %s "
- "but got reply from %*D on %s\n",
- inet_ntoa(isaddr),
- la->lle_tbl->llt_ifp->if_xname,
- ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
- ifp->if_xname);
- LLE_WUNLOCK(la);
- goto reply;
- }
- if ((la->la_flags & LLE_VALID) &&
- bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
- if (la->la_flags & LLE_STATIC) {
- LLE_WUNLOCK(la);
- if (log_arp_permanent_modify)
- ARP_LOG(LOG_ERR,
- "%*D attempts to modify "
- "permanent entry for %s on %s\n",
- ifp->if_addrlen,
- (u_char *)ar_sha(ah), ":",
- inet_ntoa(isaddr), ifp->if_xname);
- goto reply;
- }
- if (log_arp_movements) {
- ARP_LOG(LOG_INFO, "%s moved from %*D "
- "to %*D on %s\n",
- inet_ntoa(isaddr),
- ifp->if_addrlen,
- (u_char *)&la->ll_addr, ":",
- ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
- ifp->if_xname);
- }
- }
-
- if (ifp->if_addrlen != ah->ar_hln) {
- LLE_WUNLOCK(la);
- ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
- "i/f %d (ignored)\n", ifp->if_addrlen,
- (u_char *) ar_sha(ah), ":", ah->ar_hln,
- ifp->if_addrlen);
- goto drop;
- }
- (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
- la->la_flags |= LLE_VALID;
-
- EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
-
- if (!(la->la_flags & LLE_STATIC)) {
- int canceled;
-
- LLE_ADDREF(la);
- la->la_expire = time_uptime + V_arpt_keep;
- canceled = callout_reset(&la->lle_timer,
- hz * V_arpt_keep, arptimer, la);
- if (canceled)
- LLE_REMREF(la);
- }
- la->la_asked = 0;
- la->la_preempt = V_arp_maxtries;
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, (struct sockaddr *)&sin);
+ IF_AFDATA_RUNLOCK(ifp);
+ if (la != NULL)
+ arp_check_update_lle(ah, isaddr, ifp, bridged, la);
+ else if (itaddr.s_addr == myaddr.s_addr) {
/*
- * The packets are all freed within the call to the output
- * routine.
- *
- * NB: The lock MUST be released before the call to the
- * output routine.
+ * Reply to our address, but no lle exists yet.
+ * Do we really have to create an entry?
*/
- if (la->la_hold != NULL) {
- struct mbuf *m_hold, *m_hold_next;
-
- m_hold = la->la_hold;
- la->la_hold = NULL;
- la->la_numheld = 0;
- lltable_fill_sa_entry(la, (struct sockaddr *)&sa);
- LLE_WUNLOCK(la);
- for (; m_hold != NULL; m_hold = m_hold_next) {
- m_hold_next = m_hold->m_nextpkt;
- m_hold->m_nextpkt = NULL;
- /* Avoid confusing lower layers. */
- m_clrprotoflags(m_hold);
- (*ifp->if_output)(ifp, m_hold, &sa, NULL);
- }
- } else
- LLE_WUNLOCK(la);
+ IF_AFDATA_WLOCK(ifp);
+ la = lla_create(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
+ arp_update_lle(ah, ifp, la);
+ IF_AFDATA_WUNLOCK(ifp);
+ arp_mark_lle_reachable(la);
+ LLE_WUNLOCK(la);
}
reply:
if (op != ARPOP_REQUEST)
@@ -934,6 +905,140 @@ drop:
}
#endif
+/*
+ * Checks received arp data against existing @la.
+ * Updates lle state/performs notification if necessary.
+ */
+static void
+arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
+ int bridged, struct llentry *la)
+{
+ struct sockaddr sa;
+ struct mbuf *m_hold, *m_hold_next;
+
+ LLE_WLOCK_ASSERT(la);
+
+ /* the following is not an error when doing bridging */
+ if (!bridged && la->lle_tbl->llt_ifp != ifp) {
+ if (log_arp_wrong_iface)
+ ARP_LOG(LOG_WARNING, "%s is on %s "
+ "but got reply from %*D on %s\n",
+ inet_ntoa(isaddr),
+ la->lle_tbl->llt_ifp->if_xname,
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ LLE_WUNLOCK(la);
+ return;
+ }
+ if ((la->la_flags & LLE_VALID) &&
+ bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
+ if (la->la_flags & LLE_STATIC) {
+ LLE_WUNLOCK(la);
+ if (log_arp_permanent_modify)
+ ARP_LOG(LOG_ERR,
+ "%*D attempts to modify "
+ "permanent entry for %s on %s\n",
+ ifp->if_addrlen,
+ (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_xname);
+ return;
+ }
+ if (log_arp_movements) {
+ ARP_LOG(LOG_INFO, "%s moved from %*D "
+ "to %*D on %s\n",
+ inet_ntoa(isaddr),
+ ifp->if_addrlen,
+ (u_char *)&la->ll_addr, ":",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ }
+ }
+
+ /* Check if something has changed */
+ if (memcmp(&la->ll_addr, ar_sha(ah), ifp->if_addrlen) != 0 ||
+ (la->la_flags & LLE_VALID) == 0) {
+ /* Perform real LLE update */
+ /* use afdata WLOCK to update fields */
+ LLE_ADDREF(la);
+ LLE_WUNLOCK(la);
+ IF_AFDATA_WLOCK(ifp);
+ LLE_WLOCK(la);
+
+ /*
+ * Since we dropped the LLE lock, another thread might have
+ * deleted this lle. Check and return.
+ */
+ if ((la->la_flags & LLE_DELETED) != 0) {
+ IF_AFDATA_WUNLOCK(ifp);
+ LLE_FREE_LOCKED(la);
+ return;
+ }
+
+ /* Update data */
+ arp_update_lle(ah, ifp, la);
+
+ IF_AFDATA_WUNLOCK(ifp);
+ LLE_REMREF(la);
+ }
+
+ arp_mark_lle_reachable(la);
+
+ /*
+ * The packets are all freed within the call to the output
+ * routine.
+ *
+ * NB: The lock MUST be released before the call to the
+ * output routine.
+ */
+ if (la->la_hold != NULL) {
+ m_hold = la->la_hold;
+ la->la_hold = NULL;
+ la->la_numheld = 0;
+ lltable_fill_sa_entry(la, &sa);
+ LLE_WUNLOCK(la);
+ for (; m_hold != NULL; m_hold = m_hold_next) {
+ m_hold_next = m_hold->m_nextpkt;
+ m_hold->m_nextpkt = NULL;
+ /* Avoid confusing lower layers. */
+ m_clrprotoflags(m_hold);
+ (*ifp->if_output)(ifp, m_hold, &sa, NULL);
+ }
+ } else
+ LLE_WUNLOCK(la);
+}
+
+/*
+ * Updates @la fields used by fast path code.
+ */
+static void
+arp_update_lle(struct arphdr *ah, struct ifnet *ifp, struct llentry *la)
+{
+
+ memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
+ la->la_flags |= LLE_VALID;
+}
+
+static void
+arp_mark_lle_reachable(struct llentry *la)
+{
+ int canceled;
+
+ LLE_WLOCK_ASSERT(la);
+
+ EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
+
+ if (!(la->la_flags & LLE_STATIC)) {
+ LLE_ADDREF(la);
+ la->la_expire = time_uptime + V_arpt_keep;
+ canceled = callout_reset(&la->lle_timer,
+ hz * V_arpt_keep, arptimer, la);
+ if (canceled)
+ LLE_REMREF(la);
+ }
+ la->la_asked = 0;
+ la->la_preempt = V_arp_maxtries;
+}
+
void
arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
{
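
The net effect of the split: the common case now completes under a read
lock only, and the write-locked path lives entirely in
arpresolve_full(). Condensed (a paraphrase; expired() stands in for the
LLE_STATIC/la_expire test):

	la = lla_lookup(LLTABLE(ifp), 0, dst);		/* read lock */
	if (la == NULL)		/* miss: create under the write lock */
		return (arpresolve_full(ifp, is_gw, 1, m, dst, desten,
		    pflags));
	if ((la->la_flags & LLE_VALID) && !expired(la)) {
		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
		LLE_RUNLOCK(la);
		return (0);				/* fast path */
	}
	LLE_RUNLOCK(la);
	return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags));
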
diff --git a/sys/netinet/sctp_timer.c b/sys/netinet/sctp_timer.c
index 6c8589e..3e72585 100644
--- a/sys/netinet/sctp_timer.c
+++ b/sys/netinet/sctp_timer.c
@@ -1492,6 +1492,8 @@ sctp_pathmtu_timer(struct sctp_inpcb *inp,
#endif
if (mtu > next_mtu) {
net->mtu = next_mtu;
+ } else {
+ net->mtu = mtu;
}
}
}
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
index f1d26cc..7ee525a 100644
--- a/sys/ofed/drivers/infiniband/core/cma.c
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -72,6 +72,11 @@ static int def_prec2sl = 3;
module_param_named(def_prec2sl, def_prec2sl, int, 0644);
MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
+static int unify_tcp_port_space = 1;
+module_param(unify_tcp_port_space, int, 0644);
+MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
+ "space allocation (default=1)");
+
static int debug_level = 0;
#define cma_pr(level, priv, format, arg...) \
printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg)
@@ -957,6 +962,8 @@ static void cma_release_port(struct rdma_id_private *id_priv)
kfree(bind_list);
}
mutex_unlock(&lock);
+ if (id_priv->sock)
+ sock_release(id_priv->sock);
}
static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
@@ -2449,6 +2456,42 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+ int ret;
+ int size;
+ struct socket *sock;
+
+ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret)
+ return ret;
+#ifdef __linux__
+ ret = sock->ops->bind(sock,
+ (struct sockaddr *) &id_priv->id.route.addr.src_addr,
+ ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+#else
+ ret = -sobind(sock,
+ (struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ curthread);
+#endif
+ if (ret) {
+ sock_release(sock);
+ return ret;
+ }
+
+ size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
+ ret = sock_getname(sock,
+ (struct sockaddr *) &id_priv->id.route.addr.src_addr,
+ &size, 0);
+ if (ret) {
+ sock_release(sock);
+ return ret;
+ }
+
+ id_priv->sock = sock;
+ return 0;
+}
+
static int cma_get_port(struct rdma_id_private *id_priv)
{
struct idr *ps;
@@ -2460,6 +2503,11 @@ static int cma_get_port(struct rdma_id_private *id_priv)
break;
case RDMA_PS_TCP:
ps = &tcp_ps;
+ if (unify_tcp_port_space) {
+ ret = cma_get_tcp_port(id_priv);
+ if (ret)
+ goto out;
+ }
break;
case RDMA_PS_UDP:
ps = &udp_ps;
@@ -2480,7 +2528,7 @@ static int cma_get_port(struct rdma_id_private *id_priv)
else
ret = cma_use_port(ps, id_priv);
mutex_unlock(&lock);
-
+out:
return ret;
}
diff --git a/sys/powerpc/powerpc/trap.c b/sys/powerpc/powerpc/trap.c
index 57008e9..d2e5eaa 100644
--- a/sys/powerpc/powerpc/trap.c
+++ b/sys/powerpc/powerpc/trap.c
@@ -413,8 +413,8 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user)
case EXC_DTMISS:
printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar);
#ifdef AIM
- printf(" dsisr = 0x%" PRIxPTR "\n",
- frame->cpu.aim.dsisr);
+ printf(" dsisr = 0x%lx\n",
+ (u_long)frame->cpu.aim.dsisr);
#endif
break;
case EXC_ISE:
@@ -438,7 +438,7 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user)
frame->cpu.booke.esr);
#endif
printf(" srr0 = 0x%" PRIxPTR "\n", frame->srr0);
- printf(" srr1 = 0x%" PRIxPTR "\n", frame->srr1);
+ printf(" srr1 = 0x%lx\n", (u_long)frame->srr1);
printf(" lr = 0x%" PRIxPTR "\n", frame->lr);
printf(" curthread = %p\n", curthread);
if (curthread != NULL)
diff --git a/sys/sys/ata.h b/sys/sys/ata.h
index 863f0e8..272b46a 100644
--- a/sys/sys/ata.h
+++ b/sys/sys/ata.h
@@ -399,6 +399,7 @@ struct ata_params {
#define ATA_IDLE_CMD 0xe3 /* idle */
#define ATA_READ_BUFFER 0xe4 /* read buffer */
#define ATA_READ_PM 0xe4 /* read portmultiplier */
+#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */
#define ATA_SLEEP 0xe6 /* sleep */
#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */
#define ATA_WRITE_PM 0xe8 /* write portmultiplier */
diff --git a/sys/sys/nv.h b/sys/sys/nv.h
index fa5d138..a985b6d 100644
--- a/sys/sys/nv.h
+++ b/sys/sys/nv.h
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2009-2013 The FreeBSD Foundation
+ * Copyright (c) 2013-2015 Mariusz Zaborski <oshogbo@FreeBSD.org>
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
@@ -59,6 +60,11 @@ typedef struct nvlist nvlist_t;
#define NV_TYPE_NVLIST 5
#define NV_TYPE_DESCRIPTOR 6
#define NV_TYPE_BINARY 7
+#define NV_TYPE_BOOL_ARRAY 8
+#define NV_TYPE_NUMBER_ARRAY 9
+#define NV_TYPE_STRING_ARRAY 10
+#define NV_TYPE_NVLIST_ARRAY 11
+#define NV_TYPE_DESCRIPTOR_ARRAY 12
/*
* Perform case-insensitive lookups of provided names.
@@ -101,6 +107,11 @@ const char *nvlist_next(const nvlist_t *nvl, int *typep, void **cookiep);
const nvlist_t *nvlist_get_parent(const nvlist_t *nvl, void **cookiep);
+const nvlist_t *nvlist_get_array_next(const nvlist_t *nvl);
+bool nvlist_in_array(const nvlist_t *nvl);
+
+const nvlist_t *nvlist_get_pararr(const nvlist_t *nvl, void **cookiep);
+
/*
* The nvlist_exists functions check if the given name (optionally of the given
* type) exists on nvlist.
@@ -114,10 +125,15 @@ bool nvlist_exists_bool(const nvlist_t *nvl, const char *name);
bool nvlist_exists_number(const nvlist_t *nvl, const char *name);
bool nvlist_exists_string(const nvlist_t *nvl, const char *name);
bool nvlist_exists_nvlist(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_binary(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_bool_array(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_number_array(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_string_array(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_nvlist_array(const nvlist_t *nvl, const char *name);
#ifndef _KERNEL
bool nvlist_exists_descriptor(const nvlist_t *nvl, const char *name);
+bool nvlist_exists_descriptor_array(const nvlist_t *nvl, const char *name);
#endif
-bool nvlist_exists_binary(const nvlist_t *nvl, const char *name);
/*
* The nvlist_add functions add the given name/value pair.
@@ -134,10 +150,15 @@ void nvlist_add_stringf(nvlist_t *nvl, const char *name, const char *valuefmt, .
void nvlist_add_stringv(nvlist_t *nvl, const char *name, const char *valuefmt, va_list valueap) __printflike(3, 0);
#endif
void nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *value);
+void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_t size);
+void nvlist_add_bool_array(nvlist_t *nvl, const char *name, const bool *value, size_t nitems);
+void nvlist_add_number_array(nvlist_t *nvl, const char *name, const uint64_t *value, size_t nitems);
+void nvlist_add_string_array(nvlist_t *nvl, const char *name, const char * const *value, size_t nitems);
+void nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, const nvlist_t * const *value, size_t nitems);
#ifndef _KERNEL
void nvlist_add_descriptor(nvlist_t *nvl, const char *name, int value);
+void nvlist_add_descriptor_array(nvlist_t *nvl, const char *name, const int *value, size_t nitems);
#endif
-void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_t size);
/*
* The nvlist_move functions add the given name/value pair.
@@ -146,10 +167,15 @@ void nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value, size_
void nvlist_move_string(nvlist_t *nvl, const char *name, char *value);
void nvlist_move_nvlist(nvlist_t *nvl, const char *name, nvlist_t *value);
+void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size);
+void nvlist_move_bool_array(nvlist_t *nvl, const char *name, bool *value, size_t nitems);
+void nvlist_move_string_array(nvlist_t *nvl, const char *name, char **value, size_t nitems);
+void nvlist_move_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **value, size_t nitems);
+void nvlist_move_number_array(nvlist_t *nvl, const char *name, uint64_t *value, size_t nitems);
#ifndef _KERNEL
void nvlist_move_descriptor(nvlist_t *nvl, const char *name, int value);
+void nvlist_move_descriptor_array(nvlist_t *nvl, const char *name, int *value, size_t nitems);
#endif
-void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size);
/*
* The nvlist_get functions returns value associated with the given name.
@@ -157,14 +183,19 @@ void nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t siz
* not be freed by the caller.
*/
-bool nvlist_get_bool(const nvlist_t *nvl, const char *name);
-uint64_t nvlist_get_number(const nvlist_t *nvl, const char *name);
-const char *nvlist_get_string(const nvlist_t *nvl, const char *name);
-const nvlist_t *nvlist_get_nvlist(const nvlist_t *nvl, const char *name);
+bool nvlist_get_bool(const nvlist_t *nvl, const char *name);
+uint64_t nvlist_get_number(const nvlist_t *nvl, const char *name);
+const char *nvlist_get_string(const nvlist_t *nvl, const char *name);
+const nvlist_t *nvlist_get_nvlist(const nvlist_t *nvl, const char *name);
+const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep);
+const bool *nvlist_get_bool_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
+const uint64_t *nvlist_get_number_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
+const char * const *nvlist_get_string_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
+const nvlist_t * const *nvlist_get_nvlist_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
#ifndef _KERNEL
-int nvlist_get_descriptor(const nvlist_t *nvl, const char *name);
+int nvlist_get_descriptor(const nvlist_t *nvl, const char *name);
+const int *nvlist_get_descriptor_array(const nvlist_t *nvl, const char *name, size_t *nitemsp);
#endif
-const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep);
/*
* The nvlist_take functions returns value associated with the given name and
@@ -172,14 +203,19 @@ const void *nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *siz
* The caller is responsible for freeing received data.
*/
-bool nvlist_take_bool(nvlist_t *nvl, const char *name);
-uint64_t nvlist_take_number(nvlist_t *nvl, const char *name);
-char *nvlist_take_string(nvlist_t *nvl, const char *name);
-nvlist_t *nvlist_take_nvlist(nvlist_t *nvl, const char *name);
+bool nvlist_take_bool(nvlist_t *nvl, const char *name);
+uint64_t nvlist_take_number(nvlist_t *nvl, const char *name);
+char *nvlist_take_string(nvlist_t *nvl, const char *name);
+nvlist_t *nvlist_take_nvlist(nvlist_t *nvl, const char *name);
+void *nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep);
+bool *nvlist_take_bool_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
+uint64_t *nvlist_take_number_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
+char **nvlist_take_string_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
+nvlist_t **nvlist_take_nvlist_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
#ifndef _KERNEL
int nvlist_take_descriptor(nvlist_t *nvl, const char *name);
+int *nvlist_take_descriptor_array(nvlist_t *nvl, const char *name, size_t *nitemsp);
#endif
-void *nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep);
/*
* The nvlist_free functions removes the given name/value pair from the nvlist
@@ -194,10 +230,16 @@ void nvlist_free_bool(nvlist_t *nvl, const char *name);
void nvlist_free_number(nvlist_t *nvl, const char *name);
void nvlist_free_string(nvlist_t *nvl, const char *name);
void nvlist_free_nvlist(nvlist_t *nvl, const char *name);
+void nvlist_free_binary(nvlist_t *nvl, const char *name);
+void nvlist_free_bool_array(nvlist_t *nvl, const char *name);
+void nvlist_free_number_array(nvlist_t *nvl, const char *name);
+void nvlist_free_string_array(nvlist_t *nvl, const char *name);
+void nvlist_free_nvlist_array(nvlist_t *nvl, const char *name);
+void nvlist_free_binary_array(nvlist_t *nvl, const char *name);
#ifndef _KERNEL
void nvlist_free_descriptor(nvlist_t *nvl, const char *name);
+void nvlist_free_descriptor_array(nvlist_t *nvl, const char *name);
#endif
-void nvlist_free_binary(nvlist_t *nvl, const char *name);
__END_DECLS
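
A minimal usage sketch for the new typed-array variants (userland
libnv; error handling omitted):

	#include <sys/param.h>
	#include <sys/nv.h>

	nvlist_t *nvl = nvlist_create(0);
	uint64_t vals[] = { 1, 2, 3 };

	nvlist_add_number_array(nvl, "vals", vals, nitems(vals));

	size_t n;
	const uint64_t *p = nvlist_get_number_array(nvl, "vals", &n);
	/* p[0..n-1] stay owned by the nvlist; nvlist_take_number_array()
	 * detaches an allocated copy that the caller must free. */
	nvlist_destroy(nvl);
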
diff --git a/sys/sys/random.h b/sys/sys/random.h
index 78a9955..92eb80f 100644
--- a/sys/sys/random.h
+++ b/sys/sys/random.h
@@ -33,10 +33,29 @@
#include <sys/types.h>
+#include "opt_random.h"
+
+#if defined(RANDOM_LOADABLE) && defined(RANDOM_YARROW)
+#error "Cannot define both RANDOM_LOADABLE and RANDOM_YARROW"
+#endif
+
struct uio;
+#if defined(DEV_RANDOM)
u_int read_random(void *, u_int);
int read_random_uio(struct uio *, bool);
+#else
+static __inline int
+read_random_uio(void *a __unused, u_int b __unused)
+{
+ return (0);
+}
+static __inline u_int
+read_random(void *a __unused, u_int b __unused)
+{
+ return (0);
+}
+#endif
/*
* Note: if you add or remove members of random_entropy_source, remember to also update the
@@ -76,15 +95,15 @@ enum random_entropy_source {
#define RANDOM_HARVEST_EVERYTHING_MASK ((1 << (RANDOM_ENVIRONMENTAL_END + 1)) - 1)
-#if defined(RANDOM_DUMMY)
-#define random_harvest_queue(a, b, c, d) do {} while (0)
-#define random_harvest_fast(a, b, c, d) do {} while (0)
-#define random_harvest_direct(a, b, c, d) do {} while (0)
-#else /* !defined(RANDOM_DUMMY) */
+#if defined(DEV_RANDOM)
void random_harvest_queue(const void *, u_int, u_int, enum random_entropy_source);
void random_harvest_fast(const void *, u_int, u_int, enum random_entropy_source);
void random_harvest_direct(const void *, u_int, u_int, enum random_entropy_source);
-#endif /* defined(RANDOM_DUMMY) */
+#else
+#define random_harvest_queue(a, b, c, d) do {} while (0)
+#define random_harvest_fast(a, b, c, d) do {} while (0)
+#define random_harvest_direct(a, b, c, d) do {} while (0)
+#endif
#endif /* _KERNEL */
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index 112bb2c..26cf9a6 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -78,7 +78,7 @@ struct socket {
short so_state; /* (b) internal state flags SS_* */
int so_qstate; /* (e) internal state flags SQ_* */
void *so_pcb; /* protocol control block */
- struct vnet *so_vnet; /* network stack instance */
+ struct vnet *so_vnet; /* (a) network stack instance */
struct protosw *so_proto; /* (a) protocol handle */
/*
* Variables for connection queuing.
diff --git a/sys/sys/timeet.h b/sys/sys/timeet.h
index 728578b..3d50e51 100644
--- a/sys/sys/timeet.h
+++ b/sys/sys/timeet.h
@@ -53,7 +53,7 @@ typedef int et_deregister_cb_t(struct eventtimer *et, void *arg);
struct eventtimer {
SLIST_ENTRY(eventtimer) et_all;
/* Pointer to the next event timer. */
- char *et_name;
+ const char *et_name;
/* Name of the event timer. */
int et_flags;
/* Set of capabilities flags: */
diff --git a/sys/sys/timetc.h b/sys/sys/timetc.h
index e68e327..8f00e22 100644
--- a/sys/sys/timetc.h
+++ b/sys/sys/timetc.h
@@ -49,7 +49,7 @@ struct timecounter {
/* This mask should mask off any unimplemented bits. */
uint64_t tc_frequency;
/* Frequency of the counter in Hz. */
- char *tc_name;
+ const char *tc_name;
/* Name of the timecounter. */
int tc_quality;
/*
diff --git a/sys/teken/demo/teken_demo.c b/sys/teken/demo/teken_demo.c
index 08323dc..42747ce 100644
--- a/sys/teken/demo/teken_demo.c
+++ b/sys/teken/demo/teken_demo.c
@@ -72,7 +72,7 @@ struct pixel {
#define NCOLS 80
#define NROWS 24
-struct pixel buffer[NCOLS][NROWS];
+static struct pixel buffer[NCOLS][NROWS];
static int ptfd;
diff --git a/sys/teken/teken.c b/sys/teken/teken.c
index 3002a88..8834390 100644
--- a/sys/teken/teken.c
+++ b/sys/teken/teken.c
@@ -29,12 +29,14 @@
#include <sys/cdefs.h>
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/param.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/systm.h>
#define teken_assert(x) MPASS(x)
#else /* !(__FreeBSD__ && _KERNEL) */
#include <sys/types.h>
#include <assert.h>
+#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
@@ -405,18 +407,24 @@ teken_state_numbers(teken_t *t, teken_char_t c)
teken_assert(t->t_curnum < T_NUMSIZE);
if (c >= '0' && c <= '9') {
- /*
- * Don't do math with the default value of 1 when a
- * custom number is inserted.
- */
if (t->t_stateflags & TS_FIRSTDIGIT) {
+ /* First digit. */
t->t_stateflags &= ~TS_FIRSTDIGIT;
- t->t_nums[t->t_curnum] = 0;
- } else {
- t->t_nums[t->t_curnum] *= 10;
+ t->t_nums[t->t_curnum] = c - '0';
+ } else if (t->t_nums[t->t_curnum] < UINT_MAX / 100) {
+ /*
+ * There is no need to continue parsing input
+ * once the value exceeds the size of the
+ * terminal. It would only allow for integer
+ * overflows when performing arithmetic on the
+ * cursor position.
+ *
+ * Ignore any further digits if the value is
+ * already UINT_MAX / 100.
+ */
+ t->t_nums[t->t_curnum] =
+ t->t_nums[t->t_curnum] * 10 + c - '0';
}
-
- t->t_nums[t->t_curnum] += c - '0';
return (1);
} else if (c == ';') {
if (t->t_stateflags & TS_FIRSTDIGIT)
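
The clamp bounds every CSI parameter well below UINT_MAX: digits are
accepted only while the running value is below UINT_MAX / 100
(42949672), so the largest reachable value is 42949671 * 10 + 9 =
429496719, and the "value * 10 + digit" arithmetic can never overflow.
Trace for the input parameter "4294967296" (2^32):

	/* value: 4 -> 42 -> 429 -> ... -> 42949672; at that point
	 * value >= UINT_MAX / 100, so the remaining digits "96" are
	 * ignored and the parameter saturates at 42949672, far above
	 * any real terminal dimension but safely below UINT_MAX. */
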
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 91affa0..13916c0 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -566,11 +566,6 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_sunbusy(mt);
- if (vm_page_count_severe()) {
- vm_page_lock(mt);
- vm_page_try_to_cache(mt);
- vm_page_unlock(mt);
- }
}
}
if (prunlen != NULL)
diff --git a/sys/x86/iommu/intel_idpgtbl.c b/sys/x86/iommu/intel_idpgtbl.c
index 405976b..d52e8d4 100644
--- a/sys/x86/iommu/intel_idpgtbl.c
+++ b/sys/x86/iommu/intel_idpgtbl.c
@@ -374,8 +374,9 @@ retry:
KASSERT(lvl > 0,
("lost root page table page %p", domain));
/*
- * Page table page does not exists, allocate
- * it and create pte in the up level.
+ * Page table page does not exist, allocate
+ * it and create a pte in the preceding page level
+ * to reference the allocated page table page.
*/
m = dmar_pgalloc(domain->pgtbl_obj, idx, flags |
DMAR_PGF_ZERO);
diff --git a/sys/x86/x86/busdma_bounce.c b/sys/x86/x86/busdma_bounce.c
index dcdeafa..48c500f 100644
--- a/sys/x86/x86/busdma_bounce.c
+++ b/sys/x86/x86/busdma_bounce.c
@@ -79,7 +79,8 @@ struct bounce_page {
vm_offset_t vaddr; /* kva of bounce buffer */
bus_addr_t busaddr; /* Physical address */
vm_offset_t datavaddr; /* kva of client data */
- bus_addr_t dataaddr; /* client physical address */
+ vm_page_t datapage; /* physical page of client data */
+ vm_offset_t dataoffs; /* page offset of client data */
bus_size_t datacount; /* client data count */
STAILQ_ENTRY(bounce_page) links;
};
@@ -658,7 +659,7 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
{
bus_size_t sgsize, max_sgsize;
bus_addr_t curaddr;
- vm_offset_t vaddr;
+ vm_offset_t kvaddr, vaddr;
int error;
if (map == NULL)
@@ -681,10 +682,13 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
/*
* Get the physical address for this segment.
*/
- if (pmap == kernel_pmap)
+ if (pmap == kernel_pmap) {
curaddr = pmap_kextract(vaddr);
- else
+ kvaddr = vaddr;
+ } else {
curaddr = pmap_extract(pmap, vaddr);
+ kvaddr = 0;
+ }
/*
* Compute the segment size, and adjust counts.
@@ -696,7 +700,7 @@ bounce_bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
bus_dma_run_filter(&dmat->common, curaddr)) {
sgsize = roundup2(sgsize, dmat->common.alignment);
sgsize = MIN(sgsize, max_sgsize);
- curaddr = add_bounce_page(dmat, map, vaddr, curaddr,
+ curaddr = add_bounce_page(dmat, map, kvaddr, curaddr,
sgsize);
} else {
sgsize = MIN(sgsize, max_sgsize);
@@ -757,48 +761,56 @@ bounce_bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map,
bus_dmasync_op_t op)
{
struct bounce_page *bpage;
+ vm_offset_t datavaddr, tempvaddr;
- if ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) {
- /*
- * Handle data bouncing. We might also
- * want to add support for invalidating
- * the caches on broken hardware
- */
- CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
- "performing bounce", __func__, dmat,
- dmat->common.flags, op);
-
- if ((op & BUS_DMASYNC_PREWRITE) != 0) {
- while (bpage != NULL) {
- if (bpage->datavaddr != 0) {
- bcopy((void *)bpage->datavaddr,
- (void *)bpage->vaddr,
- bpage->datacount);
- } else {
- physcopyout(bpage->dataaddr,
- (void *)bpage->vaddr,
- bpage->datacount);
- }
- bpage = STAILQ_NEXT(bpage, links);
+ if ((bpage = STAILQ_FIRST(&map->bpages)) == NULL)
+ return;
+
+ /*
+ * Handle data bouncing. We might also want to add support for
+ * invalidating the caches on broken hardware.
+ */
+ CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
+ "performing bounce", __func__, dmat, dmat->common.flags, op);
+
+ if ((op & BUS_DMASYNC_PREWRITE) != 0) {
+ while (bpage != NULL) {
+ tempvaddr = 0;
+ datavaddr = bpage->datavaddr;
+ if (datavaddr == 0) {
+ tempvaddr =
+ pmap_quick_enter_page(bpage->datapage);
+ datavaddr = tempvaddr | bpage->dataoffs;
}
- dmat->bounce_zone->total_bounced++;
+
+ bcopy((void *)datavaddr,
+ (void *)bpage->vaddr, bpage->datacount);
+
+ if (tempvaddr != 0)
+ pmap_quick_remove_page(tempvaddr);
+ bpage = STAILQ_NEXT(bpage, links);
}
+ dmat->bounce_zone->total_bounced++;
+ }
- if ((op & BUS_DMASYNC_POSTREAD) != 0) {
- while (bpage != NULL) {
- if (bpage->datavaddr != 0) {
- bcopy((void *)bpage->vaddr,
- (void *)bpage->datavaddr,
- bpage->datacount);
- } else {
- physcopyin((void *)bpage->vaddr,
- bpage->dataaddr,
- bpage->datacount);
- }
- bpage = STAILQ_NEXT(bpage, links);
+ if ((op & BUS_DMASYNC_POSTREAD) != 0) {
+ while (bpage != NULL) {
+ tempvaddr = 0;
+ datavaddr = bpage->datavaddr;
+ if (datavaddr == 0) {
+ tempvaddr =
+ pmap_quick_enter_page(bpage->datapage);
+ datavaddr = tempvaddr | bpage->dataoffs;
}
- dmat->bounce_zone->total_bounced++;
+
+ bcopy((void *)bpage->vaddr,
+ (void *)datavaddr, bpage->datacount);
+
+ if (tempvaddr != 0)
+ pmap_quick_remove_page(tempvaddr);
+ bpage = STAILQ_NEXT(bpage, links);
}
+ dmat->bounce_zone->total_bounced++;
}
}
@@ -993,7 +1005,8 @@ add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
bpage->busaddr |= addr & PAGE_MASK;
}
bpage->datavaddr = vaddr;
- bpage->dataaddr = addr;
+ bpage->datapage = PHYS_TO_VM_PAGE(addr & ~PAGE_MASK);
+ bpage->dataoffs = addr & PAGE_MASK;
bpage->datacount = size;
STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
return (bpage->busaddr);
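
Recording the client page as a (vm_page_t, offset) pair instead of a
raw physical address lets the sync path above copy through a transient
per-CPU mapping rather than physcopyin()/physcopyout(), which works
even when the client buffer has no KVA (e.g. loaded by physical
address). Condensed from the PREWRITE leg:

	vm_offset_t kva = pmap_quick_enter_page(bpage->datapage);
	bcopy((void *)(kva | bpage->dataoffs), (void *)bpage->vaddr,
	    bpage->datacount);
	pmap_quick_remove_page(kva);
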
diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h
index d0a44ae..9e82124 100644
--- a/sys/xen/gnttab.h
+++ b/sys/xen/gnttab.h
@@ -126,10 +126,8 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, vm_paddr_t addr,
{
if (flags & GNTMAP_contains_pte)
map->host_addr = addr;
- else if (xen_feature(XENFEAT_auto_translated_physmap))
- map->host_addr = vtophys(addr);
else
- map->host_addr = addr;
+ map->host_addr = vtophys(addr);
map->flags = flags;
map->ref = ref;
@@ -142,10 +140,8 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, vm_paddr_t addr,
{
if (flags & GNTMAP_contains_pte)
unmap->host_addr = addr;
- else if (xen_feature(XENFEAT_auto_translated_physmap))
- unmap->host_addr = vtophys(addr);
else
- unmap->host_addr = addr;
+ unmap->host_addr = vtophys(addr);
unmap->handle = handle;
unmap->dev_bus_addr = 0;
@@ -155,13 +151,8 @@ static inline void
gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, vm_paddr_t addr,
vm_paddr_t new_addr, grant_handle_t handle)
{
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
- unmap->host_addr = vtophys(addr);
- unmap->new_addr = vtophys(new_addr);
- } else {
- unmap->host_addr = addr;
- unmap->new_addr = new_addr;
- }
+ unmap->host_addr = vtophys(addr);
+ unmap->new_addr = vtophys(new_addr);
unmap->handle = handle;
}