diff options
Diffstat (limited to 'sys')
48 files changed, 1425 insertions, 170 deletions
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 9e20f95..d72afd6 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -242,8 +242,11 @@ topo_probe_0x4(void) * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ - cpuid_count(0x04, 0, p); - max_cores = ((p[0] >> 26) & 0x3f) + 1; + if (cpu_high >= 0x4) { + cpuid_count(0x04, 0, p); + max_cores = ((p[0] >> 26) & 0x3f) + 1; + } else + max_cores = 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; diff --git a/sys/cddl/compat/opensolaris/sys/atomic.h b/sys/cddl/compat/opensolaris/sys/atomic.h index af9cc5d..f34d77e 100644 --- a/sys/cddl/compat/opensolaris/sys/atomic.h +++ b/sys/cddl/compat/opensolaris/sys/atomic.h @@ -40,8 +40,6 @@ extern void atomic_add_64(volatile uint64_t *target, int64_t delta); extern void atomic_dec_64(volatile uint64_t *target); #endif -#ifndef __LP64__ -#endif #ifndef __sparc64__ extern uint32_t atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval); diff --git a/sys/conf/files b/sys/conf/files index 59286a5..d654c6f 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2748,6 +2748,7 @@ netinet/ip_gre.c optional gre inet netinet/ip_id.c optional inet netinet/in_mcast.c optional inet netinet/in_pcb.c optional inet | inet6 +netinet/in_pcbgroup.c optional inet pcbgroup | inet6 pcbgroup netinet/in_proto.c optional inet | inet6 \ compile-with "${NORMAL_C} -I$S/contrib/pf" netinet/in_rmx.c optional inet @@ -2825,6 +2826,7 @@ netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6 netinet6/in6_ifattach.c optional inet6 netinet6/in6_mcast.c optional inet6 netinet6/in6_pcb.c optional inet6 +netinet6/in6_pcbgroup.c optional inet6 pcbgroup netinet6/in6_proto.c optional inet6 netinet6/in6_rmx.c optional inet6 netinet6/in6_src.c optional inet6 diff --git a/sys/conf/options b/sys/conf/options index a608d86..ee696a8 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -419,6 +419,7 @@ MROUTING opt_mrouting.h NCP NETATALK opt_atalk.h NFSLOCKD +PCBGROUP opt_pcbgroup.h RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h SLIP_IFF_OPTS opt_slip.h diff --git a/sys/dev/ath/if_ath.c b/sys/dev/ath/if_ath.c index 451bbeaf..6c7b0e7 100644 --- a/sys/dev/ath/if_ath.c +++ b/sys/dev/ath/if_ath.c @@ -3473,7 +3473,15 @@ ath_rx_proc(void *arg, int npending) if (rs->rs_status & HAL_RXERR_PHY) { sc->sc_stats.ast_rx_phyerr++; /* Process DFS radar events */ - ath_dfs_process_phy_err(sc, mtod(m, char *), tsf, rs); + if ((rs->rs_phyerr == HAL_PHYERR_RADAR) || + (rs->rs_phyerr == HAL_PHYERR_FALSE_RADAR_EXT)) { + /* Since we're touching the frame data, sync it */ + bus_dmamap_sync(sc->sc_dmat, + bf->bf_dmamap, + BUS_DMASYNC_POSTREAD); + /* Now pass it to the radar processing code */ + ath_dfs_process_phy_err(sc, mtod(m, char *), tsf, rs); + } /* Be suitably paranoid about receiving phy errors out of the stats array bounds */ if (rs->rs_phyerr < 64) diff --git a/sys/dev/cardbus/cardbus_cis.c b/sys/dev/cardbus/cardbus_cis.c index 2cfea19..3352a56 100644 --- a/sys/dev/cardbus/cardbus_cis.c +++ b/sys/dev/cardbus/cardbus_cis.c @@ -324,7 +324,7 @@ decode_tuple_bar(device_t cbdev, device_t child, int id, * hint when the cardbus bridge is a child of pci0 (the main * bus). The PC Card spec seems to indicate that this should * only be done on x86 based machines, which suggests that on - * non-x86 machines the adddresses can be anywhere. Since the + * non-x86 machines the addresses can be anywhere. Since the * hardware can do it on non-x86 machines, it should be able * to do it on x86 machines too. Therefore, we can and should * ignore this hint. Furthermore, the PC Card spec recommends @@ -430,7 +430,6 @@ cardbus_read_tuple_finish(device_t cbdev, device_t child, int rid, { if (res != CIS_CONFIG_SPACE) { bus_release_resource(child, SYS_RES_MEMORY, rid, res); - bus_delete_resource(child, SYS_RES_MEMORY, rid); } } @@ -467,7 +466,7 @@ cardbus_read_tuple_init(device_t cbdev, device_t child, uint32_t *start, } /* allocate the memory space to read CIS */ - res = bus_alloc_resource(child, SYS_RES_MEMORY, rid, 0, ~0, 1, + res = bus_alloc_resource_any(child, SYS_RES_MEMORY, rid, rman_make_alignment_flags(4096) | RF_ACTIVE); if (res == NULL) { device_printf(cbdev, "Unable to allocate resource " diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 58ff04e..0b33b67 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -396,6 +396,7 @@ struct sge_ctrlq { struct sge { uint16_t timer_val[SGE_NTIMERS]; uint8_t counter_val[SGE_NCOUNTERS]; + int fl_starve_threshold; int nrxq; /* total rx queues (all ports and the rest) */ int ntxq; /* total tx queues (all ports and the rest) */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 165a677..90e5001 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -492,6 +492,8 @@ t4_attach(device_t dev) V_RXTSHIFTMAXR2(15) | V_PERSHIFTBACKOFFMAX(8) | V_PERSHIFTMAX(8) | V_KEEPALIVEMAXR1(4) | V_KEEPALIVEMAXR2(9)); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12)); + t4_set_reg_field(sc, A_TP_PARA_REG3, F_TUNNELCNGDROP0 | + F_TUNNELCNGDROP1 | F_TUNNELCNGDROP2 | F_TUNNELCNGDROP3, 0); setup_memwin(sc); diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index f8efda5..b3e3567 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -203,6 +203,9 @@ t4_sge_init(struct adapter *sc) FL_BUF_SIZE(i)); } + i = t4_read_reg(sc, A_SGE_CONM_CTRL); + s->fl_starve_threshold = G_EGRTHRESHOLD(i) * 2 + 1; + t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, V_THRESHOLD_0(s->counter_val[0]) | V_THRESHOLD_1(s->counter_val[1]) | @@ -1233,7 +1236,8 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, sc->sge.eqmap[cntxt_id] = (void *)fl; FL_LOCK(fl); - refill_fl(sc, fl, -1, 8); + /* Just enough to make sure it doesn't starve right away. */ + refill_fl(sc, fl, roundup(sc->sge.fl_starve_threshold, 8), 8); FL_UNLOCK(fl); } @@ -1389,6 +1393,10 @@ alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx) if (rc != 0) return (rc); + FL_LOCK(&rxq->fl); + refill_fl(pi->adapter, &rxq->fl, rxq->fl.needed / 8, 8); + FL_UNLOCK(&rxq->fl); + #ifdef INET rc = tcp_lro_init(&rxq->lro); if (rc != 0) diff --git a/sys/dev/pccard/pccard.c b/sys/dev/pccard/pccard.c index 00cd1dc..1de571c 100644 --- a/sys/dev/pccard/pccard.c +++ b/sys/dev/pccard/pccard.c @@ -1405,8 +1405,8 @@ pccard_ccr_read_impl(device_t brdev, device_t child, uint32_t offset, struct pccard_ivar *devi = PCCARD_IVAR(child); *val = pccard_ccr_read(devi->pf, offset); - device_printf(child, "ccr_read of %#x (%#x) is %#x\n", offset, - devi->pf->pf_ccr_offset, *val); + DEVPRINTF((child, "ccr_read of %#x (%#x) is %#x\n", offset, + devi->pf->pf_ccr_offset, *val)); return 0; } @@ -1421,8 +1421,8 @@ pccard_ccr_write_impl(device_t brdev, device_t child, uint32_t offset, * Can't use pccard_ccr_write since client drivers may access * registers not contained in the 'mask' if they are non-standard. */ - device_printf(child, "ccr_write of %#x to %#x (%#x)\n", val, offset, - devi->pf->pf_ccr_offset); + DEVPRINTF((child, "ccr_write of %#x to %#x (%#x)\n", val, offset, + devi->pf->pf_ccr_offset)); bus_space_write_1(pf->pf_ccrt, pf->pf_ccrh, pf->pf_ccr_offset + offset, val); return 0; diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c index 22046c1..9cd5a1c 100644 --- a/sys/dev/pci/pci.c +++ b/sys/dev/pci/pci.c @@ -2576,6 +2576,17 @@ pci_add_map(device_t bus, device_t dev, int reg, struct resource_list *rl, uint16_t cmd; struct resource *res; + /* + * The BAR may already exist if the device is a CardBus card + * whose CIS is stored in this BAR. + */ + pm = pci_find_bar(dev, reg); + if (pm != NULL) { + maprange = pci_maprange(pm->pm_value); + barlen = maprange == 64 ? 2 : 1; + return (barlen); + } + pci_read_bar(dev, reg, &map, &testval); if (PCI_BAR_MEM(map)) { type = SYS_RES_MEMORY; diff --git a/sys/dev/puc/pucdata.c b/sys/dev/puc/pucdata.c index a56971e..2b38d9b 100644 --- a/sys/dev/puc/pucdata.c +++ b/sys/dev/puc/pucdata.c @@ -51,12 +51,12 @@ static puc_config_f puc_config_amc; static puc_config_f puc_config_diva; static puc_config_f puc_config_exar; static puc_config_f puc_config_icbook; +static puc_config_f puc_config_oxford_pcie; static puc_config_f puc_config_quatech; static puc_config_f puc_config_syba; static puc_config_f puc_config_siig; static puc_config_f puc_config_timedia; static puc_config_f puc_config_titan; -static puc_config_f puc_config_oxford_pcie; const struct puc_cfg puc_pci_devices[] = { @@ -1366,14 +1366,12 @@ puc_config_oxford_pcie(struct puc_softc *sc, enum puc_cfg_cmd cmd, int port, bar = puc_get_bar(sc, cfg->rid); if (bar == NULL) return (ENXIO); - for (idx = 0; idx < sc->sc_nports; idx++) { - value = bus_read_1(bar->b_res, 0x1000 + (idx << 9) - + 0x92); + value = bus_read_1(bar->b_res, 0x1000 + (idx << 9) + + 0x92); bus_write_1(bar->b_res, 0x1000 + (idx << 9) + 0x92, - value | 0x10); + value | 0x10); } - return (0); case PUC_CFG_GET_LEN: *res = 0x200; diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 8ed60a7..5f944b5 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -401,10 +401,10 @@ int nfsrpc_readdirplus(vnode_t, struct uio *, nfsuint64 *, int nfsrpc_commit(vnode_t, u_quad_t, int, struct ucred *, NFSPROC_T *, u_char *, struct nfsvattr *, int *, void *); int nfsrpc_advlock(vnode_t, off_t, int, struct flock *, int, - struct ucred *, NFSPROC_T *); + struct ucred *, NFSPROC_T *, void *, int); int nfsrpc_lockt(struct nfsrv_descript *, vnode_t, struct nfsclclient *, u_int64_t, u_int64_t, struct flock *, - struct ucred *, NFSPROC_T *); + struct ucred *, NFSPROC_T *, void *, int); int nfsrpc_lock(struct nfsrv_descript *, struct nfsmount *, vnode_t, u_int8_t *, int, struct nfscllockowner *, int, int, u_int64_t, u_int64_t, short, struct ucred *, NFSPROC_T *, int); @@ -439,16 +439,16 @@ struct nfsclclient *nfscl_findcl(struct nfsmount *); void nfscl_clientrelease(struct nfsclclient *); void nfscl_freelock(struct nfscllock *, int); int nfscl_getbytelock(vnode_t, u_int64_t, u_int64_t, short, - struct ucred *, NFSPROC_T *, struct nfsclclient *, int, u_int8_t *, - u_int8_t *, struct nfscllockowner **, int *, int *); + struct ucred *, NFSPROC_T *, struct nfsclclient *, int, void *, int, + u_int8_t *, u_int8_t *, struct nfscllockowner **, int *, int *); int nfscl_relbytelock(vnode_t, u_int64_t, u_int64_t, struct ucred *, NFSPROC_T *, int, struct nfsclclient *, - struct nfscllockowner **, int *); + void *, int, struct nfscllockowner **, int *); int nfscl_checkwritelocked(vnode_t, struct flock *, - struct ucred *, NFSPROC_T *); + struct ucred *, NFSPROC_T *, void *, int); void nfscl_lockrelease(struct nfscllockowner *, int, int); void nfscl_fillclid(u_int64_t, char *, u_int8_t *, u_int16_t); -void nfscl_filllockowner(NFSPROC_T *, u_int8_t *); +void nfscl_filllockowner(void *, u_int8_t *, int); void nfscl_freeopen(struct nfsclopen *, int); void nfscl_umount(struct nfsmount *, NFSPROC_T *); void nfscl_renewthread(struct nfsclclient *, NFSPROC_T *); @@ -466,9 +466,10 @@ void nfscl_lockexcl(struct nfsv4lock *, void *); void nfscl_lockunlock(struct nfsv4lock *); void nfscl_lockderef(struct nfsv4lock *); void nfscl_docb(struct nfsrv_descript *, NFSPROC_T *); -void nfscl_releasealllocks(struct nfsclclient *, vnode_t, NFSPROC_T *); +void nfscl_releasealllocks(struct nfsclclient *, vnode_t, NFSPROC_T *, void *, + int); int nfscl_lockt(vnode_t, struct nfsclclient *, u_int64_t, - u_int64_t, struct flock *, NFSPROC_T *); + u_int64_t, struct flock *, NFSPROC_T *, void *, int); int nfscl_mustflush(vnode_t); int nfscl_nodeleg(vnode_t, int); int nfscl_removedeleg(vnode_t, NFSPROC_T *, nfsv4stateid_t *); diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c index 0c3a4c9..4d88bd2 100644 --- a/sys/fs/nfsclient/nfs_clport.c +++ b/sys/fs/nfsclient/nfs_clport.c @@ -500,7 +500,7 @@ nfscl_fillclid(u_int64_t clval, char *uuid, u_int8_t *cp, u_int16_t idlen) * Fill in a lock owner name. For now, pid + the process's creation time. */ void -nfscl_filllockowner(struct thread *td, u_int8_t *cp) +nfscl_filllockowner(void *id, u_int8_t *cp, int flags) { union { u_int32_t lval; @@ -508,37 +508,35 @@ nfscl_filllockowner(struct thread *td, u_int8_t *cp) } tl; struct proc *p; -if (td == NULL) { - printf("NULL td\n"); - bzero(cp, 12); - return; -} - p = td->td_proc; -if (p == NULL) { - printf("NULL pid\n"); - bzero(cp, 12); - return; -} - tl.lval = p->p_pid; - *cp++ = tl.cval[0]; - *cp++ = tl.cval[1]; - *cp++ = tl.cval[2]; - *cp++ = tl.cval[3]; -if (p->p_stats == NULL) { - printf("pstats null\n"); - bzero(cp, 8); - return; -} - tl.lval = p->p_stats->p_start.tv_sec; - *cp++ = tl.cval[0]; - *cp++ = tl.cval[1]; - *cp++ = tl.cval[2]; - *cp++ = tl.cval[3]; - tl.lval = p->p_stats->p_start.tv_usec; - *cp++ = tl.cval[0]; - *cp++ = tl.cval[1]; - *cp++ = tl.cval[2]; - *cp = tl.cval[3]; + if (id == NULL) { + printf("NULL id\n"); + bzero(cp, NFSV4CL_LOCKNAMELEN); + return; + } + if ((flags & F_POSIX) != 0) { + p = (struct proc *)id; + tl.lval = p->p_pid; + *cp++ = tl.cval[0]; + *cp++ = tl.cval[1]; + *cp++ = tl.cval[2]; + *cp++ = tl.cval[3]; + tl.lval = p->p_stats->p_start.tv_sec; + *cp++ = tl.cval[0]; + *cp++ = tl.cval[1]; + *cp++ = tl.cval[2]; + *cp++ = tl.cval[3]; + tl.lval = p->p_stats->p_start.tv_usec; + *cp++ = tl.cval[0]; + *cp++ = tl.cval[1]; + *cp++ = tl.cval[2]; + *cp = tl.cval[3]; + } else if ((flags & F_FLOCK) != 0) { + bcopy(&id, cp, sizeof(id)); + bzero(&cp[sizeof(id)], NFSV4CL_LOCKNAMELEN - sizeof(id)); + } else { + printf("nfscl_filllockowner: not F_POSIX or F_FLOCK\n"); + bzero(cp, NFSV4CL_LOCKNAMELEN); + } } /* @@ -943,6 +941,7 @@ nfscl_getmyip(struct nfsmount *nmp, int *isinet6p) sad.sin_family = AF_INET; sad.sin_len = sizeof (struct sockaddr_in); sad.sin_addr.s_addr = sin->sin_addr.s_addr; + CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred)); rt = rtalloc1((struct sockaddr *)&sad, 0, 0UL); if (rt != NULL) { if (rt->rt_ifp != NULL && @@ -956,6 +955,7 @@ nfscl_getmyip(struct nfsmount *nmp, int *isinet6p) } RTFREE_LOCKED(rt); } + CURVNET_RESTORE(); #ifdef INET6 } else if (nmp->nm_nam->sa_family == AF_INET6) { struct sockaddr_in6 sad6, *sin6; @@ -966,6 +966,7 @@ nfscl_getmyip(struct nfsmount *nmp, int *isinet6p) sad6.sin6_family = AF_INET6; sad6.sin6_len = sizeof (struct sockaddr_in6); sad6.sin6_addr = sin6->sin6_addr; + CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred)); rt = rtalloc1((struct sockaddr *)&sad6, 0, 0UL); if (rt != NULL) { if (rt->rt_ifp != NULL && @@ -980,6 +981,7 @@ nfscl_getmyip(struct nfsmount *nmp, int *isinet6p) } RTFREE_LOCKED(rt); } + CURVNET_RESTORE(); #endif } return (retp); diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 0fc9bfd..5d83d0b 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -3459,7 +3459,7 @@ nfsmout: */ APPLESTATIC int nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, - int reclaim, struct ucred *cred, NFSPROC_T *p) + int reclaim, struct ucred *cred, NFSPROC_T *p, void *id, int flags) { struct nfscllockowner *lp; struct nfsclclient *clp; @@ -3511,11 +3511,11 @@ nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, error = nfscl_getcl(vp, cred, p, &clp); if (error) return (error); - error = nfscl_lockt(vp, clp, off, len, fl, p); + error = nfscl_lockt(vp, clp, off, len, fl, p, id, flags); if (!error) { clidrev = clp->nfsc_clientidrev; error = nfsrpc_lockt(nd, vp, clp, off, len, fl, cred, - p); + p, id, flags); } else if (error == -1) { error = 0; } @@ -3530,7 +3530,7 @@ nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, return (error); do { error = nfscl_relbytelock(vp, off, len, cred, p, callcnt, - clp, &lp, &dorpc); + clp, id, flags, &lp, &dorpc); /* * If it returns a NULL lp, we're done. */ @@ -3538,7 +3538,7 @@ nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, if (callcnt == 0) nfscl_clientrelease(clp); else - nfscl_releasealllocks(clp, vp, p); + nfscl_releasealllocks(clp, vp, p, id, flags); return (error); } if (nmp->nm_clp != NULL) @@ -3572,10 +3572,10 @@ nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, } callcnt++; } while (error == 0 && nd->nd_repstat == 0); - nfscl_releasealllocks(clp, vp, p); + nfscl_releasealllocks(clp, vp, p, id, flags); } else if (op == F_SETLK) { error = nfscl_getbytelock(vp, off, len, fl->l_type, cred, p, - NULL, 0, NULL, NULL, &lp, &newone, &donelocally); + NULL, 0, id, flags, NULL, NULL, &lp, &newone, &donelocally); if (error || donelocally) { return (error); } @@ -3625,7 +3625,7 @@ nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, APPLESTATIC int nfsrpc_lockt(struct nfsrv_descript *nd, vnode_t vp, struct nfsclclient *clp, u_int64_t off, u_int64_t len, struct flock *fl, - struct ucred *cred, NFSPROC_T *p) + struct ucred *cred, NFSPROC_T *p, void *id, int flags) { u_int32_t *tl; int error, type, size; @@ -3643,7 +3643,7 @@ nfsrpc_lockt(struct nfsrv_descript *nd, vnode_t vp, tl += 2; *tl++ = clp->nfsc_clientid.lval[0]; *tl = clp->nfsc_clientid.lval[1]; - nfscl_filllockowner(p, own); + nfscl_filllockowner(id, own, flags); (void) nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN); error = nfscl_request(nd, vp, p, cred, NULL); if (error) diff --git a/sys/fs/nfsclient/nfs_clstate.c b/sys/fs/nfsclient/nfs_clstate.c index 86d71b6..aa81437 100644 --- a/sys/fs/nfsclient/nfs_clstate.c +++ b/sys/fs/nfsclient/nfs_clstate.c @@ -226,7 +226,7 @@ nfscl_open(vnode_t vp, u_int8_t *nfhp, int fhlen, u_int32_t amode, int usedeleg, * If none found, add the new one or return error, depending upon * "create". */ - nfscl_filllockowner(p, own); + nfscl_filllockowner(p->td_proc, own, F_POSIX); NFSLOCKCLSTATE(); dp = NULL; /* First check the delegation list */ @@ -521,7 +521,7 @@ nfscl_getstateid(vnode_t vp, u_int8_t *nfhp, int fhlen, u_int32_t mode, * If p != NULL, we want to search the parentage tree * for a matching OpenOwner and use that. */ - nfscl_filllockowner(p, own); + nfscl_filllockowner(p->td_proc, own, F_POSIX); error = nfscl_getopen(&clp->nfsc_owner, nfhp, fhlen, NULL, p, mode, NULL, &op); if (error == 0) { @@ -596,7 +596,7 @@ nfscl_getopen(struct nfsclownerhead *ohp, u_int8_t *nfhp, int fhlen, op = NULL; while (op == NULL && (nproc != NULL || rown != NULL)) { if (nproc != NULL) { - nfscl_filllockowner(nproc, own); + nfscl_filllockowner(nproc->td_proc, own, F_POSIX); ownp = own; } else { ownp = rown; @@ -881,7 +881,7 @@ nfscl_clientrelease(struct nfsclclient *clp) APPLESTATIC int nfscl_getbytelock(vnode_t vp, u_int64_t off, u_int64_t len, short type, struct ucred *cred, NFSPROC_T *p, struct nfsclclient *rclp, - int recovery, u_int8_t *rownp, u_int8_t *ropenownp, + int recovery, void *id, int flags, u_int8_t *rownp, u_int8_t *ropenownp, struct nfscllockowner **lpp, int *newonep, int *donelocallyp) { struct nfscllockowner *lp; @@ -942,7 +942,7 @@ nfscl_getbytelock(vnode_t vp, u_int64_t off, u_int64_t len, if (recovery) { ownp = rownp; } else { - nfscl_filllockowner(p, own); + nfscl_filllockowner(id, own, flags); ownp = own; } if (!recovery) { @@ -1079,7 +1079,8 @@ nfscl_getbytelock(vnode_t vp, u_int64_t off, u_int64_t len, APPLESTATIC int nfscl_relbytelock(vnode_t vp, u_int64_t off, u_int64_t len, __unused struct ucred *cred, NFSPROC_T *p, int callcnt, - struct nfsclclient *clp, struct nfscllockowner **lpp, int *dorpcp) + struct nfsclclient *clp, void *id, int flags, + struct nfscllockowner **lpp, int *dorpcp) { struct nfscllockowner *lp; struct nfsclowner *owp; @@ -1116,7 +1117,7 @@ nfscl_relbytelock(vnode_t vp, u_int64_t off, u_int64_t len, sizeof (struct nfscllock), M_NFSCLLOCK, M_WAITOK); *other_lop = *nlop; } - nfscl_filllockowner(p, own); + nfscl_filllockowner(id, own, flags); dp = NULL; NFSLOCKCLSTATE(); if (callcnt == 0) @@ -1188,7 +1189,8 @@ nfscl_relbytelock(vnode_t vp, u_int64_t off, u_int64_t len, * Release all lockowners marked in progess for this process and file. */ APPLESTATIC void -nfscl_releasealllocks(struct nfsclclient *clp, vnode_t vp, NFSPROC_T *p) +nfscl_releasealllocks(struct nfsclclient *clp, vnode_t vp, NFSPROC_T *p, + void *id, int flags) { struct nfsclowner *owp; struct nfsclopen *op; @@ -1197,7 +1199,7 @@ nfscl_releasealllocks(struct nfsclclient *clp, vnode_t vp, NFSPROC_T *p) u_int8_t own[NFSV4CL_LOCKNAMELEN]; np = VTONFS(vp); - nfscl_filllockowner(p, own); + nfscl_filllockowner(id, own, flags); NFSLOCKCLSTATE(); LIST_FOREACH(owp, &clp->nfsc_owner, nfsow_list) { LIST_FOREACH(op, &owp->nfsow_open, nfso_list) { @@ -1226,7 +1228,7 @@ nfscl_releasealllocks(struct nfsclclient *clp, vnode_t vp, NFSPROC_T *p) */ APPLESTATIC int nfscl_checkwritelocked(vnode_t vp, struct flock *fl, - struct ucred *cred, NFSPROC_T *p) + struct ucred *cred, NFSPROC_T *p, void *id, int flags) { struct nfsclowner *owp; struct nfscllockowner *lp; @@ -1266,7 +1268,7 @@ nfscl_checkwritelocked(vnode_t vp, struct flock *fl, error = nfscl_getcl(vp, cred, p, &clp); if (error) return (1); - nfscl_filllockowner(p, own); + nfscl_filllockowner(id, own, flags); NFSLOCKCLSTATE(); /* @@ -1641,7 +1643,7 @@ nfscl_cleanup(NFSPROC_T *p) if (!nfscl_inited) return; - nfscl_filllockowner(p, own); + nfscl_filllockowner(p->td_proc, own, F_POSIX); NFSLOCKCLSTATE(); /* @@ -3322,7 +3324,7 @@ nfscl_checkconflict(struct nfscllockownerhead *lhp, struct nfscllock *nlop, */ APPLESTATIC int nfscl_lockt(vnode_t vp, struct nfsclclient *clp, u_int64_t off, - u_int64_t len, struct flock *fl, NFSPROC_T *p) + u_int64_t len, struct flock *fl, NFSPROC_T *p, void *id, int flags) { struct nfscllock *lop, nlck; struct nfscldeleg *dp; @@ -3340,7 +3342,7 @@ nfscl_lockt(vnode_t vp, struct nfsclclient *clp, u_int64_t off, return (NFSERR_INVAL); } np = VTONFS(vp); - nfscl_filllockowner(p, own); + nfscl_filllockowner(id, own, flags); NFSLOCKCLSTATE(); dp = nfscl_finddeleg(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len); error = nfscl_localconflict(clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, @@ -3615,7 +3617,7 @@ nfscl_relock(vnode_t vp, struct nfsclclient *clp, struct nfsmount *nmp, off = lop->nfslo_first; len = lop->nfslo_end - lop->nfslo_first; error = nfscl_getbytelock(vp, off, len, lop->nfslo_type, cred, p, - clp, 1, lp->nfsl_owner, lp->nfsl_openowner, &nlp, &newone, + clp, 1, NULL, 0, lp->nfsl_owner, lp->nfsl_openowner, &nlp, &newone, &donelocally); if (error || donelocally) return (error); diff --git a/sys/fs/nfsclient/nfs_clvnops.c b/sys/fs/nfsclient/nfs_clvnops.c index 984724d..3e1c66d 100644 --- a/sys/fs/nfsclient/nfs_clvnops.c +++ b/sys/fs/nfsclient/nfs_clvnops.c @@ -2884,8 +2884,11 @@ nfs_advlock(struct vop_advlock_args *ap) int ret, error = EOPNOTSUPP; u_quad_t size; - if (NFS_ISV4(vp) && (ap->a_flags & F_POSIX)) { - cred = p->p_ucred; + if (NFS_ISV4(vp) && (ap->a_flags & (F_POSIX | F_FLOCK)) != 0) { + if ((ap->a_flags & F_POSIX) != 0) + cred = p->p_ucred; + else + cred = td->td_ucred; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); @@ -2898,7 +2901,8 @@ nfs_advlock(struct vop_advlock_args *ap) * RFC3530 Sec. 9.3.2. */ if (ap->a_op == F_UNLCK && - nfscl_checkwritelocked(vp, ap->a_fl, cred, td)) + nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id, + ap->a_flags)) (void) ncl_flush(vp, MNT_WAIT, cred, td, 1, 0); /* @@ -2907,7 +2911,7 @@ nfs_advlock(struct vop_advlock_args *ap) */ do { ret = nfsrpc_advlock(vp, np->n_size, ap->a_op, - ap->a_fl, 0, cred, td); + ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags); if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) && ap->a_op == F_SETLK) { VOP_UNLOCK(vp, 0); diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index fbaae89..be603eb 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -290,8 +290,11 @@ topo_probe_0x4(void) * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ - cpuid_count(0x04, 0, p); - max_cores = ((p[0] >> 26) & 0x3f) + 1; + if (cpu_high >= 0x4) { + cpuid_count(0x04, 0, p); + max_cores = ((p[0] >> 26) & 0x3f) + 1; + } else + max_cores = 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; diff --git a/sys/ia64/acpica/acpi_machdep.c b/sys/ia64/acpica/acpi_machdep.c index b7b612f..1466cfe 100644 --- a/sys/ia64/acpica/acpi_machdep.c +++ b/sys/ia64/acpica/acpi_machdep.c @@ -56,7 +56,14 @@ acpi_machdep_quirks(int *quirks) void acpi_cpu_c1() { +#ifdef INVARIANTS + register_t ie; + + ie = intr_disable(); + KASSERT(ie == 0, ("%s called with interrupts enabled\n", __func__)); +#endif ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); + ia64_enable_intr(); } void * diff --git a/sys/ia64/ia64/machdep.c b/sys/ia64/ia64/machdep.c index 7252865..fc7df7a 100644 --- a/sys/ia64/ia64/machdep.c +++ b/sys/ia64/ia64/machdep.c @@ -411,12 +411,34 @@ cpu_halt() void cpu_idle(int busy) { - struct ia64_pal_result res; + register_t ie; - if (cpu_idle_hook != NULL) +#if 0 + if (!busy) { + critical_enter(); + cpu_idleclock(); + } +#endif + + ie = intr_disable(); + KASSERT(ie != 0, ("%s called with interrupts disabled\n", __func__)); + + if (sched_runnable()) + ia64_enable_intr(); + else if (cpu_idle_hook != NULL) { (*cpu_idle_hook)(); - else - res = ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); + /* The hook must enable interrupts! */ + } else { + ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); + ia64_enable_intr(); + } + +#if 0 + if (!busy) { + cpu_activeclock(); + critical_exit(); + } +#endif } int @@ -644,9 +666,12 @@ calculate_frequencies(void) { struct ia64_sal_result sal; struct ia64_pal_result pal; + register_t ie; + ie = intr_disable(); sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0); pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0); + intr_restore(ie); if (sal.sal_status == 0 && pal.pal_status == 0) { if (bootverbose) { diff --git a/sys/ia64/ia64/pal.S b/sys/ia64/ia64/pal.S index 2f0d0da..2e3f4cd 100644 --- a/sys/ia64/ia64/pal.S +++ b/sys/ia64/ia64/pal.S @@ -38,43 +38,40 @@ ia64_pal_entry: .quad 0 * u_int64_t arg1, u_int64_t arg2, u_int64_t arg3) */ ENTRY(ia64_call_pal_static, 4) - - .regstk 4,5,0,0 + + .regstk 4,4,0,0 palret = loc0 entry = loc1 rpsave = loc2 pfssave = loc3 -psrsave = loc4 - alloc pfssave=ar.pfs,4,5,0,0 + alloc pfssave=ar.pfs,4,4,0,0 ;; mov rpsave=rp - movl entry=@gprel(ia64_pal_entry) + 1: mov palret=ip // for return address ;; add entry=entry,gp - mov psrsave=psr + add palret=2f-1b,palret // calculate return address mov r28=in0 // procedure number - ;; - ld8 entry=[entry] // read entry point mov r29=in1 // copy arguments mov r30=in2 mov r31=in3 ;; - mov b6=entry - add palret=2f-1b,palret // calculate return address - ;; + ld8 entry=[entry] // read entry point mov b0=palret - rsm psr.i // disable interrupts + ;; + mov b6=entry ;; br.cond.sptk b6 // call into firmware -2: mov psr.l=psrsave + ;; +2: mov rp=rpsave mov ar.pfs=pfssave ;; - srlz.d br.ret.sptk rp + ;; END(ia64_call_pal_static) /* diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c index 3014b19..abd72c0 100644 --- a/sys/kern/subr_rman.c +++ b/sys/kern/subr_rman.c @@ -839,6 +839,7 @@ int_rman_release_resource(struct rman *rm, struct resource_i *r) * without freeing anything. */ r->r_flags &= ~RF_ALLOCATED; + r->r_dev = NULL; return 0; } diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h index 5faae7c..c9da86a 100644 --- a/sys/netinet/icmp6.h +++ b/sys/netinet/icmp6.h @@ -297,8 +297,9 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 - -#define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */ +#define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */ +#define ND_OPT_RDNSS 25 /* RFC 6016 */ +#define ND_OPT_DNSSL 31 /* RFC 6016 */ struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; @@ -338,6 +339,22 @@ struct nd_opt_route_info { /* route info */ /* prefix follows */ } __packed; +struct nd_opt_rdnss { /* RDNSS option (RFC 6106) */ + u_int8_t nd_opt_rdnss_type; + u_int8_t nd_opt_rdnss_len; + u_int16_t nd_opt_rdnss_reserved; + u_int32_t nd_opt_rdnss_lifetime; + /* followed by list of recursive DNS servers */ +} __packed; + +struct nd_opt_dnssl { /* DNSSL option (RFC 6106) */ + u_int8_t nd_opt_dnssl_type; + u_int8_t nd_opt_dnssl_len; + u_int16_t nd_opt_dnssl_reserved; + u_int32_t nd_opt_dnssl_lifetime; + /* followed by list of DNS search domains */ +} __packed; + /* * icmp6 namelookup */ diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 4aa998f..4eb309a 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/systm.h> @@ -212,7 +213,7 @@ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, - uint32_t inpcbzone_flags) + uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); @@ -227,6 +228,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); +#ifdef PCBGROUP + in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); +#endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); @@ -246,6 +250,9 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); +#ifdef PCBGROUP + in_pcbgroup_destroy(pcbinfo); +#endif uma_zdestroy(pcbinfo->ipi_zone); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); @@ -1053,7 +1060,8 @@ in_pcbdetach(struct inpcb *inp) * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock is already held. + * but where the inpcb lock may already held, or when acquiring a reference + * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to @@ -1223,6 +1231,9 @@ in_pcbdrop(struct inpcb *inp) } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; +#ifdef PCBGROUP + in_pcbgroup_remove(inp); +#endif } } @@ -1472,6 +1483,148 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } #undef INP_LOOKUP_MAPPED_PCB_COST +#ifdef PCBGROUP +/* + * Lookup PCB in hash list, using pcbgroup tables. + */ +static struct inpcb * +in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, + struct in_addr faddr, u_int fport_arg, struct in_addr laddr, + u_int lport_arg, int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. + */ + tmpinp = NULL; + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP4)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. + */ + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_FOREACH(inp, head, inp_pcbgroup_wild) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif /* INET6 */ + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif /* defined(INET6) */ + if (inp != NULL) + goto found; + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + return (inp); +} +#endif /* PCBGROUP */ + /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further @@ -1636,17 +1789,30 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. + * + * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { +#if defined(PCBGROUP) + struct inpcbgroup *pcbgroup; +#endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); +#if defined(PCBGROUP) + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } @@ -1656,12 +1822,28 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { +#ifdef PCBGROUP + struct inpcbgroup *pcbgroup; +#endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); +#ifdef PCBGROUP + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + if (pcbgroup != NULL) + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, + fport, laddr, lport, lookupflags, ifp)); + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } @@ -1670,8 +1852,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, /* * Insert PCB onto various hash lists. */ -int -in_pcbinshash(struct inpcb *inp) +static int +in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; @@ -1721,10 +1903,39 @@ in_pcbinshash(struct inpcb *inp) LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; +#ifdef PCBGROUP + if (do_pcbgroup_update) + in_pcbgroup_update(inp); +#endif return (0); } /* + * For now, there are two public interfaces to insert an inpcb into the hash + * lists -- one that does update pcbgroups, and one that doesn't. The latter + * is used only in the TCP syncache, where in_pcbinshash is called before the + * full 4-tuple is set for the inpcb, and we don't want to install in the + * pcbgroup until later. + * + * XXXRW: This seems like a misfeature. in_pcbinshash should always update + * connection groups, and partially initialised inpcbs should not be exposed + * to either reservation hash tables or pcbgroups. + */ +int +in_pcbinshash(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 1)); +} + +int +in_pcbinshash_nopcbgroup(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 0)); +} + +/* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must @@ -1755,6 +1966,13 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); + +#ifdef PCBGROUP + if (m != NULL) + in_pcbgroup_update_mbuf(inp, m); + else + in_pcbgroup_update(inp); +#endif } void @@ -1791,6 +2009,9 @@ in_pcbremlists(struct inpcb *inp) } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; +#ifdef PCBGROUP + in_pcbgroup_remove(inp); +#endif } /* diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 809bc05..a8524be 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -141,6 +141,7 @@ struct icmp6_filter; * * Key: * (c) - Constant after initialization + * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks @@ -160,9 +161,12 @@ struct icmp6_filter; */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ + struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ + LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -272,13 +276,14 @@ struct inpcbport { * the former covering mutable global fields (such as the global pcb list), * and the latter covering the hashed lookup tables. The lock order is: * - * ipi_lock (before) inpcb locks (before) ipi_hash_lock + * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock - * (h) Read using either ipi_hash_lock or inpcb lock; write requires both. + * (h) Read using either ipi_hash_lock or inpcb lock; write requires both + * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { @@ -312,7 +317,16 @@ struct inpcbinfo { struct uma_zone *ipi_zone; /* (c) */ /* - * Global lock protecting hash lookup tables. + * Connection groups associated with this protocol. These fields are + * constant, but pcbgroup structures themselves are protected by + * per-pcbgroup locks. + */ + struct inpcbgroup *ipi_pcbgroups; /* (c) */ + u_int ipi_npcbgroups; /* (c) */ + u_int ipi_hashfields; /* (c) */ + + /* + * Global lock protecting non-pcbgroup hash lookup tables. */ struct rwlock ipi_hash_lock; @@ -330,6 +344,14 @@ struct inpcbinfo { u_long ipi_porthashmask; /* (h) */ /* + * List of wildcard inpcbs for use with pcbgroups. In the past, was + * per-pcbgroup but is now global. All pcbgroup locks must be held + * to modify the list, so any is sufficient to read it. + */ + struct inpcbhead *ipi_wildbase; /* (p) */ + u_long ipi_wildmask; /* (p) */ + + /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ @@ -340,6 +362,31 @@ struct inpcbinfo { void *ipi_pspare[2]; }; +/* + * Connection groups hold sets of connections that have similar CPU/thread + * affinity. Each connection belongs to exactly one connection group. + */ +struct inpcbgroup { + /* + * Per-connection group hash of inpcbs, hashed by local and foreign + * addresses and port numbers. + */ + struct inpcbhead *ipg_hashbase; /* (c) */ + u_long ipg_hashmask; /* (c) */ + + /* + * Notional affinity of this pcbgroup. + */ + u_int ipg_cpu; /* (p) */ + + /* + * Per-connection group lock, not to be confused with ipi_lock. + * Protects the hash table hung off the group, but also the global + * wildcard list in inpcbinfo. + */ + struct mtx ipg_lock; +} __aligned(CACHE_LINE_SIZE); + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -423,6 +470,14 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ RA_WLOCKED) +#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ + MTX_DEF | MTX_DUPOK) +#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) + +#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) +#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) +#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) + #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ @@ -482,6 +537,7 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, */ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ +#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ /* * Flags passed to in_pcblookup*() functions. @@ -500,6 +556,13 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) +/* + * Constants for pcbinfo.ipi_hashfields. + */ +#define IPI_HASHFIELDS_NONE 0 +#define IPI_HASHFIELDS_2TUPLE 1 +#define IPI_HASHFIELDS_4TUPLE 2 + #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); @@ -531,7 +594,21 @@ VNET_DECLARE(int, ipport_tcpallocs); void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, - int, int, char *, uma_init, uma_fini, uint32_t); + int, int, char *, uma_init, uma_fini, uint32_t, u_int); + +struct inpcbgroup * + in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); +struct inpcbgroup * + in_pcbgroup_byinpcb(struct inpcb *); +struct inpcbgroup * + in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, + struct in_addr, u_short); +void in_pcbgroup_destroy(struct inpcbinfo *); +int in_pcbgroup_enabled(struct inpcbinfo *); +void in_pcbgroup_init(struct inpcbinfo *, u_int, int); +void in_pcbgroup_remove(struct inpcb *); +void in_pcbgroup_update(struct inpcb *); +void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); @@ -551,6 +628,7 @@ void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); +int in_pcbinshash_nopcbgroup(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); diff --git a/sys/netinet/in_pcbgroup.c b/sys/netinet/in_pcbgroup.c new file mode 100644 index 0000000..c9f5c70 --- /dev/null +++ b/sys/netinet/in_pcbgroup.c @@ -0,0 +1,457 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/smp.h> +#include <sys/socketvar.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif /* INET6 */ + +/* + * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's + * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization + * Strategies in Modern Operating Systems". This implementation differs + * significantly from that described in the paper, in that it attempts to + * introduce not just notions of affinity for connections and distribute work + * so as to reduce lock contention, but also align those notions with + * hardware work distribution strategies such as RSS. In this construction, + * connection groups supplement, rather than replace, existing reservation + * tables for protocol 4-tuples, offering CPU-affine lookup tables with + * minimal cache line migration and lock contention during steady state + * operation. + * + * Internet protocols, such as UDP and TCP, register to use connection groups + * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this + * indicates to the connection group code whether a 2-tuple or 4-tuple is + * used as an argument to hashes that assign a connection to a particular + * group. This must be aligned with any hardware offloaded distribution + * model, such as RSS or similar approaches taken in embedded network boards. + * Wildcard sockets require special handling, as in Willman 2006, and are + * shared between connection groups -- while being protected by group-local + * locks. This means that connection establishment and teardown can be + * signficantly more expensive than without connection groups, but that + * steady-state processing can be significantly faster. + * + * Most of the implementation of connection groups is in this file; however, + * connection group lookup is implemented in in_pcb.c alongside reservation + * table lookups -- see in_pcblookup_group(). + * + * TODO: + * + * Implement dynamic rebalancing of buckets with connection groups; when + * load is unevenly distributed, search for more optimal balancing on + * demand. This might require scaling up the number of connection groups + * by <<1. + * + * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection + * groups for ip_input and ip6_input, allowing non-offloaded work + * distribution. + * + * Expose effective CPU affinity of connections to userspace using socket + * options. + * + * Investigate per-connection affinity overrides based on socket options; an + * option could be set, certainly resulting in work being distributed + * differently in software, and possibly propagated to supporting hardware + * with TCAMs or hardware hash tables. This might require connections to + * exist in more than one connection group at a time. + * + * Hook netisr thread reconfiguration events, and propagate those to RSS so + * that rebalancing can occur when the thread pool grows or shrinks. + * + * Expose per-pcbgroup statistics to userspace monitoring tools such as + * netstat, in order to allow better debugging and profiling. + */ + +void +in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, + int hash_nelements) +{ + struct inpcbgroup *pcbgroup; + u_int numpcbgroups, pgn; + + /* + * Only enable connection groups for a protocol if it has been + * specifically requested. + */ + if (hashfields == IPI_HASHFIELDS_NONE) + return; + + /* + * Connection groups are about multi-processor load distribution, + * lock contention, and connection CPU affinity. As such, no point + * in turning them on for a uniprocessor machine, it only wastes + * memory. + */ + if (mp_ncpus == 1) + return; + + /* + * Use one group per CPU for now. If we decide to do dynamic + * rebalancing a la RSS, we'll need to shift left by at least 1. + */ + numpcbgroups = mp_ncpus; + + pcbinfo->ipi_hashfields = hashfields; + pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * + sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); + pcbinfo->ipi_npcbgroups = numpcbgroups; + pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_wildmask); + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, + &pcbgroup->ipg_hashmask); + INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); + + /* + * Initialise notional affinity of the pcbgroup -- for RSS, + * we want the same notion of affinity as NICs to be used. + * Just round robin for the time being. + */ + pcbgroup->ipg_cpu = (pgn % mp_ncpus); + } +} + +void +in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) +{ + struct inpcbgroup *pcbgroup; + u_int pgn; + + if (pcbinfo->ipi_npcbgroups == 0) + return; + + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), + ("in_pcbinfo_destroy: listhead not empty")); + INP_GROUP_LOCK_DESTROY(pcbgroup); + hashdestroy(pcbgroup->ipg_hashbase, M_PCB, + pcbgroup->ipg_hashmask); + } + hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); + free(pcbinfo->ipi_pcbgroups, M_PCB); + pcbinfo->ipi_pcbgroups = NULL; + pcbinfo->ipi_npcbgroups = 0; + pcbinfo->ipi_hashfields = 0; +} + +/* + * Given a hash of whatever the covered tuple might be, return a pcbgroup + * index. + */ +static __inline u_int +in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) +{ + + return (hash % pcbinfo->ipi_npcbgroups); +} + +/* + * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash + * information is insufficient to identify the pcbgroup. + */ +struct inpcbgroup * +in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) +{ + + return (NULL); +} + +static struct inpcbgroup * +in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) +{ + + return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid)); +} + +struct inpcbgroup * +in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, + u_short lport, struct in_addr faddr, u_short fport) +{ + uint32_t hash; + + switch (pcbinfo->ipi_hashfields) { + case IPI_HASHFIELDS_4TUPLE: + hash = faddr.s_addr ^ fport; + break; + + case IPI_HASHFIELDS_2TUPLE: + hash = faddr.s_addr ^ laddr.s_addr; + break; + + default: + hash = 0; + } + return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, + hash)]); +} + +struct inpcbgroup * +in_pcbgroup_byinpcb(struct inpcb *inp) +{ + + return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, + inp->inp_lport, inp->inp_faddr, inp->inp_fport)); +} + +static void +in_pcbwild_add(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcbhead *head; + u_int pgn; + + INP_WLOCK_ASSERT(inp); + KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), + ("%s: is wild",__func__)); + + pcbinfo = inp->inp_pcbinfo; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); + inp->inp_flags2 |= INP_PCBGROUPWILD; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); +} + +static void +in_pcbwild_remove(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + u_int pgn; + + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), + ("%s: not wild", __func__)); + + pcbinfo = inp->inp_pcbinfo; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); + LIST_REMOVE(inp, inp_pcbgroup_wild); + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); + inp->inp_flags2 &= ~INP_PCBGROUPWILD; +} + +static __inline int +in_pcbwild_needed(struct inpcb *inp) +{ + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); + else +#endif + return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); +} + +static void +in_pcbwild_update_internal(struct inpcb *inp) +{ + int wildcard_needed; + + wildcard_needed = in_pcbwild_needed(inp); + if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) + in_pcbwild_add(inp); + else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) + in_pcbwild_remove(inp); +} + +/* + * Update the pcbgroup of an inpcb, which might include removing an old + * pcbgroup reference and/or adding a new one. Wildcard processing is not + * performed here, although ideally we'll never install a pcbgroup for a + * wildcard inpcb (asserted below). + */ +static void +in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, + struct inpcbgroup *newpcbgroup, struct inpcb *inp) +{ + struct inpcbgroup *oldpcbgroup; + struct inpcbhead *pcbhash; + uint32_t hashkey_faddr; + + INP_WLOCK_ASSERT(inp); + + oldpcbgroup = inp->inp_pcbgroup; + if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { + INP_GROUP_LOCK(oldpcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(oldpcbgroup); + } + if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */ + else +#endif + hashkey_faddr = inp->inp_faddr.s_addr; + INP_GROUP_LOCK(newpcbgroup); + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, + newpcbgroup->ipg_hashmask)]; + LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); + inp->inp_pcbgroup = newpcbgroup; + INP_GROUP_UNLOCK(newpcbgroup); + } + + KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), + ("%s: pcbgroup and wildcard!", __func__)); +} + +/* + * Two update paths: one in which the 4-tuple on an inpcb has been updated + * and therefore connection groups may need to change (or a wildcard entry + * may needed to be installed), and another in which the 4-tuple has been + * set as a result of a packet received, in which case we may be able to use + * the hash on the mbuf to avoid doing a software hash calculation for RSS. + * + * In each case: first, let the wildcard code have a go at placing it as a + * wildcard socket. If it was a wildcard, or if the connection has been + * dropped, then no pcbgroup is required (so potentially clear it); + * otherwise, calculate and update the pcbgroup for the inpcb. + */ +void +in_pcbgroup_update(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcbgroup *newpcbgroup; + + INP_WLOCK_ASSERT(inp); + + pcbinfo = inp->inp_pcbinfo; + if (!in_pcbgroup_enabled(pcbinfo)) + return; + + in_pcbwild_update_internal(inp); + if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && + !(inp->inp_flags & INP_DROPPED)) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + newpcbgroup = in6_pcbgroup_byinpcb(inp); + else +#endif + newpcbgroup = in_pcbgroup_byinpcb(inp); + } else + newpcbgroup = NULL; + in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); +} + +void +in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) +{ + struct inpcbinfo *pcbinfo; + struct inpcbgroup *newpcbgroup; + + INP_WLOCK_ASSERT(inp); + + pcbinfo = inp->inp_pcbinfo; + if (!in_pcbgroup_enabled(pcbinfo)) + return; + + /* + * Possibly should assert !INP_PCBGROUPWILD rather than testing for + * it; presumably this function should never be called for anything + * other than non-wildcard socket? + */ + in_pcbwild_update_internal(inp); + if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && + !(inp->inp_flags & INP_DROPPED)) { + newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + if (newpcbgroup == NULL) + newpcbgroup = in6_pcbgroup_byinpcb(inp); + } else { +#endif + if (newpcbgroup == NULL) + newpcbgroup = in_pcbgroup_byinpcb(inp); +#ifdef INET6 + } +#endif + } else + newpcbgroup = NULL; + in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); +} + +/* + * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. + */ +void +in_pcbgroup_remove(struct inpcb *inp) +{ + struct inpcbgroup *pcbgroup; + + INP_WLOCK_ASSERT(inp); + + if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) + return; + + if (inp->inp_flags2 & INP_PCBGROUPWILD) + in_pcbwild_remove(inp); + + pcbgroup = inp->inp_pcbgroup; + if (pcbgroup != NULL) { + INP_GROUP_LOCK(pcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(pcbgroup); + } +} + +/* + * Query whether or not it is appropriate to use pcbgroups to look up inpcbs + * for a protocol. + */ +int +in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) +{ + + return (pcbinfo->ipi_npcbgroups > 0); +} diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 6f5bce7..527ce56 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -153,7 +153,8 @@ div_init(void) * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE); + div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); } static void diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c index b4d3abb..49c48b9 100644 --- a/sys/netinet/ipfw/ip_fw2.c +++ b/sys/netinet/ipfw/ip_fw2.c @@ -692,6 +692,10 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { + /* + * XXXRW: If we had the mbuf here, could use + * in_pcblookup_mbuf(). + */ pcb = (oif) ? in_pcblookup(pi, dst_ip, htons(dst_port), diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c index f81d57d..2347456 100644 --- a/sys/netinet/ipfw/ip_fw_sockopt.c +++ b/sys/netinet/ipfw/ip_fw_sockopt.c @@ -349,12 +349,13 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg) } if (n == 0) { - /* A flush request (arg == 0) on empty ruleset - * returns with no error. On the contrary, + /* A flush request (arg == 0 or cmd == 1) on empty + * ruleset returns with no error. On the contrary, * if there is no match on a specific request, * we return EINVAL. */ - error = (arg == 0) ? 0 : EINVAL; + if (arg != 0 && cmd != 1) + error = EINVAL; break; } diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 635f08f..e754b88 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -205,7 +205,8 @@ rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE); + 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 06854ec..6ed58911 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -300,7 +300,8 @@ tcp_init(void) hashsize = 512; /* safe default */ } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 5125134..66e4732 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/systm.h> @@ -676,8 +677,14 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) #ifdef INET6 } #endif + + /* + * Install in the reservation hash table for now, but don't yet + * install a connection group since the full 4-tuple isn't yet + * configured. + */ inp->inp_lport = sc->sc_inc.inc_lport; - if ((error = in_pcbinshash(inp)) != 0) { + if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index fd864c0..28eb8fd 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -186,7 +186,8 @@ udp_init(void) { in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 9e8e5cd..9558d1b 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -652,8 +652,32 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); - if (error == 0 && ia) + if (error == 0 && ia) { + if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { + /* + * Try to clear the flag when a new + * IPv6 address is added onto an + * IFDISABLED interface and it + * succeeds. + */ + struct in6_ndireq nd; + + memset(&nd, 0, sizeof(nd)); + nd.ndi.flags = ND_IFINFO(ifp)->flags; + nd.ndi.flags &= ~ND6_IFF_IFDISABLED; + if (nd6_ioctl(SIOCSIFINFO_FLAGS, + (caddr_t)&nd, ifp) < 0) + log(LOG_NOTICE, "SIOCAIFADDR_IN6: " + "SIOCSIFINFO_FLAGS for -ifdisabled " + "failed."); + /* + * Ignore failure of clearing the flag + * intentionally. The failure means + * address duplication was detected. + */ + } EVENTHANDLER_INVOKE(ifaddr_event, ifp); + } break; } diff --git a/sys/netinet6/in6.h b/sys/netinet6/in6.h index 32759af..ae0da6a 100644 --- a/sys/netinet6/in6.h +++ b/sys/netinet6/in6.h @@ -611,7 +611,10 @@ struct ip6_mtuinfo { #define IPV6CTL_STEALTH 45 #define ICMPV6CTL_ND6_ONLINKNSRFC4861 47 -#define IPV6CTL_MAXID 48 +#define IPV6CTL_NO_RADR 48 /* No defroute from RA */ +#define IPV6CTL_NORBIT_RAIF 49 /* Disable R-bit in NA on RA + * receiving IF. */ +#define IPV6CTL_MAXID 50 #endif /* __BSD_VISIBLE */ /* diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index da73f21..d15c605 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -70,6 +70,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/systm.h> @@ -827,6 +828,141 @@ in6_rtchange(struct inpcb *inp, int errno) return inp; } +#ifdef PCBGROUP +/* + * Lookup PCB in hash list, using pcbgroup tables. + */ +static struct inpcb * +in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, + struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, + u_int lport_arg, int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + int faith; + + if (faithprefix_p != NULL) + faith = (*faithprefix_p)(laddr); + else + faith = 0; + + /* + * First look for an exact match. + */ + tmpinp = NULL; + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[ + INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && + IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP6)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; + struct inpcb *jail_wild = NULL; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. + */ + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_FOREACH(inp, head, inp_pcbgroup_wild) { + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || + inp->inp_lport != lport) { + continue; + } + + /* XXX inp locking */ + if (faith && (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP6); + if (injail) { + if (prison_check_ip6(inp->inp_cred, + laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { + if (injail) + goto found; + else + local_exact = inp; + } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + + inp = jail_wild; + if (inp == NULL) + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; + if (inp != NULL) + goto found; + } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking buf", __func__); + return (inp); +} +#endif /* PCBGROUP */ + /* * Lookup PCB in hash list. */ @@ -983,16 +1119,30 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. + * + * Possibly more of this logic should be in in6_pcbgroup.c. */ struct inpcb * in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { +#if defined(PCBGROUP) + struct inpcbgroup *pcbgroup; +#endif + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); +#if defined(PCBGROUP) + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } @@ -1002,11 +1152,28 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { +#ifdef PCBGROUP + struct inpcbgroup *pcbgroup; +#endif + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); +#ifdef PCBGROUP + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + if (pcbgroup != NULL) + return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, + fport, laddr, lport, lookupflags, ifp)); + pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } diff --git a/sys/netinet6/in6_pcb.h b/sys/netinet6/in6_pcb.h index cf24704..8398d54 100644 --- a/sys/netinet6/in6_pcb.h +++ b/sys/netinet6/in6_pcb.h @@ -69,6 +69,16 @@ #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) +struct inpcbgroup * + in6_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); +struct inpcbgroup * + in6_pcbgroup_byinpcb __P((struct inpcb *)); +struct inpcbgroup * + in6_pcbgroup_bymbuf(struct inpcbinfo *, struct mbuf *); +struct inpcbgroup * + in6_pcbgroup_bytuple __P((struct inpcbinfo *, const struct in6_addr *, + u_short, const struct in6_addr *, u_short)); + void in6_pcbpurgeif0 __P((struct inpcbinfo *, struct ifnet *)); void in6_losing __P((struct inpcb *)); int in6_pcbbind __P((struct inpcb *, struct sockaddr *, struct ucred *)); diff --git a/sys/netinet6/in6_pcbgroup.c b/sys/netinet6/in6_pcbgroup.c new file mode 100644 index 0000000..850d7f4 --- /dev/null +++ b/sys/netinet6/in6_pcbgroup.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/mbuf.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif /* INET6 */ + +/* + * Given a hash of whatever the covered tuple might be, return a pcbgroup + * index. + */ +static __inline u_int +in6_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) +{ + + return (hash % pcbinfo->ipi_npcbgroups); +} + +/* + * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash + * information is insufficient to identify the pcbgroup. + */ +struct inpcbgroup * +in6_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) +{ + + return (NULL); +} + +struct inpcbgroup * +in6_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) +{ + + return (in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid)); +} + +struct inpcbgroup * +in6_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, const struct in6_addr *laddrp, + u_short lport, const struct in6_addr *faddrp, u_short fport) +{ + uint32_t hash; + + switch (pcbinfo->ipi_hashfields) { + case IPI_HASHFIELDS_4TUPLE: + hash = faddrp->s6_addr32[3] ^ fport; + break; + + case IPI_HASHFIELDS_2TUPLE: + hash = faddrp->s6_addr32[3] ^ laddrp->s6_addr32[3]; + break; + + default: + hash = 0; + } + return (&pcbinfo->ipi_pcbgroups[in6_pcbgroup_getbucket(pcbinfo, + hash)]); +} + +struct inpcbgroup * +in6_pcbgroup_byinpcb(struct inpcb *inp) +{ + + return (in6_pcbgroup_bytuple(inp->inp_pcbinfo, &inp->in6p_laddr, + inp->inp_lport, &inp->in6p_faddr, inp->inp_fport)); +} diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index ab54755..9e78e9a 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -409,6 +409,8 @@ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; +VNET_DEFINE(int, ip6_no_radr) = 0; +VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; @@ -537,6 +539,15 @@ SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 Router" "Advertisement messages"); +SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, + CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, + "Default value of per-interface flag to control whether routers " + "sending ICMPv6 RA messages on that interface are added into the " + "default router list."); +SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_RW, + &VNET_NAME(ip6_norbit_raif), 0, + "Always set 0 to R flag in ICMPv6 NA messages when accepting RA" + " on the interface."); SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith, CTLFLAG_RW, &VNET_NAME(ip6_keepfaith), 0, ""); SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h index 4e44d9f..dbfba9a 100644 --- a/sys/netinet6/ip6_var.h +++ b/sys/netinet6/ip6_var.h @@ -316,6 +316,9 @@ VNET_DECLARE(int, ip6_maxfragpackets); /* Maximum packets in reassembly VNET_DECLARE(int, ip6_maxfrags); /* Maximum fragments in reassembly * queue */ VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */ +VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */ +VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA + * receiving IF. */ VNET_DECLARE(int, ip6_keepfaith); /* Firewall Aided Internet Translator */ VNET_DECLARE(int, ip6_log_interval); VNET_DECLARE(time_t, ip6_log_time); @@ -327,6 +330,8 @@ VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */ #define V_ip6_maxfragpackets VNET(ip6_maxfragpackets) #define V_ip6_maxfrags VNET(ip6_maxfrags) #define V_ip6_accept_rtadv VNET(ip6_accept_rtadv) +#define V_ip6_no_radr VNET(ip6_no_radr) +#define V_ip6_norbit_raif VNET(ip6_norbit_raif) #define V_ip6_keepfaith VNET(ip6_keepfaith) #define V_ip6_log_interval VNET(ip6_log_interval) #define V_ip6_log_time VNET(ip6_log_time) diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c index f1e48ea..2b51e43 100644 --- a/sys/netinet6/nd6.c +++ b/sys/netinet6/nd6.c @@ -193,6 +193,8 @@ nd6_ifattach(struct ifnet *ifp) /* A loopback interface does not need to accept RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_ACCEPT_RTADV; + if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) + nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); @@ -825,7 +827,7 @@ nd6_purge(struct ifnet *ifp) if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); - if (!V_ip6_forwarding && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { + if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select(); } @@ -958,10 +960,9 @@ nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. - * XXX: we restrict the condition to hosts, because routers usually do - * not have the "default router list". */ - if (!V_ip6_forwarding && TAILQ_FIRST(&V_nd_defrouter) == NULL && + if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && + TAILQ_FIRST(&V_nd_defrouter) == NULL && V_nd6_defifindex == ifp->if_index) { return (1); } @@ -1022,8 +1023,7 @@ nd6_free(struct llentry *ln, int gc) ifp = ln->lle_tbl->llt_ifp; - if (!V_ip6_forwarding) { - + if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { dr = defrouter_lookup(&L3_ADDR_SIN6(ln)->sin6_addr, ifp); if (dr != NULL && dr->expire && @@ -1322,6 +1322,16 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) struct ifaddr *ifa; struct in6_ifaddr *ia; + /* + * Try to clear ifdisabled flag when enabling + * accept_rtadv or auto_linklocal. + */ + if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && + !(ND.flags & ND6_IFF_IFDISABLED) && + (ND.flags & (ND6_IFF_ACCEPT_RTADV | + ND6_IFF_AUTO_LINKLOCAL))) + ND.flags &= ~ND6_IFF_IFDISABLED; + if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ @@ -1340,7 +1350,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && - IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { + IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { duplicated_linklocal = 1; break; } @@ -1379,6 +1389,28 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); + } else if ((ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) && + !(ND.flags & ND6_IFF_IFDISABLED)) { + /* + * When the IF already has + * ND6_IFF_AUTO_LINKLOCAL and no link-local + * address is assigned, try to assign one. + */ + int haslinklocal = 0; + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + ia = (struct in6_ifaddr *)ifa; + if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { + haslinklocal = 1; + break; + } + } + IF_ADDR_UNLOCK(ifp); + if (!haslinklocal) + in6_ifattach(ifp, NULL); } } ND_IFINFO(ifp)->flags = ND.flags; @@ -1718,7 +1750,7 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ - if (do_update && router && !V_ip6_forwarding && + if (do_update && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion diff --git a/sys/netinet6/nd6.h b/sys/netinet6/nd6.h index abcfcb7..6f63192 100644 --- a/sys/netinet6/nd6.h +++ b/sys/netinet6/nd6.h @@ -85,6 +85,7 @@ struct nd_ifinfo { */ #define ND6_IFF_DONT_SET_IFROUTE 0x10 #define ND6_IFF_AUTO_LINKLOCAL 0x20 +#define ND6_IFF_NO_RADR 0x40 #define ND6_CREATE LLE_CREATE #define ND6_EXCLUSIVE LLE_EXCLUSIVE diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c index fd5bcf2..fb8e379 100644 --- a/sys/netinet6/nd6_nbr.c +++ b/sys/netinet6/nd6_nbr.c @@ -112,10 +112,14 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) int lladdrlen = 0; int anycast = 0, proxy = 0, tentative = 0; int tlladdr; + int rflag; union nd_opts ndopts; struct sockaddr_dl proxydl; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; + rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0; + if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif) + rflag = 0; #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len,); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); @@ -339,8 +343,7 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) goto bad; nd6_na_output(ifp, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | - (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), - tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL); + rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL); goto freeit; } @@ -349,8 +352,8 @@ nd6_ns_input(struct mbuf *m, int off, int icmp6len) nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | - (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, - tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL); + rflag | ND_NA_FLAG_SOLICITED, tlladdr, + proxy ? (struct sockaddr *)&proxydl : NULL); freeit: if (ifa != NULL) ifa_free(ifa); @@ -862,7 +865,8 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) dr = defrouter_lookup(in6, ln->lle_tbl->llt_ifp); if (dr) defrtrlist_del(dr); - else if (!V_ip6_forwarding) { + else if (ND_IFINFO(ln->lle_tbl->llt_ifp)->flags & + ND6_IFF_ACCEPT_RTADV) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used diff --git a/sys/netinet6/nd6_rtr.c b/sys/netinet6/nd6_rtr.c index 19ec989..e791e2e 100644 --- a/sys/netinet6/nd6_rtr.c +++ b/sys/netinet6/nd6_rtr.c @@ -127,8 +127,11 @@ nd6_rs_input(struct mbuf *m, int off, int icmp6len) union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; - /* If I'm not a router, ignore it. */ - if (!V_ip6_forwarding) + /* + * Accept RS only when V_ip6_forwarding=1 and the interface has + * no ND6_IFF_ACCEPT_RTADV. + */ + if (!V_ip6_forwarding || ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) goto freeit; /* Sanity checks */ @@ -213,11 +216,10 @@ nd6_ra_input(struct mbuf *m, int off, int icmp6len) char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* - * We only accept RAs only when - * the node is not a router and - * per-interface variable allows RAs on the receiving interface. + * We only accept RAs only when the per-interface flag + * ND6_IFF_ACCEPT_RTADV is on the receiving interface. */ - if (V_ip6_forwarding || !(ndi->flags & ND6_IFF_ACCEPT_RTADV)) + if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; if (ip6->ip6_hlim != 255) { @@ -266,7 +268,15 @@ nd6_ra_input(struct mbuf *m, int off, int icmp6len) bzero(&dr0, sizeof(dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; - dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); + /* + * Effectively-disable the route in the RA packet + * when ND6_IFF_NO_RADR on the receiving interface or + * ip6.forwarding=1. + */ + if (ndi->flags & ND6_IFF_NO_RADR || V_ip6_forwarding) + dr0.rtlifetime = 0; + else + dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = time_second + dr0.rtlifetime; dr0.ifp = ifp; /* unspecified or not? (RFC 2461 6.3.4) */ @@ -557,7 +567,7 @@ defrtrlist_del(struct nd_defrouter *dr) * Flush all the routing table entries that use the router * as a next hop. */ - if (!V_ip6_forwarding) + if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV) rt6_flush(&dr->rtaddr, dr->ifp); if (dr->installed) { @@ -616,20 +626,6 @@ defrouter_select(void) struct llentry *ln = NULL; /* - * This function should be called only when acting as an autoconfigured - * host. Although the remaining part of this function is not effective - * if the node is not an autoconfigured host, we explicitly exclude - * such cases here for safety. - */ - if (V_ip6_forwarding) { - nd6log((LOG_WARNING, - "defrouter_select: called unexpectedly (forwarding=%d)\n", - V_ip6_forwarding)); - splx(s); - return; - } - - /* * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ diff --git a/sys/netinet6/send.h b/sys/netinet6/send.h index 36ba571..9795d14 100644 --- a/sys/netinet6/send.h +++ b/sys/netinet6/send.h @@ -33,7 +33,7 @@ #define SND_IN 1 /* Incoming traffic. */ struct sockaddr_send { - unsigned char send_len; /* total length */ + uint8_t send_len; /* total length */ sa_family_t send_family; /* address family */ int send_direction; int send_ifidx; diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 654f145..c2b7081 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -199,7 +199,9 @@ struct mbuf { #define M_PROTO6 0x00080000 /* protocol-specific */ #define M_PROTO7 0x00100000 /* protocol-specific */ #define M_PROTO8 0x00200000 /* protocol-specific */ -#define M_FLOWID 0x00400000 /* flowid is valid */ +#define M_FLOWID 0x00400000 /* deprecated: flowid is valid */ +#define M_HASHTYPEBITS 0x0F000000 /* mask of bits holding flowid hash type */ + /* * For RELENG_{6,7} steal these flags for limited multiple routing table * support. In RELENG_8 and beyond, use just one flag and a tag. @@ -215,11 +217,45 @@ struct mbuf { (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8) /* + * Network interface cards are able to hash protocol fields (such as IPv4 + * addresses and TCP port numbers) classify packets into flows. These flows + * can then be used to maintain ordering while delivering packets to the OS + * via parallel input queues, as well as to provide a stateless affinity + * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set + * m_flag fields to indicate how the hash should be interpreted by the + * network stack. + * + * Most NICs support RSS, which provides ordering and explicit affinity, and + * use the hash m_flag bits to indicate what header fields were covered by + * the hash. M_HASHTYPE_OPAQUE can be set by non-RSS cards or configurations + * that provide an opaque flow identifier, allowing for ordering and + * distribution without explicit affinity. + */ +#define M_HASHTYPE_SHIFT 24 +#define M_HASHTYPE_NONE 0x0 +#define M_HASHTYPE_RSS_IPV4 0x1 /* IPv4 2-tuple */ +#define M_HASHTYPE_RSS_TCP_IPV4 0x2 /* TCPv4 4-tuple */ +#define M_HASHTYPE_RSS_IPV6 0x3 /* IPv6 2-tuple */ +#define M_HASHTYPE_RSS_TCP_IPV6 0x4 /* TCPv6 4-tuple */ +#define M_HASHTYPE_RSS_IPV6_EX 0x5 /* IPv6 2-tuple + ext hdrs */ +#define M_HASHTYPE_RSS_TCP_IPV6_EX 0x6 /* TCPv6 4-tiple + ext hdrs */ +#define M_HASHTYPE_OPAQUE 0xf /* ordering, not affinity */ + +#define M_HASHTYPE_CLEAR(m) (m)->m_flags &= ~(M_HASHTYPEBITS) +#define M_HASHTYPE_GET(m) (((m)->m_flags & M_HASHTYPEBITS) >> \ + M_HASHTYPE_SHIFT) +#define M_HASHTYPE_SET(m, v) do { \ + (m)->m_flags &= ~M_HASHTYPEBITS; \ + (m)->m_flags |= ((v) << M_HASHTYPE_SHIFT); \ +} while (0) +#define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) + +/* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_PROTOFLAGS|M_SKIP_FIREWALL|M_BCAST|M_MCAST|\ - M_FRAG|M_FIRSTFRAG|M_LASTFRAG|M_VLANTAG|M_PROMISC|M_FIB) + M_FRAG|M_FIRSTFRAG|M_LASTFRAG|M_VLANTAG|M_PROMISC|M_FIB|M_HASHTYPEBITS) /* * External buffer types: identify ext_buf type. diff --git a/sys/sys/soundcard.h b/sys/sys/soundcard.h index c4cfc27..a6817df 100644 --- a/sys/sys/soundcard.h +++ b/sys/sys/soundcard.h @@ -311,7 +311,8 @@ typedef struct _snd_capabilities { * IOCTL Commands for /dev/sequencer */ -#define SNDCTL_SEQ_RESET _IO ('Q', 0) +#define SNDCTL_SEQ_HALT _IO ('Q', 0) +#define SNDCTL_SEQ_RESET SNDCTL_SEQ_HALT /* Historic interface */ #define SNDCTL_SEQ_SYNC _IO ('Q', 1) #define SNDCTL_SYNTH_INFO _IOWR('Q', 2, struct synth_info) #define SNDCTL_SEQ_CTRLRATE _IOWR('Q', 3, int) /* Set/get timer res.(hz) */ diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 5b8f3c8..7f5d1b4 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -420,13 +420,13 @@ nospace: */ if (reclaimed == 0) { reclaimed = 1; - softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); if (bp) { brelse(bp); bp = NULL; } UFS_LOCK(ump); + softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); goto retry; } UFS_UNLOCK(ump); @@ -2356,8 +2356,8 @@ ffs_fserr(fs, inum, cp) * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. - * adjblkcnt(inode, amt) - adjust the number of blocks used to - * by the specifed amount. + * adjblkcnt(inode, amt) - adjust the number of blocks used by the + * inode by the specified amount. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] |