author | grehan <grehan@FreeBSD.org> | 2011-06-28 06:26:03 +0000 |
---|---|---|
committer | grehan <grehan@FreeBSD.org> | 2011-06-28 06:26:03 +0000 |
commit | 2c6741be0f59191f2283eb268e4f7690399d578a (patch) | |
tree | b139c8c6dcca4fa284815daade405b75886ee360 /sys/netinet | |
parent | 3c35264f695e0a1f8a04dbcca1c93bb5159b2274 (diff) | |
parent | 19ae02bba572390c7299166228d31e54003e094a (diff) | |
IFC @ r222830
Diffstat (limited to 'sys/netinet')
38 files changed, 3195 insertions, 1209 deletions
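
One note before the per-file hunks: the bulk of the in_pcb.c / in_pcb.h changes in this merge replace the old pattern of taking the global pcbinfo lock and calling in_pcblookup_hash() with new in_pcblookup() / in_pcblookup_mbuf() entry points. Callers pass INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB and receive an inpcb that is already referenced and locked, with the lookup optionally served from the new PCBGROUP connection-group tables. The fragment below is a minimal sketch of that calling convention, not part of the commit; the example_lookup() wrapper and its parameters are hypothetical, and ports are assumed to be in network byte order as in the in-tree callers.

```c
#include <sys/param.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>

/*
 * Illustrative sketch of the lookup API added by this merge.  The caller
 * no longer holds the global INP_INFO lock around the lookup; the inpcb
 * returned by in_pcblookup() is referenced and locked according to the
 * INPLOOKUP_*LOCKPCB flag, and the caller drops that lock when done.
 */
static int
example_lookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport,
    struct in_addr laddr, u_short lport, struct ifnet *ifp)
{
	struct inpcb *inp;

	inp = in_pcblookup(pcbinfo, faddr, fport, laddr, lport,
	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp);
	if (inp == NULL)
		return (0);

	/* inp is read-locked here; consume it, then release the lock. */
	INP_RUNLOCK(inp);
	return (1);
}
```

The same convention appears throughout the diff below, for example in ipfw's check_uidgid() and the O_SETFIB/so_user_cookie handling, which now use in_pcblookup()/in_pcblookup_mbuf() with INPLOOKUP_RLOCKPCB instead of INP_INFO_RLOCK() plus in_pcblookup_hash().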
diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h index 5faae7c..c9da86a 100644 --- a/sys/netinet/icmp6.h +++ b/sys/netinet/icmp6.h @@ -297,8 +297,9 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 - -#define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */ +#define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */ +#define ND_OPT_RDNSS 25 /* RFC 6016 */ +#define ND_OPT_DNSSL 31 /* RFC 6016 */ struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; @@ -338,6 +339,22 @@ struct nd_opt_route_info { /* route info */ /* prefix follows */ } __packed; +struct nd_opt_rdnss { /* RDNSS option (RFC 6106) */ + u_int8_t nd_opt_rdnss_type; + u_int8_t nd_opt_rdnss_len; + u_int16_t nd_opt_rdnss_reserved; + u_int32_t nd_opt_rdnss_lifetime; + /* followed by list of recursive DNS servers */ +} __packed; + +struct nd_opt_dnssl { /* DNSSL option (RFC 6106) */ + u_int8_t nd_opt_dnssl_type; + u_int8_t nd_opt_dnssl_len; + u_int16_t nd_opt_dnssl_reserved; + u_int32_t nd_opt_dnssl_lifetime; + /* followed by list of DNS search domains */ +} __packed; + /* * icmp6 namelookup */ diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 3afdc7d..6a66c05 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -759,7 +759,7 @@ match: } } else LLE_WUNLOCK(la); - } /* end of FIB loop */ + } reply: if (op != ARPOP_REQUEST) goto drop; diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 684d808..7ae8477 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -548,7 +548,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * is the same as before, then the call is * un-necessarily executed here. */ - in_ifscrub(ifp, ia, 0); + in_ifscrub(ifp, ia, LLE_STATIC); ia->ia_sockmask = ifra->ifra_mask; ia->ia_sockmask.sin_family = AF_INET; ia->ia_subnetmask = @@ -557,7 +557,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } if ((ifp->if_flags & IFF_POINTOPOINT) && (ifra->ifra_dstaddr.sin_family == AF_INET)) { - in_ifscrub(ifp, ia, 0); + in_ifscrub(ifp, ia, LLE_STATIC); ia->ia_dstaddr = ifra->ifra_dstaddr; maskIsNew = 1; /* We lie; but the effect's the same */ } @@ -1179,14 +1179,20 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) && (ia->ia_ifp->if_type != IFT_CARP)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); - rtinit(&(target->ia_ifa), (int)RTM_DELETE, + error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); - target->ia_flags &= ~IFA_ROUTE; - + if (error == 0) + target->ia_flags &= ~IFA_ROUTE; + else + log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", + error); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; + else + log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", + error); ifa_free(&ia->ia_ifa); return (error); } @@ -1210,9 +1216,12 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) /* * As no-one seem to have this prefix, we can remove the route. 
*/ - rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); - target->ia_flags &= ~IFA_ROUTE; - return (0); + error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); + if (error == 0) + target->ia_flags &= ~IFA_ROUTE; + else + log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); + return (error); } #undef rtinitflags diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 85e31dc..4eb309a 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/systm.h> @@ -128,8 +129,12 @@ static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); - #ifdef INET +static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, + struct in_addr faddr, u_int fport_arg, + struct in_addr laddr, u_int lport_arg, + int lookupflags, struct ifnet *ifp); + #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } @@ -208,19 +213,24 @@ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, - uint32_t inpcbzone_flags) + uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); + INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); + pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); +#ifdef PCBGROUP + in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); +#endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); @@ -234,10 +244,17 @@ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { + KASSERT(pcbinfo->ipi_count == 0, + ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); + hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); +#ifdef PCBGROUP + in_pcbgroup_destroy(pcbinfo); +#endif uma_zdestroy(pcbinfo->ipi_zone); + INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } @@ -309,8 +326,8 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); @@ -351,8 +368,8 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ - INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ @@ -473,11 +490,10 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, int error; /* - * Because no actual state changes occur here, a global write lock on - * the pcbinfo isn't required. + * No state changes, so read locks are sufficient here. 
*/ - INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); @@ -612,14 +628,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, * then pick one. */ int -in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, + struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; @@ -645,13 +662,20 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; - in_pcbrehash(inp); + in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } +int +in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +{ + + return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); +} + /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. @@ -907,8 +931,8 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ - INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; @@ -983,8 +1007,8 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, if (error) return (error); } - oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, - 0, NULL); + oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, + laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; @@ -1007,8 +1031,8 @@ void in_pcbdisconnect(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; @@ -1036,7 +1060,8 @@ in_pcbdetach(struct inpcb *inp) * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock is already held. + * but where the inpcb lock may already held, or when acquiring a reference + * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to @@ -1187,20 +1212,28 @@ void in_pcbdrop(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + /* + * XXXRW: Possibly we should protect the setting of INP_DROPPED with + * the hash lock...? + */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; + INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } + INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; +#ifdef PCBGROUP + in_pcbgroup_remove(inp); +#endif } } @@ -1328,7 +1361,8 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) } /* - * Lookup a PCB based on the local address and port. 
+ * Lookup a PCB based on the local address and port. Caller must hold the + * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * @@ -1346,7 +1380,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - INP_INFO_LOCK_ASSERT(pcbinfo); + INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; @@ -1449,11 +1483,155 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } #undef INP_LOOKUP_MAPPED_PCB_COST +#ifdef PCBGROUP /* - * Lookup PCB in hash list. + * Lookup PCB in hash list, using pcbgroup tables. */ -struct inpcb * -in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, +static struct inpcb * +in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, + struct in_addr faddr, u_int fport_arg, struct in_addr laddr, + u_int lport_arg, int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. + */ + tmpinp = NULL; + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP4)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. 
+ */ + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_FOREACH(inp, head, inp_pcbgroup_wild) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif /* INET6 */ + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif /* defined(INET6) */ + if (inp != NULL) + goto found; + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + return (inp); +} +#endif /* PCBGROUP */ + +/* + * Lookup PCB in hash list, using pcbinfo tables. This variation assumes + * that the caller has locked the hash list, and will not perform any further + * locking or reference operations on either the hash list or the connection. + */ +static struct inpcb * +in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { @@ -1464,7 +1642,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - INP_INFO_LOCK_ASSERT(pcbinfo); + INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. @@ -1574,13 +1752,108 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, return (NULL); } + +/* + * Lookup PCB in hash list, using pcbinfo tables. This variation locks the + * hash list lock, and will return the inpcb locked (i.e., requires + * INPLOOKUP_LOCKPCB). 
+ */ +static struct inpcb * +in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport, struct in_addr laddr, u_int lport, int lookupflags, + struct ifnet *ifp) +{ + struct inpcb *inp; + + INP_HASH_RLOCK(pcbinfo); + inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + if (inp != NULL) { + in_pcbref(inp); + INP_HASH_RUNLOCK(pcbinfo); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + } else + INP_HASH_RUNLOCK(pcbinfo); + return (inp); +} + +/* + * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf + * from which a pre-calculated hash value may be extracted. + * + * Possibly more of this logic should be in in_pcbgroup.c. + */ +struct inpcb * +in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, + struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) +{ +#if defined(PCBGROUP) + struct inpcbgroup *pcbgroup; +#endif + + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + ("%s: LOCKPCB not set", __func__)); + +#if defined(PCBGROUP) + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif + return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} + +struct inpcb * +in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport, struct in_addr laddr, u_int lport, int lookupflags, + struct ifnet *ifp, struct mbuf *m) +{ +#ifdef PCBGROUP + struct inpcbgroup *pcbgroup; +#endif + + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + ("%s: LOCKPCB not set", __func__)); + +#ifdef PCBGROUP + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + if (pcbgroup != NULL) + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, + fport, laddr, lport, lookupflags, ifp)); + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif + return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} #endif /* INET */ /* * Insert PCB onto various hash lists. 
*/ -int -in_pcbinshash(struct inpcb *inp) +static int +in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; @@ -1588,8 +1861,9 @@ in_pcbinshash(struct inpcb *inp) struct inpcbport *phd; u_int32_t hashkey_faddr; - INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); @@ -1629,24 +1903,54 @@ in_pcbinshash(struct inpcb *inp) LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; +#ifdef PCBGROUP + if (do_pcbgroup_update) + in_pcbgroup_update(inp); +#endif return (0); } /* + * For now, there are two public interfaces to insert an inpcb into the hash + * lists -- one that does update pcbgroups, and one that doesn't. The latter + * is used only in the TCP syncache, where in_pcbinshash is called before the + * full 4-tuple is set for the inpcb, and we don't want to install in the + * pcbgroup until later. + * + * XXXRW: This seems like a misfeature. in_pcbinshash should always update + * connection groups, and partially initialised inpcbs should not be exposed + * to either reservation hash tables or pcbgroups. + */ +int +in_pcbinshash(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 1)); +} + +int +in_pcbinshash_nopcbgroup(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 0)); +} + +/* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void -in_pcbrehash(struct inpcb *inp) +in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; - INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); @@ -1662,6 +1966,20 @@ in_pcbrehash(struct inpcb *inp) LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); + +#ifdef PCBGROUP + if (m != NULL) + in_pcbgroup_update_mbuf(inp, m); + else + in_pcbgroup_update(inp); +#endif +} + +void +in_pcbrehash(struct inpcb *inp) +{ + + in_pcbrehash_mbuf(inp, NULL); } /* @@ -1679,16 +1997,21 @@ in_pcbremlists(struct inpcb *inp) if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; + INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } + INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; +#ifdef PCBGROUP + in_pcbgroup_remove(inp); +#endif } /* diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 14d4ea2..dfef963 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -44,6 +44,7 @@ #include <sys/_rwlock.h> #ifdef _KERNEL +#include <sys/lock.h> #include <sys/rwlock.h> #include <net/vnet.h> #include <vm/uma.h> @@ -141,6 +142,7 @@ struct icmp6_filter; * * Key: * (c) - Constant after initialization + * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks @@ -160,9 +162,12 @@ struct icmp6_filter; */ struct inpcb { 
LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ + struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ + LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -268,22 +273,23 @@ struct inpcbport { * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * - * Each pcbinfo is protected by ipi_lock, covering mutable global fields (such - * as the global pcb list) and hashed lookup tables. The lock order is: + * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, + * the former covering mutable global fields (such as the global pcb list), + * and the latter covering the hashed lookup tables. The lock order is: * - * ipi_lock (before) inpcb locks + * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock - * (h) Read using either ipi_lock or inpcb lock; write requires both. + * (h) Read using either ipi_hash_lock or inpcb lock; write requires both + * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting global inpcb list, inpcb count, hash tables, - * etc. + * Global lock protecting global inpcb list, inpcb count, etc. */ struct rwlock ipi_lock; @@ -312,17 +318,39 @@ struct inpcbinfo { struct uma_zone *ipi_zone; /* (c) */ /* + * Connection groups associated with this protocol. These fields are + * constant, but pcbgroup structures themselves are protected by + * per-pcbgroup locks. + */ + struct inpcbgroup *ipi_pcbgroups; /* (c) */ + u_int ipi_npcbgroups; /* (c) */ + u_int ipi_hashfields; /* (c) */ + + /* + * Global lock protecting non-pcbgroup hash lookup tables. + */ + struct rwlock ipi_hash_lock; + + /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ - struct inpcbhead *ipi_hashbase; /* (g) */ - u_long ipi_hashmask; /* (g) */ + struct inpcbhead *ipi_hashbase; /* (h) */ + u_long ipi_hashmask; /* (h) */ /* * Global hash of inpcbs, hashed by only local port number. */ - struct inpcbporthead *ipi_porthashbase; /* (g) */ - u_long ipi_porthashmask; /* (g) */ + struct inpcbporthead *ipi_porthashbase; /* (h) */ + u_long ipi_porthashmask; /* (h) */ + + /* + * List of wildcard inpcbs for use with pcbgroups. In the past, was + * per-pcbgroup but is now global. All pcbgroup locks must be held + * to modify the list, so any is sufficient to read it. + */ + struct inpcbhead *ipi_wildbase; /* (p) */ + u_long ipi_wildmask; /* (p) */ /* * Pointer to network stack instance @@ -335,6 +363,31 @@ struct inpcbinfo { void *ipi_pspare[2]; }; +/* + * Connection groups hold sets of connections that have similar CPU/thread + * affinity. Each connection belongs to exactly one connection group. + */ +struct inpcbgroup { + /* + * Per-connection group hash of inpcbs, hashed by local and foreign + * addresses and port numbers. + */ + struct inpcbhead *ipg_hashbase; /* (c) */ + u_long ipg_hashmask; /* (c) */ + + /* + * Notional affinity of this pcbgroup. 
+ */ + u_int ipg_cpu; /* (p) */ + + /* + * Per-connection group lock, not to be confused with ipi_lock. + * Protects the hash table hung off the group, but also the global + * wildcard list in inpcbinfo. + */ + struct mtx ipg_lock; +} __aligned(CACHE_LINE_SIZE); + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -406,6 +459,26 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_HASH_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) +#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) +#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ + RA_LOCKED) +#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ + RA_WLOCKED) + +#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ + MTX_DEF | MTX_DUPOK) +#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) + +#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) +#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) +#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) + #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ @@ -465,8 +538,18 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, */ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ +#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ + +/* + * Flags passed to in_pcblookup*() functions. + */ +#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ +#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ +#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ + +#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ + INPLOOKUP_WLOCKPCB) -#define INPLOOKUP_WILDCARD 1 #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ @@ -474,6 +557,13 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) +/* + * Constants for pcbinfo.ipi_hashfields. 
+ */ +#define IPI_HASHFIELDS_NONE 0 +#define IPI_HASHFIELDS_2TUPLE 1 +#define IPI_HASHFIELDS_4TUPLE 2 + #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); @@ -505,7 +595,21 @@ VNET_DECLARE(int, ipport_tcpallocs); void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, - int, int, char *, uma_init, uma_fini, uint32_t); + int, int, char *, uma_init, uma_fini, uint32_t, u_int); + +struct inpcbgroup * + in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); +struct inpcbgroup * + in_pcbgroup_byinpcb(struct inpcb *); +struct inpcbgroup * + in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, + struct in_addr, u_short); +void in_pcbgroup_destroy(struct inpcbinfo *); +int in_pcbgroup_enabled(struct inpcbinfo *); +void in_pcbgroup_init(struct inpcbinfo *, u_int, int); +void in_pcbgroup_remove(struct inpcb *); +void in_pcbgroup_update(struct inpcb *); +void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); @@ -515,6 +619,8 @@ int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); +int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *, + struct mbuf *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); @@ -523,16 +629,21 @@ void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); +int in_pcbinshash_nopcbgroup(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * - in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, + in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); +struct inpcb * + in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, + struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); +void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); diff --git a/sys/netinet/in_pcbgroup.c b/sys/netinet/in_pcbgroup.c new file mode 100644 index 0000000..c9f5c70 --- /dev/null +++ b/sys/netinet/in_pcbgroup.c @@ -0,0 +1,457 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/smp.h> +#include <sys/socketvar.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif /* INET6 */ + +/* + * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's + * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization + * Strategies in Modern Operating Systems". This implementation differs + * significantly from that described in the paper, in that it attempts to + * introduce not just notions of affinity for connections and distribute work + * so as to reduce lock contention, but also align those notions with + * hardware work distribution strategies such as RSS. In this construction, + * connection groups supplement, rather than replace, existing reservation + * tables for protocol 4-tuples, offering CPU-affine lookup tables with + * minimal cache line migration and lock contention during steady state + * operation. + * + * Internet protocols, such as UDP and TCP, register to use connection groups + * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this + * indicates to the connection group code whether a 2-tuple or 4-tuple is + * used as an argument to hashes that assign a connection to a particular + * group. This must be aligned with any hardware offloaded distribution + * model, such as RSS or similar approaches taken in embedded network boards. + * Wildcard sockets require special handling, as in Willman 2006, and are + * shared between connection groups -- while being protected by group-local + * locks. This means that connection establishment and teardown can be + * signficantly more expensive than without connection groups, but that + * steady-state processing can be significantly faster. + * + * Most of the implementation of connection groups is in this file; however, + * connection group lookup is implemented in in_pcb.c alongside reservation + * table lookups -- see in_pcblookup_group(). + * + * TODO: + * + * Implement dynamic rebalancing of buckets with connection groups; when + * load is unevenly distributed, search for more optimal balancing on + * demand. This might require scaling up the number of connection groups + * by <<1. + * + * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection + * groups for ip_input and ip6_input, allowing non-offloaded work + * distribution. + * + * Expose effective CPU affinity of connections to userspace using socket + * options. 
+ * + * Investigate per-connection affinity overrides based on socket options; an + * option could be set, certainly resulting in work being distributed + * differently in software, and possibly propagated to supporting hardware + * with TCAMs or hardware hash tables. This might require connections to + * exist in more than one connection group at a time. + * + * Hook netisr thread reconfiguration events, and propagate those to RSS so + * that rebalancing can occur when the thread pool grows or shrinks. + * + * Expose per-pcbgroup statistics to userspace monitoring tools such as + * netstat, in order to allow better debugging and profiling. + */ + +void +in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, + int hash_nelements) +{ + struct inpcbgroup *pcbgroup; + u_int numpcbgroups, pgn; + + /* + * Only enable connection groups for a protocol if it has been + * specifically requested. + */ + if (hashfields == IPI_HASHFIELDS_NONE) + return; + + /* + * Connection groups are about multi-processor load distribution, + * lock contention, and connection CPU affinity. As such, no point + * in turning them on for a uniprocessor machine, it only wastes + * memory. + */ + if (mp_ncpus == 1) + return; + + /* + * Use one group per CPU for now. If we decide to do dynamic + * rebalancing a la RSS, we'll need to shift left by at least 1. + */ + numpcbgroups = mp_ncpus; + + pcbinfo->ipi_hashfields = hashfields; + pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * + sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); + pcbinfo->ipi_npcbgroups = numpcbgroups; + pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_wildmask); + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, + &pcbgroup->ipg_hashmask); + INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); + + /* + * Initialise notional affinity of the pcbgroup -- for RSS, + * we want the same notion of affinity as NICs to be used. + * Just round robin for the time being. + */ + pcbgroup->ipg_cpu = (pgn % mp_ncpus); + } +} + +void +in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) +{ + struct inpcbgroup *pcbgroup; + u_int pgn; + + if (pcbinfo->ipi_npcbgroups == 0) + return; + + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { + pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; + KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead), + ("in_pcbinfo_destroy: listhead not empty")); + INP_GROUP_LOCK_DESTROY(pcbgroup); + hashdestroy(pcbgroup->ipg_hashbase, M_PCB, + pcbgroup->ipg_hashmask); + } + hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); + free(pcbinfo->ipi_pcbgroups, M_PCB); + pcbinfo->ipi_pcbgroups = NULL; + pcbinfo->ipi_npcbgroups = 0; + pcbinfo->ipi_hashfields = 0; +} + +/* + * Given a hash of whatever the covered tuple might be, return a pcbgroup + * index. + */ +static __inline u_int +in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) +{ + + return (hash % pcbinfo->ipi_npcbgroups); +} + +/* + * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash + * information is insufficient to identify the pcbgroup. 
+ */ +struct inpcbgroup * +in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) +{ + + return (NULL); +} + +static struct inpcbgroup * +in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) +{ + + return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid)); +} + +struct inpcbgroup * +in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, + u_short lport, struct in_addr faddr, u_short fport) +{ + uint32_t hash; + + switch (pcbinfo->ipi_hashfields) { + case IPI_HASHFIELDS_4TUPLE: + hash = faddr.s_addr ^ fport; + break; + + case IPI_HASHFIELDS_2TUPLE: + hash = faddr.s_addr ^ laddr.s_addr; + break; + + default: + hash = 0; + } + return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, + hash)]); +} + +struct inpcbgroup * +in_pcbgroup_byinpcb(struct inpcb *inp) +{ + + return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, + inp->inp_lport, inp->inp_faddr, inp->inp_fport)); +} + +static void +in_pcbwild_add(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcbhead *head; + u_int pgn; + + INP_WLOCK_ASSERT(inp); + KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), + ("%s: is wild",__func__)); + + pcbinfo = inp->inp_pcbinfo; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); + inp->inp_flags2 |= INP_PCBGROUPWILD; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); +} + +static void +in_pcbwild_remove(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + u_int pgn; + + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD), + ("%s: not wild", __func__)); + + pcbinfo = inp->inp_pcbinfo; + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); + LIST_REMOVE(inp, inp_pcbgroup_wild); + for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) + INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]); + inp->inp_flags2 &= ~INP_PCBGROUPWILD; +} + +static __inline int +in_pcbwild_needed(struct inpcb *inp) +{ + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)); + else +#endif + return (inp->inp_faddr.s_addr == htonl(INADDR_ANY)); +} + +static void +in_pcbwild_update_internal(struct inpcb *inp) +{ + int wildcard_needed; + + wildcard_needed = in_pcbwild_needed(inp); + if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD)) + in_pcbwild_add(inp); + else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD)) + in_pcbwild_remove(inp); +} + +/* + * Update the pcbgroup of an inpcb, which might include removing an old + * pcbgroup reference and/or adding a new one. Wildcard processing is not + * performed here, although ideally we'll never install a pcbgroup for a + * wildcard inpcb (asserted below). 
+ */ +static void +in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, + struct inpcbgroup *newpcbgroup, struct inpcb *inp) +{ + struct inpcbgroup *oldpcbgroup; + struct inpcbhead *pcbhash; + uint32_t hashkey_faddr; + + INP_WLOCK_ASSERT(inp); + + oldpcbgroup = inp->inp_pcbgroup; + if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) { + INP_GROUP_LOCK(oldpcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(oldpcbgroup); + } + if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */ + else +#endif + hashkey_faddr = inp->inp_faddr.s_addr; + INP_GROUP_LOCK(newpcbgroup); + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, + newpcbgroup->ipg_hashmask)]; + LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); + inp->inp_pcbgroup = newpcbgroup; + INP_GROUP_UNLOCK(newpcbgroup); + } + + KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)), + ("%s: pcbgroup and wildcard!", __func__)); +} + +/* + * Two update paths: one in which the 4-tuple on an inpcb has been updated + * and therefore connection groups may need to change (or a wildcard entry + * may needed to be installed), and another in which the 4-tuple has been + * set as a result of a packet received, in which case we may be able to use + * the hash on the mbuf to avoid doing a software hash calculation for RSS. + * + * In each case: first, let the wildcard code have a go at placing it as a + * wildcard socket. If it was a wildcard, or if the connection has been + * dropped, then no pcbgroup is required (so potentially clear it); + * otherwise, calculate and update the pcbgroup for the inpcb. + */ +void +in_pcbgroup_update(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcbgroup *newpcbgroup; + + INP_WLOCK_ASSERT(inp); + + pcbinfo = inp->inp_pcbinfo; + if (!in_pcbgroup_enabled(pcbinfo)) + return; + + in_pcbwild_update_internal(inp); + if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && + !(inp->inp_flags & INP_DROPPED)) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + newpcbgroup = in6_pcbgroup_byinpcb(inp); + else +#endif + newpcbgroup = in_pcbgroup_byinpcb(inp); + } else + newpcbgroup = NULL; + in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); +} + +void +in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m) +{ + struct inpcbinfo *pcbinfo; + struct inpcbgroup *newpcbgroup; + + INP_WLOCK_ASSERT(inp); + + pcbinfo = inp->inp_pcbinfo; + if (!in_pcbgroup_enabled(pcbinfo)) + return; + + /* + * Possibly should assert !INP_PCBGROUPWILD rather than testing for + * it; presumably this function should never be called for anything + * other than non-wildcard socket? + */ + in_pcbwild_update_internal(inp); + if (!(inp->inp_flags2 & INP_PCBGROUPWILD) && + !(inp->inp_flags & INP_DROPPED)) { + newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + if (newpcbgroup == NULL) + newpcbgroup = in6_pcbgroup_byinpcb(inp); + } else { +#endif + if (newpcbgroup == NULL) + newpcbgroup = in_pcbgroup_byinpcb(inp); +#ifdef INET6 + } +#endif + } else + newpcbgroup = NULL; + in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp); +} + +/* + * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb. 
+ */ +void +in_pcbgroup_remove(struct inpcb *inp) +{ + struct inpcbgroup *pcbgroup; + + INP_WLOCK_ASSERT(inp); + + if (!in_pcbgroup_enabled(inp->inp_pcbinfo)) + return; + + if (inp->inp_flags2 & INP_PCBGROUPWILD) + in_pcbwild_remove(inp); + + pcbgroup = inp->inp_pcbgroup; + if (pcbgroup != NULL) { + INP_GROUP_LOCK(pcbgroup); + LIST_REMOVE(inp, inp_pcbgrouphash); + inp->inp_pcbgroup = NULL; + INP_GROUP_UNLOCK(pcbgroup); + } +} + +/* + * Query whether or not it is appropriate to use pcbgroups to look up inpcbs + * for a protocol. + */ +int +in_pcbgroup_enabled(struct inpcbinfo *pcbinfo) +{ + + return (pcbinfo->ipi_npcbgroups > 0); +} diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 2827c22..d2a772f 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -106,6 +106,8 @@ static struct pr_usrreqs nousrreqs; #include <net/if_pfsync.h> #endif +FEATURE(inet, "Internet Protocol version 4"); + extern struct domain inetdomain; /* Spacer for loadable protocols. */ diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index de88556..527ce56 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -153,7 +153,8 @@ div_init(void) * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE); + div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); } static void @@ -530,7 +531,9 @@ div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; @@ -659,9 +662,9 @@ div_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_divcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_divcbinfo); diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index ac1c723..67fcb74 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -488,7 +488,7 @@ tooshort: } #ifdef IPSEC /* - * Bypass packet filtering for packets from a tunnel (gif). + * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; diff --git a/sys/netinet/ip_ipsec.c b/sys/netinet/ip_ipsec.c index 50a6ce4..a3c87f5 100644 --- a/sys/netinet/ip_ipsec.c +++ b/sys/netinet/ip_ipsec.c @@ -95,7 +95,7 @@ ip_ipsec_filtertunnel(struct mbuf *m) #if defined(IPSEC) /* - * Bypass packet filtering for packets from a tunnel. + * Bypass packet filtering for packets previously handled by IPsec. */ if (!V_ip4_ipsec_filtertunnel && m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL) diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c index ba6e892..e23ba3a 100644 --- a/sys/netinet/ipfw/ip_dummynet.c +++ b/sys/netinet/ipfw/ip_dummynet.c @@ -1045,7 +1045,7 @@ config_red(struct dn_fsk *fs) fs->w_q = fs->fs.w_q; fs->max_p = fs->fs.max_p; - D("called"); + ND("called"); /* Doing stuff that was in userland */ i = fs->sched->link.bandwidth; s = (i <= 0) ? 
0 : @@ -1109,7 +1109,7 @@ config_red(struct dn_fsk *fs) if (dn_cfg.red_max_pkt_size < 1) dn_cfg.red_max_pkt_size = 1500; fs->max_pkt_size = dn_cfg.red_max_pkt_size; - D("exit"); + ND("exit"); return 0; } @@ -2176,7 +2176,7 @@ ip_dn_destroy(int last) DN_BH_WLOCK(); if (last) { - printf("%s removing last instance\n", __FUNCTION__); + ND("removing last instance\n"); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; } @@ -2256,13 +2256,13 @@ unload_dn_sched(struct dn_alg *s) struct dn_alg *tmp, *r; int err = EINVAL; - D("called for %s", s->name); + ND("called for %s", s->name); DN_BH_WLOCK(); SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { if (strcmp(s->name, r->name) != 0) continue; - D("ref_count = %d", r->ref_count); + ND("ref_count = %d", r->ref_count); err = (r->ref_count != 0) ? EBUSY : 0; if (err == 0) SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c index 9a75cf5..9e5c737 100644 --- a/sys/netinet/ipfw/ip_fw2.c +++ b/sys/netinet/ipfw/ip_fw2.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet/icmp6.h> #ifdef INET6 +#include <netinet6/in6_pcb.h> #include <netinet6/scope6_var.h> #include <netinet6/ip6_var.h> #endif @@ -646,21 +647,27 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) * we tried and failed, or any other value if successful. */ static int -check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, - struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, - u_int16_t src_port, int *ugid_lookupp, - struct ucred **uc, struct inpcb *inp) +check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, + struct ucred **uc) { #ifndef __FreeBSD__ + /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ + struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; - int wildcard; - struct inpcb *pcb; + struct ipfw_flow_id *id; + struct inpcb *pcb, *inp; + struct ifnet *oif; + int lookupflags; int match; + id = &args->f_id; + inp = args->inp; + oif = args->oif; + /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking @@ -681,31 +688,53 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, */ if (*ugid_lookupp == -1) return (0); - if (proto == IPPROTO_TCP) { - wildcard = 0; + if (id->proto == IPPROTO_TCP) { + lookupflags = 0; pi = &V_tcbinfo; - } else if (proto == IPPROTO_UDP) { - wildcard = INPLOOKUP_WILDCARD; + } else if (id->proto == IPPROTO_UDP) { + lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; + lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { - INP_INFO_RLOCK(pi); - pcb = (oif) ? 
- in_pcblookup_hash(pi, - dst_ip, htons(dst_port), - src_ip, htons(src_port), - wildcard, oif) : - in_pcblookup_hash(pi, - src_ip, htons(src_port), - dst_ip, htons(dst_port), - wildcard, NULL); + if (id->addr_type == 6) { +#ifdef INET6 + if (oif == NULL) + pcb = in6_pcblookup_mbuf(pi, + &id->src_ip6, htons(id->src_port), + &id->dst_ip6, htons(id->dst_port), + lookupflags, oif, args->m); + else + pcb = in6_pcblookup_mbuf(pi, + &id->dst_ip6, htons(id->dst_port), + &id->src_ip6, htons(id->src_port), + lookupflags, oif, args->m); +#else + *ugid_lookupp = -1; + return (0); +#endif + } else { + src_ip.s_addr = htonl(id->src_ip); + dst_ip.s_addr = htonl(id->dst_ip); + if (oif == NULL) + pcb = in_pcblookup_mbuf(pi, + src_ip, htons(id->src_port), + dst_ip, htons(id->dst_port), + lookupflags, oif, args->m); + else + pcb = in_pcblookup_mbuf(pi, + dst_ip, htons(id->dst_port), + src_ip, htons(id->src_port), + lookupflags, oif, args->m); + } if (pcb != NULL) { + INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; + INP_RUNLOCK(pcb); } - INP_INFO_RUNLOCK(pi); if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 @@ -714,14 +743,14 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, *ugid_lookupp = -1; return (0); } - } + } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); - return match; + return (match); #endif /* __FreeBSD__ */ } @@ -1259,22 +1288,17 @@ do { \ * as this ensures that we have a * packet with the ports info. */ - if (offset!=0) - break; - if (is_ipv6) /* XXX to be fixed later */ + if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, - proto, oif, - dst_ip, dst_port, - src_ip, src_port, &ucred_lookup, + args, &ucred_lookup, #ifdef __FreeBSD__ - &ucred_cache, args->inp); + &ucred_cache); #else - (void *)&ucred_cache, - (struct inpcb *)args->m); + (void *)&ucred_cache); #endif break; @@ -1389,18 +1413,15 @@ do { \ else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, - proto, oif, - dst_ip, dst_port, - src_ip, src_port, &ucred_lookup, + args, &ucred_lookup, #ifdef __FreeBSD__ - &ucred_cache, args->inp); + &ucred_cache); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ - (void *)&ucred_cache, - (struct inpcb *)args->m); + (void *)&ucred_cache); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) @@ -1827,21 +1848,32 @@ do { \ else break; + /* + * XXXRW: so_user_cookie should almost + * certainly be inp_user_cookie? 
+ */ + /* For incomming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { - INP_INFO_RLOCK(pi); - inp = in_pcblookup_hash(pi, + inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), - 0, NULL); - INP_INFO_RUNLOCK(pi); - } - - if (inp && inp->inp_socket) { - tablearg = inp->inp_socket->so_user_cookie; - if (tablearg) - match = 1; + INPLOOKUP_RLOCKPCB, NULL); + if (inp != NULL) { + tablearg = + inp->inp_socket->so_user_cookie; + if (tablearg) + match = 1; + INP_RUNLOCK(inp); + } + } else { + if (inp->inp_socket) { + tablearg = + inp->inp_socket->so_user_cookie; + if (tablearg) + match = 1; + } } break; } @@ -2106,7 +2138,8 @@ do { \ case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; - if (!q || dyn_dir == MATCH_FORWARD) { + if (q == NULL || q->rule != f || + dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { @@ -2137,14 +2170,21 @@ do { \ done = 1; /* exit outer loop */ break; - case O_SETFIB: + case O_SETFIB: { + uint32_t fib; + f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; - M_SETFIB(m, cmd->arg1); - args->f_id.fib = cmd->arg1; + fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg: + cmd->arg1; + if (fib >= rt_numfibs) + fib = 0; + M_SETFIB(m, fib); + args->f_id.fib = fib; l = 0; /* exit inner loop */ break; + } case O_NAT: if (!IPFW_NAT_LOADED) { @@ -2154,6 +2194,13 @@ do { \ int nat_id; set_match(args, f_pos, chain); + /* Check if this is 'global' nat rule */ + if (cmd->arg1 == 0) { + retval = ipfw_nat_ptr(args, NULL, m); + l = 0; + done = 1; + break; + } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c index 7f0feb4..0bc4cc1 100644 --- a/sys/netinet/ipfw/ip_fw_dynamic.c +++ b/sys/netinet/ipfw/ip_fw_dynamic.c @@ -753,11 +753,12 @@ ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ + DEB( if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: entry already present, done\n", __func__); - } + }) IPFW_DYN_UNLOCK(); return (0); } diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c index f8c3e63..1679a97 100644 --- a/sys/netinet/ipfw/ip_fw_nat.c +++ b/sys/netinet/ipfw/ip_fw_nat.c @@ -207,7 +207,8 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ - int ldt, retval; + int ldt, retval, found; + struct ip_fw_chain *chain; char *c; ldt = 0; @@ -256,23 +257,65 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) ldt = 1; c = mtod(mcl, char *); - if (args->oif == NULL) - retval = LibAliasIn(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl)); - else - retval = LibAliasOut(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl)); - if (retval == PKT_ALIAS_RESPOND) { - m->m_flags |= M_SKIP_FIREWALL; - retval = PKT_ALIAS_OK; + + /* Check if this is 'global' instance */ + if (t == NULL) { + if (args->oif == NULL) { + /* Wrong direction, skip processing */ + args->m = mcl; + return (IP_FW_NAT); + } + + found = 0; + chain = &V_layer3_chain; + IPFW_RLOCK(chain); + /* Check every nat entry... 
*/ + LIST_FOREACH(t, &chain->nat, _next) { + if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) + continue; + retval = LibAliasOutTry(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl), 0); + if (retval == PKT_ALIAS_OK) { + /* Nat instance recognises state */ + found = 1; + break; + } + } + IPFW_RUNLOCK(chain); + if (found != 1) { + /* No instance found, return ignore */ + args->m = mcl; + return (IP_FW_NAT); + } + } else { + if (args->oif == NULL) + retval = LibAliasIn(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + else + retval = LibAliasOut(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); } - if (retval != PKT_ALIAS_OK && - retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) { + + /* + * We drop packet when: + * 1. libalias returns PKT_ALIAS_ERROR; + * 2. For incoming packets: + * a) for unresolved fragments; + * b) libalias returns PKT_ALIAS_IGNORED and + * PKT_ALIAS_DENY_INCOMING flag is set. + */ + if (retval == PKT_ALIAS_ERROR || + (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || + (retval == PKT_ALIAS_IGNORED && + (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } + + if (retval == PKT_ALIAS_RESPOND) + m->m_flags |= M_SKIP_FIREWALL; mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c index 0c903ee..2347456 100644 --- a/sys/netinet/ipfw/ip_fw_sockopt.c +++ b/sys/netinet/ipfw/ip_fw_sockopt.c @@ -349,12 +349,13 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg) } if (n == 0) { - /* A flush request (arg == 0) on empty ruleset - * returns with no error. On the contrary, + /* A flush request (arg == 0 or cmd == 1) on empty + * ruleset returns with no error. On the contrary, * if there is no match on a specific request, * we return EINVAL. */ - error = (arg == 0) ? 0 : EINVAL; + if (arg != 0 && cmd != 1) + error = EINVAL; break; } @@ -606,7 +607,8 @@ check_ipfw_struct(struct ip_fw *rule, int size) case O_SETFIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; - if (cmd->arg1 >= rt_numfibs) { + if ((cmd->arg1 != IP_FW_TABLEARG) && + (cmd->arg1 >= rt_numfibs)) { printf("ipfw: invalid fib number %d\n", cmd->arg1); return EINVAL; diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h index 2aed829..b12b353 100644 --- a/sys/netinet/libalias/alias.h +++ b/sys/netinet/libalias/alias.h @@ -197,6 +197,18 @@ struct mbuf *m_megapullup(struct mbuf *, int); */ #define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + #ifndef NO_FW_PUNCH /* * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will @@ -209,16 +221,10 @@ struct mbuf *m_megapullup(struct mbuf *, int); #endif /* - * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only - * transparent proxying is performed. - */ -#define PKT_ALIAS_PROXY_ONLY 0x40 - -/* - * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and - * PacketAliasOut() are reversed. + * If PKT_ALIAS_SKIP_GLOBAL is set, nat instance is not checked for matching + * states in 'ipfw nat global' rule. */ -#define PKT_ALIAS_REVERSE 0x80 +#define PKT_ALIAS_SKIP_GLOBAL 0x200 /* Function return codes. 
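For context, a libalias consumer can keep one of its NAT instances out of the global-state search by setting PKT_ALIAS_SKIP_GLOBAL in the instance's mode word. A minimal sketch using the existing userland LibAliasInit()/LibAliasSetMode() API (not part of this commit; how ipfw(8) itself exposes the flag is not shown here):

    #include <sys/types.h>
    #include <netinet/in.h>
    #include <alias.h>

    /* Create an instance that 'ipfw nat global' style lookups will skip. */
    static struct libalias *
    make_private_instance(void)
    {
            struct libalias *la;

            la = LibAliasInit(NULL);
            LibAliasSetMode(la, PKT_ALIAS_SKIP_GLOBAL, PKT_ALIAS_SKIP_GLOBAL);
            return (la);
    }
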
*/ #define PKT_ALIAS_ERROR -1 diff --git a/sys/netinet/libalias/alias_sctp.h b/sys/netinet/libalias/alias_sctp.h index 80ed965..99d54ce 100644 --- a/sys/netinet/libalias/alias_sctp.h +++ b/sys/netinet/libalias/alias_sctp.h @@ -135,13 +135,13 @@ struct sctp_nat_assoc { struct in_addr a_addr; /**< alias ip address */ int state; /**< current state of NAT association */ int TableRegister; /**< stores which look up tables association is registered in */ - int exp; /**< timer expiration in seconds from uptime */ + int exp; /**< timer expiration in seconds from uptime */ int exp_loc; /**< current location in timer_Q */ int num_Gaddr; /**< number of global IP addresses in the list */ LIST_HEAD(sctpGlobalAddresshead,sctp_GlobalAddress) Gaddr; /**< List of global addresses */ - LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/ - LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */ - LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */ + LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/ + LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */ + LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */ //Using libalias locking }; diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index be099a8..e754b88 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -205,7 +205,8 @@ rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE); + 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -226,7 +227,7 @@ rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, { int policyfail = 0; - INP_RLOCK_ASSERT(last); + INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. 
*/ @@ -834,16 +835,19 @@ rip_detach(struct socket *so) static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { + struct inpcbinfo *pcbinfo; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_WLOCK_ASSERT(inp); - + pcbinfo = inp->inp_pcbinfo; + INP_INFO_WLOCK(pcbinfo); + INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(pcbinfo); } static void @@ -854,11 +858,7 @@ rip_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static void @@ -869,11 +869,7 @@ rip_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static int @@ -887,11 +883,7 @@ rip_disconnect(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1077,9 +1069,9 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); diff --git a/sys/netinet/sctp.h b/sys/netinet/sctp.h index fa29a75..3c8cf36 100644 --- a/sys/netinet/sctp.h +++ b/sys/netinet/sctp.h @@ -91,7 +91,7 @@ struct sctp_paramhdr { #define SCTP_PEER_ADDR_PARAMS 0x0000000a #define SCTP_DEFAULT_SEND_PARAM 0x0000000b /* ancillary data/notification interest options */ -#define SCTP_EVENTS 0x0000000c +#define SCTP_EVENTS 0x0000000c /* deprecated */ /* Without this applied we will give V4 and V6 addresses on a V6 socket */ #define SCTP_I_WANT_MAPPED_V4_ADDR 0x0000000d #define SCTP_MAXSEG 0x0000000e @@ -114,6 +114,11 @@ struct sctp_paramhdr { #define SCTP_EXPLICIT_EOR 0x0000001b #define SCTP_REUSE_PORT 0x0000001c /* rw */ #define SCTP_AUTH_DEACTIVATE_KEY 0x0000001d +#define SCTP_EVENT 0x0000001e +#define SCTP_RECVRCVINFO 0x0000001f +#define SCTP_RECVNXTINFO 0x00000020 +#define SCTP_DEFAULT_SNDINFO 0x00000021 +#define SCTP_DEFAULT_PRINFO 0x00000022 /* * read-only options @@ -490,7 +495,7 @@ struct sctp_error_unrecognized_chunk { /* * PCB Features (in sctp_features bitmask) */ -#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x00000002 +#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x00000002 /* deprecated */ #define SCTP_PCB_FLAGS_DONOT_HEARTBEAT 0x00000004 #define SCTP_PCB_FLAGS_FRAG_INTERLEAVE 0x00000008 #define SCTP_PCB_FLAGS_INTERLEAVE_STRMS 0x00000010 @@ -500,7 +505,7 @@ struct sctp_error_unrecognized_chunk { /* socket options */ #define SCTP_PCB_FLAGS_NODELAY 0x00000100 #define SCTP_PCB_FLAGS_AUTOCLOSE 0x00000200 -#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x00000400 +#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x00000400 /* deprecated */ #define SCTP_PCB_FLAGS_RECVASSOCEVNT 0x00000800 #define SCTP_PCB_FLAGS_RECVPADDREVNT 0x00001000 #define SCTP_PCB_FLAGS_RECVPEERERR 0x00002000 @@ -516,6 +521,9 @@ struct sctp_error_unrecognized_chunk { #define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS 0x01000000 #define SCTP_PCB_FLAGS_PORTREUSE 0x02000000 #define SCTP_PCB_FLAGS_DRYEVNT 0x04000000 +#define SCTP_PCB_FLAGS_RECVRCVINFO 
0x08000000 +#define SCTP_PCB_FLAGS_RECVNXTINFO 0x10000000 + /*- * mobility_features parameters (by micchie).Note * these features are applied against the diff --git a/sys/netinet/sctp_auth.c b/sys/netinet/sctp_auth.c index 91e3f78..b68c840 100644 --- a/sys/netinet/sctp_auth.c +++ b/sys/netinet/sctp_auth.c @@ -1866,7 +1866,7 @@ sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, /* If the socket is gone we are out of here */ return; } - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTHEVNT)) + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_AUTHEVNT)) /* event not enabled */ return; diff --git a/sys/netinet/sctp_indata.c b/sys/netinet/sctp_indata.c index 9734aea..e142a3e 100644 --- a/sys/netinet/sctp_indata.c +++ b/sys/netinet/sctp_indata.c @@ -201,95 +201,114 @@ failed_build: struct mbuf * -sctp_build_ctl_nchunk(struct sctp_inpcb *inp, - struct sctp_sndrcvinfo *sinfo) +sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) { + struct sctp_extrcvinfo *seinfo; struct sctp_sndrcvinfo *outinfo; + struct sctp_rcvinfo *rcvinfo; + struct sctp_nxtinfo *nxtinfo; struct cmsghdr *cmh; struct mbuf *ret; int len; - int use_extended = 0; + int use_extended; + int provide_nxt; - if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { - /* user does not want the sndrcv ctl */ + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { + /* user does not want any ancillary data */ return (NULL); } - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { - use_extended = 1; - len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); + len = 0; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { + len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); + } + seinfo = (struct sctp_extrcvinfo *)sinfo; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO) && + (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) { + provide_nxt = 1; + len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); } else { - len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + provide_nxt = 0; + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + use_extended = 1; + len += CMSG_SPACE(sizeof(struct sctp_extrcvinfo)); + } else { + use_extended = 0; + len += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); + } + } else { + use_extended = 0; } - - ret = sctp_get_mbuf_for_msg(len, - 0, M_DONTWAIT, 1, MT_DATA); - + ret = sctp_get_mbuf_for_msg(len, 0, M_DONTWAIT, 1, MT_DATA); if (ret == NULL) { /* No space */ return (ret); } - /* We need a CMSG header followed by the struct */ + SCTP_BUF_LEN(ret) = 0; + + /* We need a CMSG header followed by the struct */ cmh = mtod(ret, struct cmsghdr *); - outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); - cmh->cmsg_level = IPPROTO_SCTP; - if (use_extended) { - cmh->cmsg_type = SCTP_EXTRCV; - cmh->cmsg_len = len; - memcpy(outinfo, sinfo, len); - } else { - cmh->cmsg_type = SCTP_SNDRCV; - cmh->cmsg_len = len; - *outinfo = *sinfo; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { + cmh->cmsg_level = IPPROTO_SCTP; + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_rcvinfo)); + cmh->cmsg_type = SCTP_RCVINFO; + rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmh); + rcvinfo->rcv_sid = sinfo->sinfo_stream; + rcvinfo->rcv_ssn = sinfo->sinfo_ssn; + rcvinfo->rcv_flags = sinfo->sinfo_flags; + rcvinfo->rcv_ppid = sinfo->sinfo_ppid; + rcvinfo->rcv_tsn = 
sinfo->sinfo_tsn; + rcvinfo->rcv_cumtsn = sinfo->sinfo_cumtsn; + rcvinfo->rcv_context = sinfo->sinfo_context; + rcvinfo->rcv_assoc_id = sinfo->sinfo_assoc_id; + cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_rcvinfo))); + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); + } + if (provide_nxt) { + cmh->cmsg_level = IPPROTO_SCTP; + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_nxtinfo)); + cmh->cmsg_type = SCTP_NXTINFO; + nxtinfo = (struct sctp_nxtinfo *)CMSG_DATA(cmh); + nxtinfo->nxt_sid = seinfo->sreinfo_next_stream; + nxtinfo->nxt_flags = 0; + if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) { + nxtinfo->nxt_flags |= SCTP_UNORDERED; + } + if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) { + nxtinfo->nxt_flags |= SCTP_NOTIFICATION; + } + if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) { + nxtinfo->nxt_flags |= SCTP_COMPLETE; + } + nxtinfo->nxt_ppid = seinfo->sreinfo_next_ppid; + nxtinfo->nxt_length = seinfo->sreinfo_next_length; + nxtinfo->nxt_assoc_id = seinfo->sreinfo_next_aid; + cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_nxtinfo))); + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_nxtinfo)); + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { + cmh->cmsg_level = IPPROTO_SCTP; + outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); + if (use_extended) { + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); + cmh->cmsg_type = SCTP_EXTRCV; + memcpy(outinfo, sinfo, sizeof(struct sctp_extrcvinfo)); + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_extrcvinfo)); + } else { + cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + cmh->cmsg_type = SCTP_SNDRCV; + *outinfo = *sinfo; + SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); + } } - SCTP_BUF_LEN(ret) = cmh->cmsg_len; return (ret); } -char * -sctp_build_ctl_cchunk(struct sctp_inpcb *inp, - int *control_len, - struct sctp_sndrcvinfo *sinfo) -{ - struct sctp_sndrcvinfo *outinfo; - struct cmsghdr *cmh; - char *buf; - int len; - int use_extended = 0; - - if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { - /* user does not want the sndrcv ctl */ - return (NULL); - } - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { - use_extended = 1; - len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); - } else { - len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); - } - SCTP_MALLOC(buf, char *, len, SCTP_M_CMSG); - if (buf == NULL) { - /* No space */ - return (buf); - } - /* We need a CMSG header followed by the struct */ - cmh = (struct cmsghdr *)buf; - outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); - cmh->cmsg_level = IPPROTO_SCTP; - if (use_extended) { - cmh->cmsg_type = SCTP_EXTRCV; - cmh->cmsg_len = len; - memcpy(outinfo, sinfo, len); - } else { - cmh->cmsg_type = SCTP_SNDRCV; - cmh->cmsg_len = len; - *outinfo = *sinfo; - } - *control_len = len; - return (buf); -} - static void sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn) { diff --git a/sys/netinet/sctp_indata.h b/sys/netinet/sctp_indata.h index 34090df..1dbd364 100644 --- a/sys/netinet/sctp_indata.h +++ b/sys/netinet/sctp_indata.h @@ -83,11 +83,6 @@ struct mbuf * sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo); -char * -sctp_build_ctl_cchunk(struct sctp_inpcb *inp, - int *control_len, - struct sctp_sndrcvinfo *sinfo); - void sctp_set_rwnd(struct sctp_tcb *, struct sctp_association *); uint32_t diff --git a/sys/netinet/sctp_output.c b/sys/netinet/sctp_output.c index a7d22bd..043b3b2 
100644 --- a/sys/netinet/sctp_output.c +++ b/sys/netinet/sctp_output.c @@ -3355,54 +3355,338 @@ sctp_source_address_selection(struct sctp_inpcb *inp, } static int -sctp_find_cmsg(int c_type, void *data, struct mbuf *control, int cpsize) +sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize) { struct cmsghdr cmh; - int tlen, at; + int tlen, at, found; + struct sctp_sndinfo sndinfo; + struct sctp_prinfo prinfo; + struct sctp_authinfo authinfo; tlen = SCTP_BUF_LEN(control); at = 0; + found = 0; /* * Independent of how many mbufs, find the c_type inside the control * structure and copy out the data. */ while (at < tlen) { if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { - /* not enough room for one more we are done. */ - return (0); + /* There is not enough room for one more. */ + return (found); } m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (cmh.cmsg_len < CMSG_ALIGN(sizeof(struct cmsghdr))) { + /* We dont't have a complete CMSG header. */ + return (found); + } if (((int)cmh.cmsg_len + at) > tlen) { - /* - * this is real messed up since there is not enough - * data here to cover the cmsg header. We are done. - */ - return (0); + /* We don't have the complete CMSG. */ + return (found); } if ((cmh.cmsg_level == IPPROTO_SCTP) && - (c_type == cmh.cmsg_type)) { - /* found the one we want, copy it out */ - at += CMSG_ALIGN(sizeof(struct cmsghdr)); - if ((int)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < cpsize) { - /* - * space of cmsg_len after header not big - * enough - */ - return (0); + ((c_type == cmh.cmsg_type) || + ((c_type == SCTP_SNDRCV) && + ((cmh.cmsg_type == SCTP_SNDINFO) || + (cmh.cmsg_type == SCTP_PRINFO) || + (cmh.cmsg_type == SCTP_AUTHINFO))))) { + if (c_type == cmh.cmsg_type) { + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < cpsize) { + return (found); + } + /* It is exactly what we want. Copy it out. 
*/ + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), cpsize, (caddr_t)data); + return (1); + } else { + struct sctp_sndrcvinfo *sndrcvinfo; + + sndrcvinfo = (struct sctp_sndrcvinfo *)data; + if (found == 0) { + if (cpsize < sizeof(struct sctp_sndrcvinfo)) { + return (found); + } + memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo)); + } + switch (cmh.cmsg_type) { + case SCTP_SNDINFO: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_sndinfo)) { + return (found); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_sndinfo), (caddr_t)&sndinfo); + sndrcvinfo->sinfo_stream = sndinfo.snd_sid; + sndrcvinfo->sinfo_flags = sndinfo.snd_flags; + sndrcvinfo->sinfo_ppid = sndinfo.snd_ppid; + sndrcvinfo->sinfo_context = sndinfo.snd_context; + sndrcvinfo->sinfo_assoc_id = sndinfo.snd_assoc_id; + break; + case SCTP_PRINFO: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_prinfo)) { + return (found); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_prinfo), (caddr_t)&prinfo); + sndrcvinfo->sinfo_timetolive = prinfo.pr_value; + sndrcvinfo->sinfo_flags |= prinfo.pr_policy; + break; + case SCTP_AUTHINFO: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_authinfo)) { + return (found); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_authinfo), (caddr_t)&authinfo); + sndrcvinfo->sinfo_keynumber_valid = 1; + sndrcvinfo->sinfo_keynumber = authinfo.auth_keyid; + break; + default: + return (found); + } + found = 1; } - m_copydata(control, at, cpsize, data); + } + at += CMSG_ALIGN(cmh.cmsg_len); + } + return (found); +} + +static int +sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error) +{ + struct cmsghdr cmh; + int tlen, at; + struct sctp_initmsg initmsg; + +#ifdef INET + struct sockaddr_in sin; + +#endif +#ifdef INET6 + struct sockaddr_in6 sin6; + +#endif + + tlen = SCTP_BUF_LEN(control); + at = 0; + while (at < tlen) { + if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + /* There is not enough room for one more. */ + *error = EINVAL; return (1); - } else { - at += CMSG_ALIGN(cmh.cmsg_len); - if (cmh.cmsg_len == 0) { + } + m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (cmh.cmsg_len < CMSG_ALIGN(sizeof(struct cmsghdr))) { + /* We dont't have a complete CMSG header. */ + *error = EINVAL; + return (1); + } + if (((int)cmh.cmsg_len + at) > tlen) { + /* We don't have the complete CMSG. 
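The extended sctp_find_cmsg() above lets a sender pass the newer RFC 6458 style ancillary data instead of a full struct sctp_sndrcvinfo. A hedged userland sketch of attaching an SCTP_SNDINFO cmsg to sendmsg(2), built only from the structure and constants added in this diff (stream number and ppid are placeholder values):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/sctp.h>
    #include <netinet/sctp_uio.h>
    #include <string.h>

    /* Send 'data' on stream 1, unordered, carrying an SCTP_SNDINFO cmsg. */
    static ssize_t
    send_with_sndinfo(int fd, const void *data, size_t len)
    {
            struct sctp_sndinfo snd;
            struct msghdr msg;
            struct iovec iov;
            struct cmsghdr *cmsg;
            char cbuf[CMSG_SPACE(sizeof(snd))];

            memset(&snd, 0, sizeof(snd));
            snd.snd_sid = 1;                /* stream id */
            snd.snd_flags = SCTP_UNORDERED; /* folded into sinfo_flags by the kernel */
            snd.snd_ppid = htonl(42);       /* ppid is passed through opaquely */

            iov.iov_base = __DECONST(void *, data);
            iov.iov_len = len;
            memset(&msg, 0, sizeof(msg));
            msg.msg_iov = &iov;
            msg.msg_iovlen = 1;
            msg.msg_control = cbuf;
            msg.msg_controllen = sizeof(cbuf);
            cmsg = CMSG_FIRSTHDR(&msg);
            cmsg->cmsg_level = IPPROTO_SCTP;
            cmsg->cmsg_type = SCTP_SNDINFO;
            cmsg->cmsg_len = CMSG_LEN(sizeof(snd));
            memcpy(CMSG_DATA(cmsg), &snd, sizeof(snd));
            return (sendmsg(fd, &msg, 0));
    }

The kernel-side parser shown above copies snd_sid, snd_flags, snd_ppid, snd_context and snd_assoc_id into the internal sctp_sndrcvinfo, so the older SCTP_SNDRCV cmsg is no longer required for this path.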
*/ + *error = EINVAL; + return (1); + } + if (cmh.cmsg_level == IPPROTO_SCTP) { + switch (cmh.cmsg_type) { + case SCTP_INIT: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct sctp_initmsg)) { + *error = EINVAL; + return (1); + } + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct sctp_initmsg), (caddr_t)&initmsg); + if (initmsg.sinit_max_attempts) + stcb->asoc.max_init_times = initmsg.sinit_max_attempts; + if (initmsg.sinit_num_ostreams) + stcb->asoc.pre_open_streams = initmsg.sinit_num_ostreams; + if (initmsg.sinit_max_instreams) + stcb->asoc.max_inbound_streams = initmsg.sinit_max_instreams; + if (initmsg.sinit_max_init_timeo) + stcb->asoc.initial_init_rto_max = initmsg.sinit_max_init_timeo; + if (stcb->asoc.streamoutcnt < stcb->asoc.pre_open_streams) { + struct sctp_stream_out *tmp_str; + unsigned int i; + + /* Default is NOT correct */ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n", + stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams); + SCTP_TCB_UNLOCK(stcb); + SCTP_MALLOC(tmp_str, + struct sctp_stream_out *, + (stcb->asoc.pre_open_streams * sizeof(struct sctp_stream_out)), + SCTP_M_STRMO); + SCTP_TCB_LOCK(stcb); + if (tmp_str != NULL) { + SCTP_FREE(stcb->asoc.strmout, SCTP_M_STRMO); + stcb->asoc.strmout = tmp_str; + stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt = stcb->asoc.pre_open_streams; + } else { + stcb->asoc.pre_open_streams = stcb->asoc.streamoutcnt; + } + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].next_sequence_sent = 0; + TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); + stcb->asoc.strmout[i].stream_no = i; + stcb->asoc.strmout[i].last_msg_incomplete = 0; + stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], NULL); + } + } + break; +#ifdef INET + case SCTP_DSTADDRV4: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in_addr)) { + *error = EINVAL; + return (1); + } + memset(&sin, 0, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_port = stcb->rport; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + if ((sin.sin_addr.s_addr == INADDR_ANY) || + (sin.sin_addr.s_addr == INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + *error = EINVAL; + return (-1); + } + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + *error = ENOBUFS; + return (1); + } + break; +#endif +#ifdef INET6 + case SCTP_DSTADDRV6: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in6_addr)) { + *error = EINVAL; + return (1); + } + memset(&sin6, 0, sizeof(struct sockaddr_in6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_port = stcb->rport; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { + *error = EINVAL; + return (-1); + } +#ifdef INET + if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { + in6_sin6_2_sin(&sin, &sin6); + if ((sin.sin_addr.s_addr == INADDR_ANY) || + (sin.sin_addr.s_addr == INADDR_BROADCAST) || + IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + *error = EINVAL; + return (-1); + } + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + *error = ENOBUFS; + return (1); + } + } 
else +#endif + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + *error = ENOBUFS; + return (1); + } + break; +#endif + default: break; } } + at += CMSG_ALIGN(cmh.cmsg_len); } - /* not found */ return (0); } +static struct sctp_tcb * +sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, + in_port_t port, + struct mbuf *control, + struct sctp_nets **net_p, + int *error) +{ + struct cmsghdr cmh; + int tlen, at; + struct sctp_tcb *stcb; + struct sockaddr *addr; + +#ifdef INET + struct sockaddr_in sin; + +#endif +#ifdef INET6 + struct sockaddr_in6 sin6; + +#endif + + tlen = SCTP_BUF_LEN(control); + at = 0; + while (at < tlen) { + if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + /* There is not enough room for one more. */ + *error = EINVAL; + return (NULL); + } + m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (cmh.cmsg_len < CMSG_ALIGN(sizeof(struct cmsghdr))) { + /* We dont't have a complete CMSG header. */ + *error = EINVAL; + return (NULL); + } + if (((int)cmh.cmsg_len + at) > tlen) { + /* We don't have the complete CMSG. */ + *error = EINVAL; + return (NULL); + } + if (cmh.cmsg_level == IPPROTO_SCTP) { + switch (cmh.cmsg_type) { +#ifdef INET + case SCTP_DSTADDRV4: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in_addr)) { + *error = EINVAL; + return (NULL); + } + memset(&sin, 0, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_port = port; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in_addr), (caddr_t)&sin.sin_addr); + addr = (struct sockaddr *)&sin; + break; +#endif +#ifdef INET6 + case SCTP_DSTADDRV6: + if ((size_t)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < sizeof(struct in6_addr)) { + *error = EINVAL; + return (NULL); + } + memset(&sin6, 0, sizeof(struct sockaddr_in6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_port = port; + m_copydata(control, at + CMSG_ALIGN(sizeof(struct cmsghdr)), sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); +#ifdef INET + if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { + in6_sin6_2_sin(&sin, &sin6); + addr = (struct sockaddr *)&sin; + } else +#endif + addr = (struct sockaddr *)&sin6; + break; +#endif + default: + addr = NULL; + break; + } + if (addr) { + stcb = sctp_findassociation_ep_addr(inp_p, addr, net_p, NULL, NULL); + if (stcb != NULL) { + return (stcb); + } + } + } + at += CMSG_ALIGN(cmh.cmsg_len); + } + return (NULL); +} + static struct mbuf * sctp_add_cookie(struct sctp_inpcb *inp, struct mbuf *init, int init_offset, struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t ** signature) @@ -5989,19 +6273,26 @@ sctp_msg_append(struct sctp_tcb *stcb, sp->some_taken = 0; sp->data = m; sp->tail_mbuf = NULL; - sp->length = 0; - at = m; sctp_set_prsctp_policy(sp); /* * We could in theory (for sendall) sifa the length in, but we would * still have to hunt through the chain since we need to setup the * tail_mbuf */ - while (at) { + sp->length = 0; + for (at = m; at; at = SCTP_BUF_NEXT(at)) { if (SCTP_BUF_NEXT(at) == NULL) sp->tail_mbuf = at; sp->length += SCTP_BUF_LEN(at); - at = SCTP_BUF_NEXT(at); + } + if (srcv->sinfo_keynumber_valid) { + sp->auth_keyid = srcv->sinfo_keynumber; + } else { + sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + } + if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { + sctp_auth_key_acquire(stcb, sp->auth_keyid); + 
sp->holds_key_ref = 1; } SCTP_TCB_SEND_LOCK(stcb); sctp_snd_sb_alloc(stcb, sp->length); @@ -6478,7 +6769,9 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, memset(ca, 0, sizeof(struct sctp_copy_all)); ca->inp = inp; - memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); + if (srcv) { + memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); + } /* * take off the sendall flag, it would be bad if we failed to do * this :-0 @@ -12229,9 +12522,13 @@ sctp_copy_it_in(struct sctp_tcb *stcb, *error = 0; goto skip_copy; } - sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + if (srcv->sinfo_keynumber_valid) { + sp->auth_keyid = srcv->sinfo_keynumber; + } else { + sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + } if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { - sctp_auth_key_acquire(stcb, stcb->asoc.authinfo.active_keyid); + sctp_auth_key_acquire(stcb, sp->auth_keyid); sp->holds_key_ref = 1; } *error = sctp_copy_one(sp, uio, resv_in_first); @@ -12263,8 +12560,8 @@ sctp_sosend(struct socket *so, struct thread *p ) { - int error, use_rcvinfo = 0; - struct sctp_sndrcvinfo srcv; + int error, use_sndinfo = 0; + struct sctp_sndrcvinfo sndrcvninfo; struct sockaddr *addr_to_use; #if defined(INET) && defined(INET6) @@ -12274,10 +12571,10 @@ sctp_sosend(struct socket *so, if (control) { /* process cmsg snd/rcv info (maybe a assoc-id) */ - if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&srcv, control, - sizeof(srcv))) { + if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&sndrcvninfo, control, + sizeof(sndrcvninfo))) { /* got one */ - use_rcvinfo = 1; + use_sndinfo = 1; } } addr_to_use = addr; @@ -12295,7 +12592,7 @@ sctp_sosend(struct socket *so, error = sctp_lower_sosend(so, addr_to_use, uio, top, control, flags, - use_rcvinfo ? &srcv : NULL + use_sndinfo ? 
&sndrcvninfo : NULL ,p ); return (error); @@ -12500,6 +12797,9 @@ sctp_lower_sosend(struct socket *so, SCTP_INP_WUNLOCK(inp); /* With the lock applied look again */ stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); + if ((stcb == NULL) && (control != NULL) && (port > 0)) { + stcb = sctp_findassociation_cmsgs(&t_inp, port, control, &net, &error); + } if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); @@ -12507,6 +12807,9 @@ sctp_lower_sosend(struct socket *so, } else { hold_tcblock = 1; } + if (error) { + goto out_unlocked; + } if (t_inp != inp) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; @@ -12555,6 +12858,7 @@ sctp_lower_sosend(struct socket *so, /* Error is setup for us in the call */ goto out_unlocked; } + hold_tcblock = 1; if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; @@ -12574,84 +12878,13 @@ sctp_lower_sosend(struct socket *so, sctp_initialize_auth_params(inp, stcb); if (control) { - /* - * see if a init structure exists in cmsg - * headers - */ - struct sctp_initmsg initm; - int i; - - if (sctp_find_cmsg(SCTP_INIT, (void *)&initm, control, - sizeof(initm))) { - /* - * we have an INIT override of the - * default - */ - if (initm.sinit_max_attempts) - asoc->max_init_times = initm.sinit_max_attempts; - if (initm.sinit_num_ostreams) - asoc->pre_open_streams = initm.sinit_num_ostreams; - if (initm.sinit_max_instreams) - asoc->max_inbound_streams = initm.sinit_max_instreams; - if (initm.sinit_max_init_timeo) - asoc->initial_init_rto_max = initm.sinit_max_init_timeo; - if (asoc->streamoutcnt < asoc->pre_open_streams) { - struct sctp_stream_out *tmp_str; - int had_lock = 0; - - /* Default is NOT correct */ - SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, defout:%d pre_open:%d\n", - asoc->streamoutcnt, asoc->pre_open_streams); - /* - * What happens if this - * fails? we panic ... - */ - - if (hold_tcblock) { - had_lock = 1; - SCTP_TCB_UNLOCK(stcb); - } - SCTP_MALLOC(tmp_str, - struct sctp_stream_out *, - (asoc->pre_open_streams * - sizeof(struct sctp_stream_out)), - SCTP_M_STRMO); - if (had_lock) { - SCTP_TCB_LOCK(stcb); - } - if (tmp_str != NULL) { - SCTP_FREE(asoc->strmout, SCTP_M_STRMO); - asoc->strmout = tmp_str; - asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams; - } else { - asoc->pre_open_streams = asoc->streamoutcnt; - } - for (i = 0; i < asoc->streamoutcnt; i++) { - /*- - * inbound side must be set - * to 0xffff, also NOTE when - * we get the INIT-ACK back - * (for INIT sender) we MUST - * reduce the count - * (streamoutcnt) but first - * check if we sent to any - * of the upper streams that - * were dropped (if some - * were). Those that were - * dropped must be notified - * to the upper layer as - * failed to send. 
- */ - asoc->strmout[i].next_sequence_sent = 0x0; - TAILQ_INIT(&asoc->strmout[i].outqueue); - asoc->strmout[i].stream_no = i; - asoc->strmout[i].last_msg_incomplete = 0; - asoc->ss_functions.sctp_ss_init_stream(&asoc->strmout[i], NULL); - } - } + if (sctp_process_cmsgs_for_init(stcb, control, &error)) { + sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_7); + hold_tcblock = 0; + stcb = NULL; + goto out_unlocked; } } - hold_tcblock = 1; /* out with the INIT */ queue_only_for_init = 1; /*- diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index e53e28a..8dc01cd 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -4196,11 +4196,11 @@ try_again: return (0); } /* - * We don't allow assoc id to be 0, this is needed otherwise if the - * id were to wrap we would have issues with some socket options. + * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC, + * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC. */ - if (inp->sctp_associd_counter == 0) { - inp->sctp_associd_counter++; + if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) { + inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1; } id = inp->sctp_associd_counter; inp->sctp_associd_counter++; @@ -4793,7 +4793,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre /* Held for PD-API clear that. */ sq->pdapi_aborted = 1; sq->held_length = 0; - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { + if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { /* * Need to add a PD-API * aborted indication. diff --git a/sys/netinet/sctp_structs.h b/sys/netinet/sctp_structs.h index 250b312..0f9bcaf 100644 --- a/sys/netinet/sctp_structs.h +++ b/sys/netinet/sctp_structs.h @@ -647,6 +647,8 @@ struct sctp_nonpad_sndrcvinfo { uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; + uint16_t sinfo_keynumber; + uint16_t sinfo_keynumber_valid; }; /* @@ -1201,6 +1203,7 @@ struct sctp_association { /* JRS 5/21/07 - CMT PF variable */ uint8_t sctp_cmt_pf; uint8_t use_precise_time; + uint32_t sctp_features; /* * The mapping array is used to track out of order sequences above * last_acked_seq. 
0 indicates packet missing 1 indicates packet diff --git a/sys/netinet/sctp_uio.h b/sys/netinet/sctp_uio.h index 56aef9d..a798682 100644 --- a/sys/netinet/sctp_uio.h +++ b/sys/netinet/sctp_uio.h @@ -47,6 +47,16 @@ __FBSDID("$FreeBSD$"); typedef uint32_t sctp_assoc_t; +#define SCTP_FUTURE_ASSOC 0 +#define SCTP_CURRENT_ASSOC 1 +#define SCTP_ALL_ASSOC 2 + +struct sctp_event { + sctp_assoc_t se_assoc_id; + uint16_t se_type; + uint8_t se_on; +}; + /* Compatibility to previous define's */ #define sctp_stream_reset_events sctp_stream_reset_event @@ -69,6 +79,14 @@ struct sctp_event_subscribe { #define SCTP_INIT 0x0001 #define SCTP_SNDRCV 0x0002 #define SCTP_EXTRCV 0x0003 +#define SCTP_SNDINFO 0x0004 +#define SCTP_RCVINFO 0x0005 +#define SCTP_NXTINFO 0x0006 +#define SCTP_PRINFO 0x0007 +#define SCTP_AUTHINFO 0x0008 +#define SCTP_DSTADDRV4 0x0009 +#define SCTP_DSTADDRV6 0x000a + /* * ancillary data structures */ @@ -93,8 +111,8 @@ struct sctp_initmsg { */ -#define SCTP_ALIGN_RESV_PAD 96 -#define SCTP_ALIGN_RESV_PAD_SHORT 80 +#define SCTP_ALIGN_RESV_PAD 92 +#define SCTP_ALIGN_RESV_PAD_SHORT 76 struct sctp_sndrcvinfo { uint16_t sinfo_stream; @@ -106,6 +124,8 @@ struct sctp_sndrcvinfo { uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; + uint16_t sinfo_keynumber; + uint16_t sinfo_keynumber_valid; uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD]; }; @@ -113,7 +133,6 @@ struct sctp_extrcvinfo { uint16_t sinfo_stream; uint16_t sinfo_ssn; uint16_t sinfo_flags; - uint16_t sinfo_pr_policy; uint32_t sinfo_ppid; uint32_t sinfo_context; uint32_t sinfo_timetolive; @@ -125,15 +144,86 @@ struct sctp_extrcvinfo { uint32_t sreinfo_next_aid; uint32_t sreinfo_next_length; uint32_t sreinfo_next_ppid; + uint16_t sinfo_keynumber; + uint16_t sinfo_keynumber_valid; uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD_SHORT]; }; +struct sctp_sndinfo { + uint16_t snd_sid; + uint16_t snd_flags; + uint32_t snd_ppid; + uint32_t snd_context; + sctp_assoc_t snd_assoc_id; +}; + +struct sctp_prinfo { + uint16_t pr_policy; + uint32_t pr_value; +}; + +struct sctp_default_prinfo { + uint16_t pr_policy; + uint32_t pr_value; + sctp_assoc_t pr_assoc_id; +}; + +struct sctp_authinfo { + uint16_t auth_keyid; +}; + +struct sctp_rcvinfo { + uint16_t rcv_sid; + uint16_t rcv_ssn; + uint16_t rcv_flags; + uint32_t rcv_ppid; + uint32_t rcv_tsn; + uint32_t rcv_cumtsn; + uint32_t rcv_context; + sctp_assoc_t rcv_assoc_id; +}; + +struct sctp_nxtinfo { + uint16_t nxt_sid; + uint16_t nxt_flags; + uint32_t nxt_ppid; + uint32_t nxt_length; + sctp_assoc_t nxt_assoc_id; +}; + #define SCTP_NO_NEXT_MSG 0x0000 #define SCTP_NEXT_MSG_AVAIL 0x0001 #define SCTP_NEXT_MSG_ISCOMPLETE 0x0002 #define SCTP_NEXT_MSG_IS_UNORDERED 0x0004 #define SCTP_NEXT_MSG_IS_NOTIFICATION 0x0008 +struct sctp_recvv_rn { + struct sctp_rcvinfo recvv_rcvinfo; + struct sctp_nxtinfo recvv_nxtinfo; +}; + +#define SCTP_RECVV_NOINFO 0 +#define SCTP_RECVV_RCVINFO 1 +#define SCTP_RECVV_NXTINFO 2 +#define SCTP_RECVV_RN 3 + +#define SCTP_SENDV_NOINFO 0 +#define SCTP_SENDV_SNDINFO 1 +#define SCTP_SENDV_PRINFO 2 +#define SCTP_SENDV_AUTHINFO 3 +#define SCTP_SENDV_SPA 4 + +struct sctp_sendv_spa { + uint32_t sendv_flags; + struct sctp_sndinfo sendv_sndinfo; + struct sctp_prinfo sendv_prinfo; + struct sctp_authinfo sendv_authinfo; +}; + +#define SCTP_SEND_SNDINFO_VALID 0x00000001 +#define SCTP_SEND_PRINFO_VALID 0x00000002 +#define SCTP_SEND_AUTHINFO_VALID 0x00000004 + struct sctp_snd_all_completes { uint16_t sall_stream; uint16_t sall_flags; @@ -144,6 +234,8 @@ struct sctp_snd_all_completes { }; /* 
Flags that go into the sinfo->sinfo_flags field */ +#define SCTP_NOTIFICATION 0x0010 /* next message is a notification */ +#define SCTP_COMPLETE 0x0020 /* next message is complete */ #define SCTP_EOF 0x0100 /* Start shutdown procedures */ #define SCTP_ABORT 0x0200 /* Send an ABORT to peer */ #define SCTP_UNORDERED 0x0400 /* Message is un-ordered */ @@ -152,7 +244,7 @@ struct sctp_snd_all_completes { #define SCTP_EOR 0x2000 /* end of message signal */ #define SCTP_SACK_IMMEDIATELY 0x4000 /* Set I-Bit */ -#define INVALID_SINFO_FLAG(x) (((x) & 0xffffff00 \ +#define INVALID_SINFO_FLAG(x) (((x) & 0xfffffff0 \ & ~(SCTP_EOF | SCTP_ABORT | SCTP_UNORDERED |\ SCTP_ADDR_OVER | SCTP_SENDALL | SCTP_EOR |\ SCTP_SACK_IMMEDIATELY)) != 0) @@ -163,7 +255,7 @@ struct sctp_snd_all_completes { #define SCTP_PR_SCTP_BUF 0x0002/* Buffer based PR-SCTP */ #define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */ -#define PR_SCTP_POLICY(x) ((x) & 0xff) +#define PR_SCTP_POLICY(x) ((x) & 0x0f) #define PR_SCTP_ENABLED(x) (PR_SCTP_POLICY(x) != 0) #define PR_SCTP_TTL_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_TTL) #define PR_SCTP_BUF_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_BUF) @@ -1132,26 +1224,34 @@ int sctp_getladdrs __P((int, sctp_assoc_t, struct sockaddr **)); void sctp_freeladdrs __P((struct sockaddr *)); int sctp_opt_info __P((int, sctp_assoc_t, int, void *, socklen_t *)); +/* deprecated */ ssize_t sctp_sendmsg -__P((int, const void *, size_t, - const struct sockaddr *, +__P((int, const void *, size_t, const struct sockaddr *, socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); - ssize_t sctp_send __P((int sd, const void *msg, size_t len, - const struct sctp_sndrcvinfo *sinfo, int flags)); +/* deprecated */ + ssize_t sctp_send __P((int, const void *, size_t, + const struct sctp_sndrcvinfo *, int)); + +/* deprecated */ + ssize_t sctp_sendx __P((int, const void *, size_t, struct sockaddr *, + int, struct sctp_sndrcvinfo *, int)); + +/* deprecated */ + ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, struct sockaddr *, + int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); - ssize_t sctp_sendx __P((int sd, const void *msg, size_t len, - struct sockaddr *addrs, int addrcnt, - struct sctp_sndrcvinfo *sinfo, int flags)); + sctp_assoc_t sctp_getassocid __P((int, struct sockaddr *)); - ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, - struct sockaddr *, int, - uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); +/* deprecated */ + ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, socklen_t *, + struct sctp_sndrcvinfo *, int *)); - sctp_assoc_t sctp_getassocid __P((int sd, struct sockaddr *sa)); + ssize_t sctp_sendv __P((int, const struct iovec *, int, struct sockaddr *, + int, void *, socklen_t, unsigned int, int)); - ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, - socklen_t *, struct sctp_sndrcvinfo *, int *)); + ssize_t sctp_recvv __P((int, const struct iovec *, int, struct sockaddr *, + socklen_t *, void *, socklen_t *, unsigned int *, int *)); __END_DECLS diff --git a/sys/netinet/sctp_usrreq.c b/sys/netinet/sctp_usrreq.c index b3eb805..4c1d726 100644 --- a/sys/netinet/sctp_usrreq.c +++ b/sys/netinet/sctp_usrreq.c @@ -713,7 +713,7 @@ sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, control = NULL; } error = EDESTADDRREQ; - return EDESTADDRREQ; + return (error); } #endif /* INET6 */ connected_type: @@ -1448,7 +1448,6 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, 
struct sctp_tcb *stcb = NULL; struct sockaddr *sa; int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; - int added = 0; uint32_t vrf_id; int bad_addresses = 0; sctp_assoc_t *a_id; @@ -1560,7 +1559,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, } error = 0; - added = sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); + sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); /* Fill in the return id */ if (error) { (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6); @@ -1603,7 +1602,7 @@ out_now: SCTP_TCB_LOCK(stcb); \ } \ SCTP_INP_RUNLOCK(inp); \ - } else if (assoc_id != 0) { \ + } else if (assoc_id > SCTP_ALL_ASSOC) { \ stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \ if (stcb == NULL) { \ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \ @@ -1691,10 +1690,6 @@ sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } /* end switch (sopt->sopt_name) */ - if (optname != SCTP_AUTOCLOSE) { - /* make it an "on/off" value */ - val = (val != 0); - } if (*optsize < sizeof(val)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; @@ -1734,8 +1729,8 @@ flags_out: SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; @@ -1743,8 +1738,8 @@ flags_out: SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); *value = inp->partial_delivery_point; *optsize = sizeof(uint32_t); + break; } - break; case SCTP_FRAGMENT_INTERLEAVE: { uint32_t *value; @@ -1760,8 +1755,8 @@ flags_out: *value = SCTP_FRAG_LEVEL_0; } *optsize = sizeof(uint32_t); + break; } - break; case SCTP_CMT_ON_OFF: { struct sctp_assoc_value *av; @@ -1772,14 +1767,20 @@ flags_out: av->assoc_value = stcb->asoc.sctp_cmt_on_off; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - av->assoc_value = inp->sctp_cmt_on_off; - SCTP_INP_RUNLOCK(inp); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_cmt_on_off; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(*av); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; - /* JRS - Get socket option for pluggable congestion control */ case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; @@ -1790,11 +1791,20 @@ flags_out: av->assoc_value = stcb->asoc.congestion_control_module; SCTP_TCB_UNLOCK(stcb); } else { - av->assoc_value = inp->sctp_ep.sctp_default_cc_module; + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.sctp_default_cc_module; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(*av); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; @@ -1807,15 +1817,13 @@ flags_out: if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; } else { - error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, - cc_opt); - *optsize = sizeof(*cc_opt); + error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, cc_opt); + *optsize = 
sizeof(struct sctp_cc_option); } SCTP_TCB_UNLOCK(stcb); } + break; } - break; - /* RS - Get socket option for pluggable stream scheduling */ case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; @@ -1826,11 +1834,20 @@ flags_out: av->assoc_value = stcb->asoc.stream_scheduling_module; SCTP_TCB_UNLOCK(stcb); } else { - av->assoc_value = inp->sctp_ep.sctp_default_ss_module; + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.sctp_default_ss_module; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); } - *optsize = sizeof(*av); + break; } - break; case SCTP_SS_VALUE: { struct sctp_stream_value *av; @@ -1843,7 +1860,7 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } else { - *optsize = sizeof(*av); + *optsize = sizeof(struct sctp_stream_value); } SCTP_TCB_UNLOCK(stcb); } else { @@ -1854,8 +1871,8 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } + break; } - break; case SCTP_GET_ADDR_LEN: { struct sctp_assoc_value *av; @@ -1876,10 +1893,11 @@ flags_out: #endif if (error) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + *optsize = sizeof(struct sctp_assoc_value); } - *optsize = sizeof(*av); + break; } - break; case SCTP_GET_ASSOC_NUMBER: { uint32_t *value, cnt; @@ -1893,9 +1911,8 @@ flags_out: SCTP_INP_RUNLOCK(inp); *value = cnt; *optsize = sizeof(uint32_t); + break; } - break; - case SCTP_GET_ASSOC_ID_LIST: { struct sctp_assoc_ids *ids; @@ -1915,10 +1932,12 @@ flags_out: } } SCTP_INP_RUNLOCK(inp); - ids->gaids_number_of_ids = at; - *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); + if (error == 0) { + ids->gaids_number_of_ids = at; + *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); + } + break; } - break; case SCTP_CONTEXT: { struct sctp_assoc_value *av; @@ -1930,19 +1949,27 @@ flags_out: av->assoc_value = stcb->asoc.context; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - av->assoc_value = inp->sctp_context; - SCTP_INP_RUNLOCK(inp); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_context; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(*av); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; case SCTP_VRF_ID: { uint32_t *default_vrfid; SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize); *default_vrfid = inp->def_vrf_id; + *optsize = sizeof(uint32_t); break; } case SCTP_GET_ASOC_VRF: @@ -1954,9 +1981,10 @@ flags_out: if (stcb == NULL) { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); - break; + } else { + id->assoc_value = stcb->asoc.vrf_id; + *optsize = sizeof(struct sctp_assoc_value); } - id->assoc_value = stcb->asoc.vrf_id; break; } case SCTP_GET_VRF_IDS: @@ -1976,13 +2004,13 @@ flags_out: gnv->gn_peers_tag = stcb->asoc.peer_vtag; gnv->gn_local_tag = stcb->asoc.my_vtag; SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_get_nonce_values); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } - *optsize = sizeof(*gnv); + break; } - break; case SCTP_DELAYED_SACK: { struct sctp_sack_info *sack; @@ -1994,15 +2022,21 @@ flags_out: sack->sack_freq = 
stcb->asoc.sack_freq; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - sack->sack_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); - sack->sack_freq = inp->sctp_ep.sctp_sack_freq; - SCTP_INP_RUNLOCK(inp); + if (sack->sack_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + sack->sack_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); + sack->sack_freq = inp->sctp_ep.sctp_sack_freq; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(*sack); + if (error == 0) { + *optsize = sizeof(struct sctp_sack_info); + } + break; } - break; - case SCTP_GET_SNDBUF_USE: { struct sctp_sockstat *ss; @@ -2015,13 +2049,13 @@ flags_out: ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue + stcb->asoc.size_on_all_streams); SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_sockstat); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } - *optsize = sizeof(struct sctp_sockstat); + break; } - break; case SCTP_MAX_BURST: { struct sctp_assoc_value *av; @@ -2033,14 +2067,20 @@ flags_out: av->assoc_value = stcb->asoc.max_burst; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - av->assoc_value = inp->sctp_ep.max_burst; - SCTP_INP_RUNLOCK(inp); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.max_burst; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(struct sctp_assoc_value); - + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; case SCTP_MAXSEG: { struct sctp_assoc_value *av; @@ -2053,21 +2093,28 @@ flags_out: av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc); SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - ovh = SCTP_MED_OVERHEAD; + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MED_OVERHEAD; + } else { + ovh = SCTP_MED_V4_OVERHEAD; + } + if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) + av->assoc_value = 0; + else + av->assoc_value = inp->sctp_frag_point - ovh; + SCTP_INP_RUNLOCK(inp); } else { - ovh = SCTP_MED_V4_OVERHEAD; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) - av->assoc_value = 0; - else - av->assoc_value = inp->sctp_frag_point - ovh; - SCTP_INP_RUNLOCK(inp); } - *optsize = sizeof(struct sctp_assoc_value); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; } - break; case SCTP_GET_STAT_LOG: error = sctp_fill_stat_log(optval, optsize); break; @@ -2076,7 +2123,7 @@ flags_out: struct sctp_event_subscribe *events; SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize); - memset(events, 0, sizeof(*events)); + memset(events, 0, sizeof(struct sctp_event_subscribe)); SCTP_INP_RLOCK(inp); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) events->sctp_data_io_event = 1; @@ -2112,9 +2159,8 @@ flags_out: events->sctp_stream_reset_event = 1; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(struct sctp_event_subscribe); + break; } - break; - case SCTP_ADAPTATION_LAYER: { uint32_t *value; @@ -2125,8 +2171,8 @@ flags_out: *value = inp->sctp_ep.adaptation_layer_indicator; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); 
+ break; } - break; case SCTP_SET_INITIAL_DBG_SEQ: { uint32_t *value; @@ -2136,8 +2182,8 @@ flags_out: *value = inp->sctp_ep.initial_sequence_debug; SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_GET_LOCAL_ADDR_SIZE: { uint32_t *value; @@ -2147,8 +2193,8 @@ flags_out: *value = sctp_count_max_addresses(inp); SCTP_INP_RUNLOCK(inp); *optsize = sizeof(uint32_t); + break; } - break; case SCTP_GET_REMOTE_ADDR_SIZE: { uint32_t *value; @@ -2184,13 +2230,13 @@ flags_out: } SCTP_TCB_UNLOCK(stcb); *value = (uint32_t) size; + *optsize = sizeof(uint32_t); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); error = ENOTCONN; } - *optsize = sizeof(uint32_t); + break; } - break; case SCTP_GET_PEER_ADDRESSES: /* * Get the address information, an array is passed in to @@ -2260,8 +2306,8 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } + break; } - break; case SCTP_GET_LOCAL_ADDRESSES: { size_t limit, actual; @@ -2278,8 +2324,8 @@ flags_out: SCTP_TCB_UNLOCK(stcb); } *optsize = sizeof(struct sockaddr_storage) + actual; + break; } - break; case SCTP_PEER_ADDR_PARAMS: { struct sctp_paddrparams *paddrp; @@ -2416,38 +2462,45 @@ flags_out: paddrp->spp_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); } else { - /* Use endpoint defaults */ - SCTP_INP_RLOCK(inp); - paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; - paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); - paddrp->spp_assoc_id = (sctp_assoc_t) 0; - /* get inp's default */ + if (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC) { + /* Use endpoint defaults */ + SCTP_INP_RLOCK(inp); + paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; + paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); + paddrp->spp_assoc_id = SCTP_FUTURE_ASSOC; + /* get inp's default */ #ifdef INET - paddrp->spp_ipv4_tos = inp->ip_inp.inp.inp_ip_tos; - paddrp->spp_flags |= SPP_IPV4_TOS; + paddrp->spp_ipv4_tos = inp->ip_inp.inp.inp_ip_tos; + paddrp->spp_flags |= SPP_IPV4_TOS; #endif #ifdef INET6 - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - paddrp->spp_ipv6_flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo; - paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; - } + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + paddrp->spp_ipv6_flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo; + paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; + } #endif - /* can't return this */ - paddrp->spp_pathmtu = 0; + /* can't return this */ + paddrp->spp_pathmtu = 0; - /* default behavior, no stcb */ - paddrp->spp_flags = SPP_PMTUD_ENABLE; + /* default behavior, no stcb */ + paddrp->spp_flags = SPP_PMTUD_ENABLE; - if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { - paddrp->spp_flags |= SPP_HB_ENABLE; + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { + paddrp->spp_flags |= SPP_HB_ENABLE; + } else { + paddrp->spp_flags |= SPP_HB_DISABLE; + } + SCTP_INP_RUNLOCK(inp); } else { - paddrp->spp_flags |= SPP_HB_DISABLE; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_RUNLOCK(inp); } - *optsize = sizeof(struct sctp_paddrparams); + if (error == 0) { + *optsize = sizeof(struct sctp_paddrparams); + } + break; } - break; case SCTP_GET_PEER_ADDR_INFO: { struct sctp_paddrinfo *paddri; @@ -2491,6 +2544,7 @@ flags_out: paddri->spinfo_assoc_id = sctp_get_associd(stcb); paddri->spinfo_mtu = net->mtu; SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct 
sctp_paddrinfo); } else { if (stcb) { SCTP_TCB_UNLOCK(stcb); @@ -2498,9 +2552,8 @@ flags_out: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } - *optsize = sizeof(struct sctp_paddrinfo); + break; } - break; case SCTP_PCB_STATUS: { struct sctp_pcbinfo *spcb; @@ -2508,9 +2561,8 @@ flags_out: SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize); sctp_fill_pcbinfo(spcb); *optsize = sizeof(struct sctp_pcbinfo); + break; } - break; - case SCTP_STATUS: { struct sctp_nets *net; @@ -2520,7 +2572,7 @@ flags_out: SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id); if (stcb == NULL) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } @@ -2569,9 +2621,9 @@ flags_out: sstat->sstat_primary.spinfo_mtu = net->mtu; sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); - *optsize = sizeof(*sstat); + *optsize = sizeof(struct sctp_status); + break; } - break; case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; @@ -2585,15 +2637,22 @@ flags_out: srto->srto_min = stcb->asoc.minrto; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - srto->srto_initial = inp->sctp_ep.initial_rto; - srto->srto_max = inp->sctp_ep.sctp_maxrto; - srto->srto_min = inp->sctp_ep.sctp_minrto; - SCTP_INP_RUNLOCK(inp); + if (srto->srto_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + srto->srto_initial = inp->sctp_ep.initial_rto; + srto->srto_max = inp->sctp_ep.sctp_maxrto; + srto->srto_min = inp->sctp_ep.sctp_minrto; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_rtoinfo); } - *optsize = sizeof(*srto); + break; } - break; case SCTP_TIMEOUTS: { struct sctp_timeouts *stimo; @@ -2610,23 +2669,21 @@ flags_out: stimo->stimo_cookie = stcb->asoc.timocookie; stimo->stimo_shutdownack = stcb->asoc.timoshutdownack; SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_timeouts); } else { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - *optsize = sizeof(*stimo); + break; } - break; case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; - uint32_t oldval; SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize); SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); if (stcb) { - oldval = sasoc->sasoc_cookie_life; sasoc->sasoc_cookie_life = TICKS_TO_MSEC(stcb->asoc.cookie_life); sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times; sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; @@ -2634,17 +2691,24 @@ flags_out: sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); - sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; - sasoc->sasoc_number_peer_destinations = 0; - sasoc->sasoc_peer_rwnd = 0; - sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); - SCTP_INP_RUNLOCK(inp); + if (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); + sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; + sasoc->sasoc_number_peer_destinations = 0; + sasoc->sasoc_peer_rwnd = 0; + sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); + SCTP_INP_RUNLOCK(inp); + } else { + 
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assocparams); } - *optsize = sizeof(*sasoc); + break; } - break; case SCTP_DEFAULT_SEND_PARAM: { struct sctp_sndrcvinfo *s_info; @@ -2656,13 +2720,20 @@ flags_out: memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send)); SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_RLOCK(inp); - memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); - SCTP_INP_RUNLOCK(inp); + if (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - *optsize = sizeof(*s_info); + if (error == 0) { + *optsize = sizeof(struct sctp_sndrcvinfo); + } + break; } - break; case SCTP_INITMSG: { struct sctp_initmsg *sinit; @@ -2674,9 +2745,9 @@ flags_out: sinit->sinit_max_attempts = inp->sctp_ep.max_init_times; sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max; SCTP_INP_RUNLOCK(inp); - *optsize = sizeof(*sinit); + *optsize = sizeof(struct sctp_initmsg); + break; } - break; case SCTP_PRIMARY_ADDR: /* we allow a "get" operation on this */ { @@ -2697,14 +2768,13 @@ flags_out: &stcb->asoc.primary_destination->ro._l_addr, len); SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_setprim); } else { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - *optsize = sizeof(*ssp); + break; } - break; - case SCTP_HMAC_IDENT: { struct sctp_hmacalgo *shmac; @@ -2726,7 +2796,7 @@ flags_out: size = sizeof(*shmac) + (hmaclist->num_algo * sizeof(shmac->shmac_idents[0])); if ((size_t)(*optsize) < size) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; SCTP_INP_RUNLOCK(inp); break; @@ -2752,12 +2822,19 @@ flags_out: scact->scact_keynumber = stcb->asoc.authinfo.active_keyid; SCTP_TCB_UNLOCK(stcb); } else { - /* get the endpoint active key */ - SCTP_INP_RLOCK(inp); - scact->scact_keynumber = inp->sctp_ep.default_keyid; - SCTP_INP_RUNLOCK(inp); + if (scact->scact_assoc_id == SCTP_FUTURE_ASSOC) { + /* get the endpoint active key */ + SCTP_INP_RLOCK(inp); + scact->scact_keynumber = inp->sctp_ep.default_keyid; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_authkeyid); } - *optsize = sizeof(*scact); break; } case SCTP_LOCAL_AUTH_CHUNKS: @@ -2780,24 +2857,30 @@ flags_out: } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { - /* get off the endpoint */ - SCTP_INP_RLOCK(inp); - chklist = inp->sctp_ep.local_auth_chunks; - /* is there enough space? */ - size = sctp_auth_get_chklist_size(chklist); - if (*optsize < (sizeof(struct sctp_authchunks) + size)) { - error = EINVAL; - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + if (sac->gauth_assoc_id == SCTP_FUTURE_ASSOC) { + /* get off the endpoint */ + SCTP_INP_RLOCK(inp); + chklist = inp->sctp_ep.local_auth_chunks; + /* is there enough space? 
*/ + size = sctp_auth_get_chklist_size(chklist); + if (*optsize < (sizeof(struct sctp_authchunks) + size)) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + /* copy in the chunks */ + (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + *optsize = sizeof(struct sctp_authchunks) + size; + } + SCTP_INP_RUNLOCK(inp); } else { - /* copy in the chunks */ - (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_RUNLOCK(inp); } - *optsize = sizeof(struct sctp_authchunks) + size; break; } case SCTP_PEER_AUTH_CHUNKS: @@ -2820,23 +2903,191 @@ flags_out: } else { /* copy in the chunks */ (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + *optsize = sizeof(struct sctp_authchunks) + size; } SCTP_TCB_UNLOCK(stcb); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; } - *optsize = sizeof(struct sctp_authchunks) + size; break; } + case SCTP_EVENT: + { + struct sctp_event *event; + uint32_t event_type; + + SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, *optsize); + SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); + + switch (event->se_type) { + case SCTP_ASSOC_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; + break; + case SCTP_PEER_ADDR_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; + break; + case SCTP_REMOTE_ERROR: + event_type = SCTP_PCB_FLAGS_RECVPEERERR; + break; + case SCTP_SEND_FAILED: + event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; + break; + case SCTP_SHUTDOWN_EVENT: + event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; + break; + case SCTP_ADAPTATION_INDICATION: + event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; + break; + case SCTP_PARTIAL_DELIVERY_EVENT: + event_type = SCTP_PCB_FLAGS_PDAPIEVNT; + break; + case SCTP_AUTHENTICATION_EVENT: + event_type = SCTP_PCB_FLAGS_AUTHEVNT; + break; + case SCTP_STREAM_RESET_EVENT: + event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; + break; + case SCTP_SENDER_DRY_EVENT: + event_type = SCTP_PCB_FLAGS_DRYEVNT; + break; + case SCTP_NOTIFICATIONS_STOPPED_EVENT: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); + error = ENOTSUP; + break; + default: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (event_type > 0) { + if (stcb) { + event->se_on = sctp_stcb_is_feature_on(inp, stcb, event_type); + SCTP_TCB_UNLOCK(stcb); + } else { + if (event->se_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + event->se_on = sctp_is_feature_on(inp, event_type); + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_event); + } + break; + } + case SCTP_RECVRCVINFO: + { + int onoff; + + if (*optsize < sizeof(int)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } else { + SCTP_INP_RUNLOCK(inp); + onoff = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); + SCTP_INP_RUNLOCK(inp); + } + if (error == 0) { + /* return the option value */ + *(int *)optval = onoff; + *optsize = sizeof(int); + } + break; + } + case SCTP_RECVNXTINFO: + { + int onoff; + + if (*optsize < sizeof(int)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } else { + SCTP_INP_RUNLOCK(inp); + onoff = sctp_is_feature_on(inp, 
SCTP_PCB_FLAGS_RECVNXTINFO); + SCTP_INP_RUNLOCK(inp); + } + if (error == 0) { + /* return the option value */ + *(int *)optval = onoff; + *optsize = sizeof(int); + } + break; + } + case SCTP_DEFAULT_SNDINFO: + { + struct sctp_sndinfo *info; + + SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, *optsize); + SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); + + if (stcb) { + info->snd_sid = stcb->asoc.def_send.sinfo_stream; + info->snd_flags = stcb->asoc.def_send.sinfo_flags; + info->snd_flags &= 0xfff0; + info->snd_ppid = stcb->asoc.def_send.sinfo_ppid; + info->snd_context = stcb->asoc.def_send.sinfo_context; + SCTP_TCB_UNLOCK(stcb); + } else { + if (info->snd_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + info->snd_sid = inp->def_send.sinfo_stream; + info->snd_flags = inp->def_send.sinfo_flags; + info->snd_flags &= 0xfff0; + info->snd_ppid = inp->def_send.sinfo_ppid; + info->snd_context = inp->def_send.sinfo_context; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_sndinfo); + } + break; + } + case SCTP_DEFAULT_PRINFO: + { + struct sctp_default_prinfo *info; + SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, *optsize); + SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); + if (stcb) { + info->pr_policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); + info->pr_value = stcb->asoc.def_send.sinfo_timetolive; + SCTP_TCB_UNLOCK(stcb); + } else { + if (info->pr_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_RLOCK(inp); + info->pr_policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags); + info->pr_value = inp->def_send.sinfo_timetolive; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_default_prinfo); + } + break; + } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; - *optsize = 0; break; } /* end switch (sopt->sopt_name) */ + if (error) { + *optsize = 0; + } return (error); } @@ -2949,8 +3200,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); else sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE); + break; } - break; case SCTP_PARTIAL_DELIVERY_POINT: { uint32_t *value; @@ -2962,8 +3213,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, break; } inp->partial_delivery_point = *value; + break; } - break; case SCTP_FRAGMENT_INTERLEAVE: /* not yet until we re-write sctp_recvmsg() */ { @@ -2984,83 +3235,95 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } + break; } - break; case SCTP_CMT_ON_OFF: if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + if (av->assoc_value > SCTP_CMT_MAX) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - if (av->assoc_value > SCTP_CMT_MAX) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - } else { - stcb->asoc.sctp_cmt_on_off = av->assoc_value; - } + stcb->asoc.sctp_cmt_on_off = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { - if (av->assoc_value > SCTP_CMT_MAX) { - 
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - } else { + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_cmt_on_off = av->assoc_value; SCTP_INP_WUNLOCK(inp); } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.sctp_cmt_on_off = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + } + } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; } break; - /* JRS - Set socket option for pluggable congestion control */ case SCTP_PLUGGABLE_CC: { struct sctp_assoc_value *av; struct sctp_nets *net; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + if ((av->assoc_value != SCTP_CC_RFC2581) && + (av->assoc_value != SCTP_CC_HSTCP) && + (av->assoc_value != SCTP_CC_HTCP) && + (av->assoc_value != SCTP_CC_RTCC)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - switch (av->assoc_value) { - case SCTP_CC_RFC2581: - case SCTP_CC_HSTCP: - case SCTP_CC_HTCP: - case SCTP_CC_RTCC: - stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; - stcb->asoc.congestion_control_module = av->assoc_value; - if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { - TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); - } + stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; + stcb->asoc.congestion_control_module = av->assoc_value; + if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); } - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; } SCTP_TCB_UNLOCK(stcb); } else { - switch (av->assoc_value) { - case SCTP_CC_RFC2581: - case SCTP_CC_HSTCP: - case SCTP_CC_HTCP: - case SCTP_CC_RTCC: + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_cc_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value]; + stcb->asoc.congestion_control_module = av->assoc_value; + if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); + } + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } } + break; } - break; case SCTP_CC_OPTION: { struct sctp_cc_option *cc_opt; @@ -3068,7 +3331,19 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, optsize); SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id); if (stcb == NULL) { - error = EINVAL; + if (cc_opt->aid_value.assoc_id == SCTP_CURRENT_ASSOC) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + 
SCTP_TCB_LOCK(stcb); + if (stcb->asoc.cc_functions.sctp_cwnd_socket_option) { + (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt); + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } else { + error = EINVAL; + } } else { if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) { error = ENOTSUP; @@ -3078,54 +3353,54 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } + break; } - break; - /* RS - Set socket option for pluggable stream scheduling */ case SCTP_PLUGGABLE_SS: { struct sctp_assoc_value *av; SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + if ((av->assoc_value != SCTP_SS_DEFAULT) && + (av->assoc_value != SCTP_SS_DEFAULT) && + (av->assoc_value != SCTP_SS_ROUND_ROBIN) && + (av->assoc_value != SCTP_SS_ROUND_ROBIN_PACKET) && + (av->assoc_value != SCTP_SS_PRIORITY) && + (av->assoc_value != SCTP_SS_FAIR_BANDWITH) && + (av->assoc_value != SCTP_SS_FIRST_COME)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - switch (av->assoc_value) { - case SCTP_SS_DEFAULT: - case SCTP_SS_ROUND_ROBIN: - case SCTP_SS_ROUND_ROBIN_PACKET: - case SCTP_SS_PRIORITY: - case SCTP_SS_FAIR_BANDWITH: - case SCTP_SS_FIRST_COME: - stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); - stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; - stcb->asoc.stream_scheduling_module = av->assoc_value; - stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; - } + stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); + stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; + stcb->asoc.stream_scheduling_module = av->assoc_value; + stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); SCTP_TCB_UNLOCK(stcb); } else { - switch (av->assoc_value) { - case SCTP_SS_DEFAULT: - case SCTP_SS_ROUND_ROBIN: - case SCTP_SS_ROUND_ROBIN_PACKET: - case SCTP_SS_PRIORITY: - case SCTP_SS_FAIR_BANDWITH: - case SCTP_SS_FIRST_COME: + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { SCTP_INP_WLOCK(inp); inp->sctp_ep.sctp_default_ss_module = av->assoc_value; SCTP_INP_WUNLOCK(inp); - break; - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; - break; + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 1, 1); + stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value]; + stcb->asoc.stream_scheduling_module = av->assoc_value; + stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } } + break; } - break; case SCTP_SS_VALUE: { struct sctp_stream_value *av; @@ -3140,15 +3415,29 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - /* - * Can't set stream value without - * association - */ - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - error = EINVAL; + if (av->assoc_id == SCTP_CURRENT_ASSOC) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.ss_functions.sctp_ss_set_value(stcb, + &stcb->asoc, 
+ &stcb->asoc.strmout[av->stream_id], + av->stream_value); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + + } else { + /* + * Can't set stream value without + * association + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } + break; } - break; case SCTP_CLR_STAT_LOG: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); error = EOPNOTSUPP; @@ -3164,12 +3453,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, stcb->asoc.context = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - inp->sctp_context = av->assoc_value; - SCTP_INP_WUNLOCK(inp); + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->sctp_context = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.context = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } + break; } - break; case SCTP_VRF_ID: { uint32_t *default_vrfid; @@ -3204,12 +3506,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (sack->sack_delay) { if (sack->sack_delay > SCTP_MAX_SACK_DELAY) sack->sack_delay = SCTP_MAX_SACK_DELAY; + if (MSEC_TO_TICKS(sack->sack_delay) < 1) { + sack->sack_delay = TICKS_TO_MSEC(1); + } } if (stcb) { if (sack->sack_delay) { - if (MSEC_TO_TICKS(sack->sack_delay) < 1) { - sack->sack_delay = TICKS_TO_MSEC(1); - } stcb->asoc.delayed_ack = sack->sack_delay; } if (sack->sack_freq) { @@ -3217,17 +3519,32 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sack->sack_delay) { - if (MSEC_TO_TICKS(sack->sack_delay) < 1) { - sack->sack_delay = TICKS_TO_MSEC(1); + if ((sack->sack_assoc_id == SCTP_FUTURE_ASSOC) || + (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sack->sack_delay) { + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); + } + if (sack->sack_freq) { + inp->sctp_ep.sctp_sack_freq = sack->sack_freq; } - inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); + SCTP_INP_WUNLOCK(inp); } - if (sack->sack_freq) { - inp->sctp_ep.sctp_sack_freq = sack->sack_freq; + if ((sack->sack_assoc_id == SCTP_CURRENT_ASSOC) || + (sack->sack_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (sack->sack_delay) { + stcb->asoc.delayed_ack = sack->sack_delay; + } + if (sack->sack_freq) { + stcb->asoc.sack_freq = sack->sack_freq; + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3255,10 +3572,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize); SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id); - size = optsize - sizeof(*sca); + size = optsize - sizeof(struct sctp_authkey); if (stcb) { - /* set it on the assoc */ shared_keys = &stcb->asoc.shared_keys; /* clear the cached keys for this key id */ sctp_clear_cachedkeys(stcb, sca->sca_keynumber); @@ -3288,39 +3604,76 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_TCB_UNLOCK(stcb); } else { - /* set it on 
the endpoint */ - SCTP_INP_WLOCK(inp); - shared_keys = &inp->sctp_ep.shared_keys; - /* - * clear the cached keys on all assocs for - * this key id - */ - sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); - /* - * create the new shared key and - * insert/replace it - */ - if (size > 0) { - key = sctp_set_key(sca->sca_key, (uint32_t) size); - if (key == NULL) { + if ((sca->sca_assoc_id == SCTP_FUTURE_ASSOC) || + (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + shared_keys = &inp->sctp_ep.shared_keys; + /* + * clear the cached keys on all + * assocs for this key id + */ + sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); + /* + * create the new shared key and + * insert/replace it + */ + if (size > 0) { + key = sctp_set_key(sca->sca_key, (uint32_t) size); + if (key == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_INP_WUNLOCK(inp); + break; + } + } + shared_key = sctp_alloc_sharedkey(); + if (shared_key == NULL) { + sctp_free_key(key); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; SCTP_INP_WUNLOCK(inp); break; } - } - shared_key = sctp_alloc_sharedkey(); - if (shared_key == NULL) { - sctp_free_key(key); - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); - error = ENOMEM; + shared_key->key = key; + shared_key->keyid = sca->sca_keynumber; + error = sctp_insert_sharedkey(shared_keys, shared_key); SCTP_INP_WUNLOCK(inp); - break; } - shared_key->key = key; - shared_key->keyid = sca->sca_keynumber; - error = sctp_insert_sharedkey(shared_keys, shared_key); - SCTP_INP_WUNLOCK(inp); + if ((sca->sca_assoc_id == SCTP_CURRENT_ASSOC) || + (sca->sca_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + shared_keys = &stcb->asoc.shared_keys; + /* + * clear the cached keys for + * this key id + */ + sctp_clear_cachedkeys(stcb, sca->sca_keynumber); + /* + * create the new shared key + * and insert/replace it + */ + if (size > 0) { + key = sctp_set_key(sca->sca_key, (uint32_t) size); + if (key == NULL) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + } + shared_key = sctp_alloc_sharedkey(); + if (shared_key == NULL) { + sctp_free_key(key); + SCTP_TCB_UNLOCK(stcb); + continue; + } + shared_key->key = key; + shared_key->keyid = sca->sca_keynumber; + error = sctp_insert_sharedkey(shared_keys, shared_key); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } break; } @@ -3330,7 +3683,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_hmaclist_t *hmaclist; uint16_t hmacid; uint32_t i; - size_t found; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize); @@ -3381,8 +3733,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_authkeyid *scact; - SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, - optsize); + SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); /* set the active key on the right place */ @@ -3397,16 +3748,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - /* set the active key on the endpoint */ - SCTP_INP_WLOCK(inp); - if (sctp_auth_setactivekey_ep(inp, - scact->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); - error = EINVAL; + if ((scact->scact_assoc_id == SCTP_FUTURE_ASSOC) || + (scact->scact_assoc_id == 
SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sctp_auth_setactivekey_ep(inp, scact->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + if ((scact->scact_assoc_id == SCTP_CURRENT_ASSOC) || + (scact->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_auth_setactivekey(stcb, scact->scact_keynumber); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3414,30 +3774,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_authkeyid *scdel; - SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, - optsize); + SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id); /* delete the key from the right place */ if (stcb) { - if (sctp_delete_sharedkey(stcb, - scdel->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); + if (sctp_delete_sharedkey(stcb, scdel->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sctp_delete_sharedkey_ep(inp, - scdel->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); - error = EINVAL; + if ((scdel->scact_assoc_id == SCTP_FUTURE_ASSOC) || + (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sctp_delete_sharedkey_ep(inp, scdel->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + if ((scdel->scact_assoc_id == SCTP_CURRENT_ASSOC) || + (scdel->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_delete_sharedkey(stcb, scdel->scact_keynumber); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3445,30 +3811,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_authkeyid *keyid; - SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, - optsize); + SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, optsize); SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id); /* deactivate the key from the right place */ if (stcb) { - if (sctp_deact_sharedkey(stcb, - keyid->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); + if (sctp_deact_sharedkey(stcb, keyid->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sctp_deact_sharedkey_ep(inp, - keyid->scact_keynumber)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, - SCTP_FROM_SCTP_USRREQ, - EINVAL); - error = EINVAL; + if ((keyid->scact_assoc_id == SCTP_FUTURE_ASSOC) || + (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (sctp_deact_sharedkey_ep(inp, keyid->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + if ((keyid->scact_assoc_id == SCTP_CURRENT_ASSOC) || + (keyid->scact_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_deact_sharedkey(stcb, 
keyid->scact_keynumber); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); } - SCTP_INP_WUNLOCK(inp); } break; } @@ -3632,9 +4004,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); + break; } - break; - case SCTP_CONNECT_X: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3643,7 +4014,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } error = sctp_do_connect_x(so, inp, optval, optsize, p, 0); break; - case SCTP_CONNECT_X_DELAYED: if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3652,7 +4022,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } error = sctp_do_connect_x(so, inp, optval, optsize, p, 1); break; - case SCTP_CONNECT_X_COMPLETE: { struct sockaddr *sa; @@ -3706,8 +4075,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = EALREADY; } SCTP_TCB_UNLOCK(stcb); + break; } - break; case SCTP_MAX_BURST: { struct sctp_assoc_value *av; @@ -3719,12 +4088,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, stcb->asoc.max_burst = av->assoc_value; SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - inp->sctp_ep.max_burst = av->assoc_value; - SCTP_INP_WUNLOCK(inp); + if ((av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->sctp_ep.max_burst = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.max_burst = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } + break; } - break; case SCTP_MAXSEG: { struct sctp_assoc_value *av; @@ -3746,20 +4128,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - /* - * FIXME MT: I think this is not in tune - * with the API ID - */ - if (av->assoc_value) { - inp->sctp_frag_point = (av->assoc_value + ovh); + if (av->assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + /* + * FIXME MT: I think this is not in + * tune with the API ID + */ + if (av->assoc_value) { + inp->sctp_frag_point = (av->assoc_value + ovh); + } else { + inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + } + SCTP_INP_WUNLOCK(inp); } else { - inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_EVENTS: { struct sctp_event_subscribe *events; @@ -3823,22 +4210,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (events->sctp_sender_dry_event) { sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT); - if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || - (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { - stcb = LIST_FIRST(&inp->sctp_asoc_list); - if (stcb) { - SCTP_TCB_LOCK(stcb); - } - if (stcb && - TAILQ_EMPTY(&stcb->asoc.send_queue) && - TAILQ_EMPTY(&stcb->asoc.sent_queue) && - (stcb->asoc.stream_queue_cnt == 0)) { - sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); - } - if (stcb) { - SCTP_TCB_UNLOCK(stcb); - } - } } else { 
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT);
 			}
@@ -3849,9 +4220,84 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
 			}
 			SCTP_INP_WUNLOCK(inp);
-		}
-		break;
+			SCTP_INP_RLOCK(inp);
+			LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+				SCTP_TCB_LOCK(stcb);
+				if (events->sctp_association_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT);
+				}
+				if (events->sctp_address_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT);
+				}
+				if (events->sctp_send_failure_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
+				}
+				if (events->sctp_peer_error_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR);
+				}
+				if (events->sctp_shutdown_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
+				}
+				if (events->sctp_partial_delivery_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT);
+				}
+				if (events->sctp_adaptation_layer_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
+				}
+				if (events->sctp_authentication_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT);
+				}
+				if (events->sctp_sender_dry_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT);
+				}
+				if (events->sctp_stream_reset_event) {
+					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
+				} else {
+					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
+				}
+				SCTP_TCB_UNLOCK(stcb);
+			}
+			/*
+			 * Send up the sender dry event only for 1-to-1
+			 * style sockets.
+ */ + if (events->sctp_sender_dry_event) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb) { + SCTP_TCB_LOCK(stcb); + if (TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); + } + SCTP_TCB_UNLOCK(stcb); + } + } + } + SCTP_INP_RUNLOCK(inp); + break; + } case SCTP_ADAPTATION_LAYER: { struct sctp_setadaptation *adap_bits; @@ -3860,8 +4306,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_INP_WLOCK(inp); inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind; SCTP_INP_WUNLOCK(inp); + break; } - break; #ifdef SCTP_DEBUG case SCTP_SET_INITIAL_DBG_SEQ: { @@ -3871,8 +4317,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_INP_WLOCK(inp); inp->sctp_ep.initial_sequence_debug = *vvv; SCTP_INP_WUNLOCK(inp); + break; } - break; #endif case SCTP_DEFAULT_SEND_PARAM: { @@ -3882,7 +4328,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); if (stcb) { - if (s_info->sinfo_stream <= stcb->asoc.streamoutcnt) { + if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3890,12 +4336,27 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); - SCTP_INP_WUNLOCK(inp); + if ((s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) || + (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); + SCTP_INP_WUNLOCK(inp); + } + if ((s_info->sinfo_assoc_id == SCTP_CURRENT_ASSOC) || + (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) { + memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } } + break; } - break; case SCTP_PEER_ADDR_PARAMS: /* Applys to the specific association */ { @@ -4116,31 +4577,37 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); } else { /************************NO TCB, SET TO default stuff ******************/ - SCTP_INP_WLOCK(inp); - /* - * For the TOS/FLOWLABEL stuff you set it - * with the options on the socket - */ - if (paddrp->spp_pathmaxrxt) { - inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; - } - if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) - inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; - else if (paddrp->spp_hbinterval) { - if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) - paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; - inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); - } - if (paddrp->spp_flags & SPP_HB_ENABLE) { - sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + if (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + /* + * For the TOS/FLOWLABEL stuff you + * set it with the options on the + * socket + */ + if 
(paddrp->spp_pathmaxrxt) { + inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; + } + if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; + else if (paddrp->spp_hbinterval) { + if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) + paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); + } + if (paddrp->spp_flags & SPP_HB_ENABLE) { + sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); - } else if (paddrp->spp_flags & SPP_HB_DISABLE) { - sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + } else if (paddrp->spp_flags & SPP_HB_DISABLE) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_RTOINFO: { struct sctp_rtoinfo *srto; @@ -4172,31 +4639,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (srto->srto_initial) - new_init = srto->srto_initial; - else - new_init = inp->sctp_ep.initial_rto; - if (srto->srto_max) - new_max = srto->srto_max; - else - new_max = inp->sctp_ep.sctp_maxrto; - if (srto->srto_min) - new_min = srto->srto_min; - else - new_min = inp->sctp_ep.sctp_minrto; - if ((new_min <= new_init) && (new_init <= new_max)) { - inp->sctp_ep.initial_rto = new_init; - inp->sctp_ep.sctp_maxrto = new_max; - inp->sctp_ep.sctp_minrto = new_min; + if (srto->srto_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + if (srto->srto_initial) + new_init = srto->srto_initial; + else + new_init = inp->sctp_ep.initial_rto; + if (srto->srto_max) + new_max = srto->srto_max; + else + new_max = inp->sctp_ep.sctp_maxrto; + if (srto->srto_min) + new_min = srto->srto_min; + else + new_min = inp->sctp_ep.sctp_minrto; + if ((new_min <= new_init) && (new_init <= new_max)) { + inp->sctp_ep.initial_rto = new_init; + inp->sctp_ep.sctp_maxrto = new_max; + inp->sctp_ep.sctp_minrto = new_min; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_ASSOCINFO: { struct sctp_assocparams *sasoc; @@ -4214,27 +4686,26 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (stcb) { if (sasoc->sasoc_asocmaxrxt) stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt; - sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; - sasoc->sasoc_peer_rwnd = 0; - sasoc->sasoc_local_rwnd = 0; if (sasoc->sasoc_cookie_life) { stcb->asoc.cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); } SCTP_TCB_UNLOCK(stcb); } else { - SCTP_INP_WLOCK(inp); - if (sasoc->sasoc_asocmaxrxt) - inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; - sasoc->sasoc_number_peer_destinations = 0; - sasoc->sasoc_peer_rwnd = 0; - sasoc->sasoc_local_rwnd = 0; - if (sasoc->sasoc_cookie_life) { - inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); + if (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC) { + SCTP_INP_WLOCK(inp); + if (sasoc->sasoc_asocmaxrxt) + inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; + if (sasoc->sasoc_cookie_life) { + inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); + } + SCTP_INP_WUNLOCK(inp); + } else { + 
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } - SCTP_INP_WUNLOCK(inp); } + break; } - break; case SCTP_INITMSG: { struct sctp_initmsg *sinit; @@ -4253,12 +4724,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (sinit->sinit_max_init_timeo) inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo; SCTP_INP_WUNLOCK(inp); + break; } - break; case SCTP_PRIMARY_ADDR: { struct sctp_setprim *spa; - struct sctp_nets *net, *lnet; + struct sctp_nets *net; SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize); SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id); @@ -4287,7 +4758,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if ((net != stcb->asoc.primary_destination) && (!(net->dest_state & SCTP_ADDR_UNCONFIRMED))) { /* Ok we need to set it */ - lnet = stcb->asoc.primary_destination; if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { if (net->dest_state & SCTP_ADDR_SWITCH_PRIMARY) { net->dest_state |= SCTP_ADDR_DOUBLE_SWITCH; @@ -4302,8 +4772,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (stcb) { SCTP_TCB_UNLOCK(stcb); } + break; } - break; case SCTP_SET_DYNAMIC_PRIMARY: { union sctp_sockstore *ss; @@ -4316,8 +4786,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize); /* SUPER USER CHECK? */ error = sctp_dynamic_set_primary(&ss->sa, vrf_id); + break; } - break; case SCTP_SET_PEER_PRIMARY_ADDR: { struct sctp_setpeerprim *sspp; @@ -4370,9 +4840,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - + break; } - break; case SCTP_BINDX_ADD_ADDR: { struct sctp_getaddresses *addrs; @@ -4418,8 +4887,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_bindx_add_address(so, inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error, p); + break; } - break; case SCTP_BINDX_REM_ADDR: { struct sctp_getaddresses *addrs; @@ -4465,8 +4934,232 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, sctp_bindx_delete_address(so, inp, addrs->addr, addrs->sget_assoc_id, vrf_id, &error); + break; + } + case SCTP_EVENT: + { + struct sctp_event *event; + uint32_t event_type; + + SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, optsize); + SCTP_FIND_STCB(inp, stcb, event->se_assoc_id); + switch (event->se_type) { + case SCTP_ASSOC_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT; + break; + case SCTP_PEER_ADDR_CHANGE: + event_type = SCTP_PCB_FLAGS_RECVPADDREVNT; + break; + case SCTP_REMOTE_ERROR: + event_type = SCTP_PCB_FLAGS_RECVPEERERR; + break; + case SCTP_SEND_FAILED: + event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT; + break; + case SCTP_SHUTDOWN_EVENT: + event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT; + break; + case SCTP_ADAPTATION_INDICATION: + event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT; + break; + case SCTP_PARTIAL_DELIVERY_EVENT: + event_type = SCTP_PCB_FLAGS_PDAPIEVNT; + break; + case SCTP_AUTHENTICATION_EVENT: + event_type = SCTP_PCB_FLAGS_AUTHEVNT; + break; + case SCTP_STREAM_RESET_EVENT: + event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT; + break; + case SCTP_SENDER_DRY_EVENT: + event_type = SCTP_PCB_FLAGS_DRYEVNT; + break; + case SCTP_NOTIFICATIONS_STOPPED_EVENT: + event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); + error = ENOTSUP; + break; + default: + 
event_type = 0; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (event_type > 0) { + if (stcb) { + if (event->se_on) { + sctp_stcb_feature_on(inp, stcb, event_type); + if (event_type == SCTP_PCB_FLAGS_DRYEVNT) { + if (TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); + } + } + } else { + sctp_stcb_feature_off(inp, stcb, event_type); + } + SCTP_TCB_UNLOCK(stcb); + } else { + /* + * We don't want to send up a storm + * of events, so return an error for + * sender dry events + */ + if ((event_type == SCTP_PCB_FLAGS_DRYEVNT) && + ((event->se_assoc_id == SCTP_ALL_ASSOC) || + (event->se_assoc_id == SCTP_CURRENT_ASSOC))) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP); + error = ENOTSUP; + break; + } + if ((event->se_assoc_id == SCTP_FUTURE_ASSOC) || + (event->se_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (event->se_on) { + sctp_feature_on(inp, event_type); + } else { + sctp_feature_off(inp, event_type); + } + SCTP_INP_WUNLOCK(inp); + } + if ((event->se_assoc_id == SCTP_CURRENT_ASSOC) || + (event->se_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if (event->se_on) { + sctp_stcb_feature_on(inp, stcb, event_type); + } else { + sctp_stcb_feature_off(inp, stcb, event_type); + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + } + } + break; + } + case SCTP_RECVRCVINFO: + { + int *onoff; + + SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); + SCTP_INP_WLOCK(inp); + if (*onoff != 0) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO); + } + SCTP_INP_WUNLOCK(inp); + break; + } + case SCTP_RECVNXTINFO: + { + int *onoff; + + SCTP_CHECK_AND_CAST(onoff, optval, int, optsize); + SCTP_INP_WLOCK(inp); + if (*onoff != 0) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO); + } + SCTP_INP_WUNLOCK(inp); + break; + } + case SCTP_DEFAULT_SNDINFO: + { + struct sctp_sndinfo *info; + uint16_t policy; + + SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, optsize); + SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id); + + if (stcb) { + if (info->snd_sid < stcb->asoc.streamoutcnt) { + stcb->asoc.def_send.sinfo_stream = info->snd_sid; + policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); + stcb->asoc.def_send.sinfo_flags = info->snd_flags; + stcb->asoc.def_send.sinfo_flags |= policy; + stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; + stcb->asoc.def_send.sinfo_context = info->snd_context; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + if ((info->snd_assoc_id == SCTP_FUTURE_ASSOC) || + (info->snd_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->def_send.sinfo_stream = info->snd_sid; + policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags); + inp->def_send.sinfo_flags = info->snd_flags; + inp->def_send.sinfo_flags |= policy; + inp->def_send.sinfo_ppid = info->snd_ppid; + inp->def_send.sinfo_context = info->snd_context; + SCTP_INP_WUNLOCK(inp); + } + if ((info->snd_assoc_id == SCTP_CURRENT_ASSOC) || + (info->snd_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + if 
(info->snd_sid < stcb->asoc.streamoutcnt) { + stcb->asoc.def_send.sinfo_stream = info->snd_sid; + policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags); + stcb->asoc.def_send.sinfo_flags = info->snd_flags; + stcb->asoc.def_send.sinfo_flags |= policy; + stcb->asoc.def_send.sinfo_ppid = info->snd_ppid; + stcb->asoc.def_send.sinfo_context = info->snd_context; + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + } + break; + } + case SCTP_DEFAULT_PRINFO: + { + struct sctp_default_prinfo *info; + + SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, optsize); + SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); + + if (PR_SCTP_INVALID_POLICY(info->pr_policy)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (stcb) { + stcb->asoc.def_send.sinfo_flags &= 0xfff0; + stcb->asoc.def_send.sinfo_flags |= info->pr_policy; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((info->pr_assoc_id == SCTP_FUTURE_ASSOC) || + (info->pr_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->def_send.sinfo_flags &= 0xfff0; + inp->def_send.sinfo_flags |= info->pr_policy; + SCTP_INP_WUNLOCK(inp); + } + if ((info->pr_assoc_id == SCTP_CURRENT_ASSOC) || + (info->pr_assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.def_send.sinfo_flags &= 0xfff0; + stcb->asoc.def_send.sinfo_flags |= info->pr_policy; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + } + break; } - break; default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; diff --git a/sys/netinet/sctp_var.h b/sys/netinet/sctp_var.h index 1e17900..e48dfe4 100644 --- a/sys/netinet/sctp_var.h +++ b/sys/netinet/sctp_var.h @@ -50,6 +50,30 @@ extern struct pr_usrreqs sctp_usrreqs; #define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature) #define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0) +#define sctp_stcb_feature_on(inp, stcb, feature) {\ + if (stcb) { \ + stcb->asoc.sctp_features |= feature; \ + } else { \ + inp->sctp_features |= feature; \ + } \ +} +#define sctp_stcb_feature_off(inp, stcb, feature) {\ + if (stcb) { \ + stcb->asoc.sctp_features &= ~feature; \ + } else { \ + inp->sctp_features &= ~feature; \ + } \ +} +#define sctp_stcb_is_feature_on(inp, stcb, feature) \ + (((stcb != NULL) && \ + ((stcb->asoc.sctp_features & feature) == feature)) || \ + ((stcb == NULL) && \ + ((inp->sctp_features & feature) == feature))) +#define sctp_stcb_is_feature_off(inp, stcb, feature) \ + (((stcb != NULL) && \ + ((stcb->asoc.sctp_features & feature) == 0)) || \ + ((stcb == NULL) && \ + ((inp->sctp_features & feature) == 0))) /* managing mobility_feature in inpcb (by micchie) */ #define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature) diff --git a/sys/netinet/sctputil.c b/sys/netinet/sctputil.c index 39df039..9a8bd2e 100644 --- a/sys/netinet/sctputil.c +++ b/sys/netinet/sctputil.c @@ -923,6 +923,7 @@ sctp_init_asoc(struct sctp_inpcb *m, struct sctp_tcb *stcb, asoc->sctp_nr_sack_on_off = (uint8_t) SCTP_BASE_SYSCTL(sctp_nr_sack_on_off); asoc->sctp_cmt_pf = (uint8_t) SCTP_BASE_SYSCTL(sctp_cmt_pf); asoc->sctp_frag_point = m->sctp_frag_point; + asoc->sctp_features = m->sctp_features; #ifdef INET asoc->default_tos = m->ip_inp.inp.inp_ip_tos; #else @@ -2760,7 +2761,7 @@ sctp_notify_assoc_change(uint32_t event, struct sctp_tcb *stcb, } #endif } - if 
(sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { /* event not enabled */ return; } @@ -2831,7 +2832,7 @@ sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, struct sctp_paddr_change *spc; struct sctp_queued_to_read *control; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVPADDREVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT)) { /* event not enabled */ return; } @@ -2914,7 +2915,7 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint32_t error, struct sctp_queued_to_read *control; int length; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { /* event not enabled */ return; } @@ -2997,7 +2998,7 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, struct sctp_queued_to_read *control; int length; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { /* event not enabled */ return; } @@ -3067,7 +3068,7 @@ sctp_notify_adaptation_layer(struct sctp_tcb *stcb, struct sctp_adaptation_event *sai; struct sctp_queued_to_read *control; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) { /* event not enabled */ return; } @@ -3118,7 +3119,7 @@ sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error, struct sctp_queued_to_read *control; struct sockbuf *sb; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_PDAPIEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_PDAPIEVNT)) { /* event not enabled */ return; } @@ -3231,7 +3232,7 @@ sctp_notify_shutdown_event(struct sctp_tcb *stcb) SCTP_SOCKET_UNLOCK(so, 1); #endif } - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) { /* event not enabled */ return; } @@ -3278,7 +3279,7 @@ sctp_notify_sender_dry_event(struct sctp_tcb *stcb, struct sctp_sender_dry_event *event; struct sctp_queued_to_read *control; - if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_DRYEVNT)) { + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DRYEVNT)) { /* event not enabled */ return; } @@ -5490,7 +5491,8 @@ found_one: if ((sinfo) && filling_sinfo) { memcpy(sinfo, control, sizeof(struct sctp_nonpad_sndrcvinfo)); nxt = TAILQ_NEXT(control, next); - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || + sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { struct sctp_extrcvinfo *s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; @@ -5997,7 +5999,8 @@ out: if (((out_flags & MSG_EOR) == 0) && ((in_flags & MSG_PEEK) == 0) && (sinfo) && - (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO))) { + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || + sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO))) { struct sctp_extrcvinfo *s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; @@ -6147,8 +6150,9 @@ sctp_soreceive(struct socket *so, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); return (EINVAL); } - if ((sctp_is_feature_off(inp, - SCTP_PCB_FLAGS_RECVDATAIOEVNT)) || + if ((sctp_is_feature_off(inp, 
SCTP_PCB_FLAGS_RECVDATAIOEVNT) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) || (controlp == NULL)) { /* user does not want the sndrcv ctl */ filling_sinfo = 0; @@ -6184,71 +6188,6 @@ sctp_soreceive(struct socket *so, } -int -sctp_l_soreceive(struct socket *so, - struct sockaddr **name, - struct uio *uio, - char **controlp, - int *controllen, - int *flag) -{ - int error, fromlen; - uint8_t sockbuf[256]; - struct sockaddr *from; - struct sctp_extrcvinfo sinfo; - int filling_sinfo = 1; - struct sctp_inpcb *inp; - - inp = (struct sctp_inpcb *)so->so_pcb; - /* pickup the assoc we are reading from */ - if (inp == NULL) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); - return (EINVAL); - } - if ((sctp_is_feature_off(inp, - SCTP_PCB_FLAGS_RECVDATAIOEVNT)) || - (controlp == NULL)) { - /* user does not want the sndrcv ctl */ - filling_sinfo = 0; - } - if (name) { - from = (struct sockaddr *)sockbuf; - fromlen = sizeof(sockbuf); - from->sa_len = 0; - } else { - from = NULL; - fromlen = 0; - } - - error = sctp_sorecvmsg(so, uio, - (struct mbuf **)NULL, - from, fromlen, flag, - (struct sctp_sndrcvinfo *)&sinfo, - filling_sinfo); - if ((controlp) && (filling_sinfo)) { - /* - * copy back the sinfo in a CMSG format note that the caller - * has reponsibility for freeing the memory. - */ - if (filling_sinfo) - *controlp = sctp_build_ctl_cchunk(inp, - controllen, - (struct sctp_sndrcvinfo *)&sinfo); - } - if (name) { - /* copy back the address info */ - if (from && from->sa_len) { - *name = sodupsockaddr(from, M_WAIT); - } else { - *name = NULL; - } - } - return (error); -} - - - - diff --git a/sys/netinet/sctputil.h b/sys/netinet/sctputil.h index 69983e0..460adc7 100644 --- a/sys/netinet/sctputil.h +++ b/sys/netinet/sctputil.h @@ -328,20 +328,6 @@ sctp_soreceive(struct socket *so, struct sockaddr **psa, struct mbuf **controlp, int *flagsp); - -/* For those not passing mbufs, this does the - * translations for you. Caller owns memory - * of size controllen returned in controlp. - */ -int -sctp_l_soreceive(struct socket *so, - struct sockaddr **name, - struct uio *uio, - char **controlp, - int *controllen, - int *flag); - - void sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d); diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 9d11262..6145a54 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -696,17 +696,16 @@ siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, /* We need the tcbinfo lock. */ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - INP_INFO_RLOCK(&V_tcbinfo); if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? - in_pcblookup_hash(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, - dport, 0, m->m_pkthdr.rcvif) + in_pcblookup(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, + dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 - in6_pcblookup_hash(&V_tcbinfo, + in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_src, sport, - &((struct ip6_hdr *)ip)->ip6_dst, dport, 0, + &((struct ip6_hdr *)ip)->ip6_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL @@ -715,13 +714,13 @@ siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, else inp = (ipver == INP_IPV4 ? 
- in_pcblookup_hash(&V_tcbinfo, ip->ip_dst, dport, ip->ip_src, - sport, 0, m->m_pkthdr.rcvif) + in_pcblookup(&V_tcbinfo, ip->ip_dst, dport, ip->ip_src, + sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 - in6_pcblookup_hash(&V_tcbinfo, + in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_dst, dport, - &((struct ip6_hdr *)ip)->ip6_src, sport, 0, + &((struct ip6_hdr *)ip)->ip6_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL @@ -734,12 +733,7 @@ siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, ss->nskip_in_inpcb++; else ss->nskip_out_inpcb++; - } else { - /* Acquire the inpcb lock. */ - INP_UNLOCK_ASSERT(inp); - INP_RLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); return (inp); } diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 1a94d0a..e3e9aa6 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -5,6 +5,7 @@ * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet @@ -16,6 +17,9 @@ * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -197,10 +201,6 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); -int tcp_read_locking = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, - &tcp_read_locking, 0, "Enable read locking strategy"); - VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); @@ -591,8 +591,7 @@ tcp_input(struct mbuf *m, int off0) char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 -#define TI_RLOCKED 2 -#define TI_WLOCKED 3 +#define TI_WLOCKED 2 #ifdef TCPDEBUG /* @@ -756,30 +755,25 @@ tcp_input(struct mbuf *m, int off0) drop_hdrlen = off0 + off; /* - * Locate pcb for segment, which requires a lock on tcbinfo. - * Optimisticaly acquire a global read lock rather than a write lock - * unless header flags necessarily imply a state change. There are - * two cases where we might discover later we need a write lock - * despite the flags: ACKs moving a connection out of the syncache, - * and ACKs for a connection in TIMEWAIT. + * Locate pcb for segment; if we're likely to add or remove a + * connection then first acquire pcbinfo lock. There are two cases + * where we might discover later we need a write lock despite the + * flags: ACKs moving a connection out of the syncache, and ACKs for + * a connection in TIMEWAIT. 
*/ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; - } else { - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - } + } else + ti_locked = TI_UNLOCKED; findpcb: #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: findpcb ti_locked %d\n", __func__, ti_locked); + } else { + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif #ifdef INET @@ -797,20 +791,20 @@ findpcb: * Transparently forwarded. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - 0, m->m_pkthdr.rcvif); + inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); if (!inp) { - /* It's new. Try to find the ambushing socket. */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - next_hop->sin_addr, - next_hop->sin_port ? - ntohs(next_hop->sin_port) : - th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + /* + * It's new. Try to find the ambushing socket. + * Because we've rewritten the destination address, + * any hardware-generated hash is ignored. + */ + inp = in_pcblookup(&V_tcbinfo, ip->ip_src, + th->th_sport, next_hop->sin_addr, + next_hop->sin_port ? ntohs(next_hop->sin_port) : + th->th_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); @@ -820,21 +814,19 @@ findpcb: { #ifdef INET6 if (isipv6) - inp = in6_pcblookup_hash(&V_tcbinfo, - &ip6->ip6_src, th->th_sport, - &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, + th->th_sport, &ip6->ip6_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, + th->th_sport, ip->ip_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); #endif } @@ -865,7 +857,7 @@ findpcb: rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } - INP_WLOCK(inp); + INP_WLOCK_ASSERT(inp); if (!(inp->inp_flags & INP_HW_FLOWID) && (m->m_flags & M_FLOWID) && ((inp->inp_socket == NULL) @@ -906,28 +898,26 @@ findpcb: * legitimate new connection attempt the old INPCB gets removed and * we can try again to find a listening socket. * - * At this point, due to earlier optimism, we may hold a read lock on - * the inpcbinfo, rather than a write lock. If so, we need to - * upgrade, or if that fails, acquire a reference on the inpcb, drop - * all locks, acquire a global write lock, and then re-acquire the - * inpcb lock. We may at that point discover that another thread has - * tried to free the inpcb, in which case we need to loop back and - * try to find a new inpcb to deliver to. + * At this point, due to earlier optimism, we may hold only an inpcb + * lock, and not the inpcbinfo write lock. 
If so, we need to try to + * acquire it, or if that fails, acquire a reference on the inpcb, + * drop all locks, acquire a global write lock, and then re-acquire + * the inpcb lock. We may at that point discover that another thread + * has tried to free the inpcb, in which case we need to loop back + * and try to find a new inpcb to deliver to. + * + * XXXRW: It may be time to rethink timewait locking. */ relocked: if (inp->inp_flags & INP_TIMEWAIT) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } @@ -975,26 +965,24 @@ relocked: /* * We've identified a valid inpcb, but it could be that we need an - * inpcbinfo write lock and have only a read lock. In this case, - * attempt to upgrade/relock using the same strategy as the TIMEWAIT - * case above. If we relock, we have to jump back to 'relocked' as - * the connection might now be in TIMEWAIT. + * inpcbinfo write lock but don't hold it. In this case, attempt to + * acquire using the same strategy as the TIMEWAIT case above. If we + * relock, we have to jump back to 'relocked' as the connection might + * now be in TIMEWAIT. */ - if (tp->t_state != TCPS_ESTABLISHED || - (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: upgrade check ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { +#ifdef INVARIANTS + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); +#endif + if (tp->t_state != TCPS_ESTABLISHED) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } @@ -1027,13 +1015,16 @@ relocked: /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection - * attempt or the completion of a previous one. + * attempt or the completion of a previous one. Because listen + * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be + * held in this case. 
*/ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); bzero(&inc, sizeof(inc)); #ifdef INET6 @@ -1371,13 +1362,17 @@ relocked: return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); - ti_locked = TI_UNLOCKED; + ti_locked = TI_UNLOCKED; + } +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); @@ -1388,13 +1383,17 @@ dropwithreset: goto drop; dropunlock: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropunlock ti_locked %d", __func__, ti_locked); - ti_locked = TI_UNLOCKED; + ti_locked = TI_UNLOCKED; + } +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif if (inp != NULL) INP_WUNLOCK(inp); @@ -1449,13 +1448,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: ti_locked %d for EST", __func__, - ti_locked); + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1601,13 +1600,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure ACK", - __func__, ti_locked); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1708,13 +1702,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing on the reassembly queue and we have enough * buffer space to take it. 
*/ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure data " - "segment", __func__, ti_locked); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -2550,9 +2539,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } process_ACK: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); @@ -2716,9 +2702,6 @@ process_ACK: } step6: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: step6 ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2804,9 +2787,6 @@ step6: tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dodata ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2938,13 +2918,8 @@ dodata: /* XXX */ return; } } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dodata epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -2973,9 +2948,6 @@ check_delack: return; dropafterack: - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); - /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. @@ -3002,13 +2974,8 @@ dropafterack: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropafterack epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; @@ -3018,12 +2985,8 @@ dropafterack: return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -3034,15 +2997,14 @@ dropwithreset: return; drop: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + } #ifdef INVARIANTS else INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); #endif - ti_locked = TI_UNLOCKED; /* * Drop space held by incoming segment and return. 
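A condensed sketch of the locking policy tcp_input() is left with after this change, assembled from the hunks above rather than quoted verbatim: the global pcbinfo write lock is taken up front only when SYN/FIN/RST flags may imply a connection state change; otherwise the lookup itself returns a write-locked inpcb, and the pcbinfo lock is acquired later, only if needed, via the try-lock/refcount sequence. Identifiers are the ones used in tcp_input() above; error handling and the IPv6/forwarding variants are omitted.

	/* Take the global lock only when connection setup/teardown is likely. */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
		INP_INFO_WLOCK(&V_tcbinfo);
		ti_locked = TI_WLOCKED;
	} else
		ti_locked = TI_UNLOCKED;

	/* The lookup now returns the inpcb already write-locked. */
	inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
	    ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
	    m->m_pkthdr.rcvif, m);

	/*
	 * If the segment later turns out to need the global lock (TIMEWAIT,
	 * or a connection not in ESTABLISHED), try to acquire it without
	 * blocking; on failure, pin the inpcb with a reference, drop its
	 * lock to respect the lock order, take the global lock, and relock.
	 */
	if (ti_locked == TI_UNLOCKED) {
		if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
			in_pcbref(inp);
			INP_WUNLOCK(inp);
			INP_INFO_WLOCK(&V_tcbinfo);
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp)) {
				/* inpcb was freed meanwhile; redo the lookup. */
				inp = NULL;
				goto findpcb;
			}
		}
		ti_locked = TI_WLOCKED;
	}
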
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 4b5fa10..4542ac5 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1102,8 +1102,15 @@ send: m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } +#ifdef IPSEC + KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), + ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", + __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); +#else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), - ("%s: mbuf chain shorter than expected", __func__)); + ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", + __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); +#endif /* * In transmit state, time the transmission and arrange for @@ -1331,7 +1338,7 @@ out: * then remember the size of the advertised window. * Any pending ACK has now been sent. */ - if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) + if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 2c013be..6ed58911 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -300,7 +300,8 @@ tcp_init(void) hashsize = 512; /* safe default */ } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. @@ -1184,9 +1185,9 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); @@ -1228,12 +1229,9 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, - addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1241,10 +1239,8 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1286,23 +1282,20 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) return (EINVAL); } - INP_INFO_RLOCK(&V_tcbinfo); #ifdef INET if (mapped == 1) - inp = in_pcblookup_hash(&V_tcbinfo, + inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], - addrs[0].sin6_port, - 0, NULL); + addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif - inp = in6_pcblookup_hash(&V_tcbinfo, + inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, - &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); + &addrs[0].sin6_addr, addrs[0].sin6_port, + INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1310,10 +1303,8 @@ 
tcp6_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1374,10 +1365,9 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_WLOCK(inp); if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { @@ -2154,20 +2144,19 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: - inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr, - fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, - NULL); + inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, + fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, + INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: - inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, - fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, + lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { - INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 8262f43..66e4732 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/systm.h> @@ -661,6 +662,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); + INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; @@ -675,8 +677,14 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) #ifdef INET6 } #endif + + /* + * Install in the reservation hash table for now, but don't yet + * install a connection group since the full 4-tuple isn't yet + * configured. + */ inp->inp_lport = sc->sc_inc.inc_lport; - if ((error = in_pcbinshash(inp)) != 0) { + if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. 
@@ -694,6 +702,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) s, __func__, error); free(s, M_TCPLOG); } + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef IPSEC @@ -728,8 +737,8 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; - if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6, - thread0.td_ucred)) != 0) { + if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, + thread0.td_ucred, m)) != 0) { inp->in6p_laddr = laddr6; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " @@ -737,6 +746,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) s, __func__, error); free(s, M_TCPLOG); } + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ @@ -767,8 +777,8 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; - if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin, - thread0.td_ucred)) != 0) { + if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, + thread0.td_ucred, m)) != 0) { inp->inp_laddr = laddr; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " @@ -776,10 +786,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) s, __func__, error); free(s, M_TCPLOG); } + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } #endif /* INET */ + INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 5c2c5c2..73984c7 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -490,7 +490,7 @@ tcp_timer_rexmt(void * xtp) INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 318fe27..96cb1e4 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -2,8 +2,12 @@ * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -251,7 +255,6 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); @@ -261,11 +264,12 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } tp = intotcpcb(inp); TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -292,7 +296,6 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); @@ -302,6 +305,7 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } tp = intotcpcb(inp); TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; #ifdef INET @@ -316,15 +320,16 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif error = in6_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -341,7 +346,6 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); @@ -353,8 +357,10 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); + INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); @@ -365,7 +371,6 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) out: TCPDEBUG2(PRU_LISTEN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ @@ -379,7 +384,6 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); @@ -391,12 +395,14 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); + INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } + INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); @@ -406,7 +412,6 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) out: TCPDEBUG2(PRU_LISTEN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -440,7 +445,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) return (error); TCPDEBUG0; - 
INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); @@ -456,7 +460,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ @@ -482,7 +485,6 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); @@ -493,6 +495,11 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) tp = intotcpcb(inp); TCPDEBUG1(); #ifdef INET + /* + * XXXRW: Some confusion: V4/V6 flags relate to binding, and + * therefore probably require the hash lock, which isn't held here. + * Is this a significant problem? + */ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; @@ -525,7 +532,6 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -639,6 +645,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -664,6 +671,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) out: TCPDEBUG2(PRU_ACCEPT); INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); @@ -750,25 +758,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - int headlocked = 0; #ifdef INET6 int isipv6; #endif TCPDEBUG0; /* - * We require the pcbinfo lock in two cases: - * - * (1) An implied connect is taking place, which can result in - * binding IPs and ports and hence modification of the pcb hash - * chains. - * - * (2) PRUS_EOF is set, resulting in explicit close on the send. + * We require the pcbinfo lock if we will close the socket as part of + * this call. */ - if ((nam != NULL) || (flags & PRUS_EOF)) { + if (flags & PRUS_EOF) INP_INFO_WLOCK(&V_tcbinfo); - headlocked = 1; - } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -805,7 +805,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * initialize maxseg/maxopd using peer's cached * MSS. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); @@ -830,10 +829,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, socantsendmore(so); tcp_usrclosed(tp); } - if (headlocked) { - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; - } if (!(inp->inp_flags & INP_DROPPED)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; @@ -869,7 +864,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * initialize maxseg/maxopd using peer's cached * MSS. 
*/ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); @@ -884,11 +878,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; - } else if (nam) { - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; @@ -899,7 +888,7 @@ out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); - if (headlocked) + if (flags & PRUS_EOF) INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -1087,13 +1076,13 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) u_short lport; int error; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) - return error; + goto out; } /* @@ -1106,11 +1095,14 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) - return error; - if (oinp) - return EADDRINUSE; + goto out; + if (oinp) { + error = EADDRINUSE; + goto out; + } inp->inp_laddr = laddr; in_pcbrehash(inp); + INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: @@ -1129,6 +1121,10 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tcp_sendseqinit(tp); return 0; + +out: + INP_HASH_WUNLOCK(&V_tcbinfo); + return (error); } #endif /* INET */ @@ -1142,13 +1138,13 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) struct in6_addr addr6; int error; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) - return error; + goto out; } /* @@ -1156,18 +1152,23 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. * in6_pcbladdr() also handles scope zone IDs. + * + * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked() + * outside of in6_pcb.c if there were an in6_pcbconnect_setup(). */ error = in6_pcbladdr(inp, nam, &addr6); if (error) - return error; - oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + goto out; + oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL); - if (oinp) - return EADDRINUSE; + if (oinp) { + error = EADDRINUSE; + goto out; + } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = addr6; inp->in6p_faddr = sin6->sin6_addr; @@ -1178,6 +1179,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); + INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. 
*/ while (tp->request_r_scale < TCP_MAX_WINSHIFT && @@ -1192,6 +1194,10 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tcp_sendseqinit(tp); return 0; + +out: + INP_HASH_WUNLOCK(&V_tcbinfo); + return error; } #endif /* INET6 */ diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index c3503e6..28eb8fd 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -2,8 +2,12 @@ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -182,7 +186,8 @@ udp_init(void) { in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE); + "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); @@ -253,7 +258,7 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, #endif struct udpcb *up; - INP_RLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. @@ -458,12 +463,12 @@ udp_input(struct mbuf *m, int off) } #endif - INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; + INP_INFO_RLOCK(&V_udbinfo); last = NULL; LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) @@ -485,6 +490,13 @@ udp_input(struct mbuf *m, int off) INP_RLOCK(inp); /* + * XXXRW: Because we weren't holding either the inpcb + * or the hash lock when we checked for a match + * before, we should probably recheck now that the + * inpcb lock is held. + */ + + /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ @@ -542,7 +554,10 @@ udp_input(struct mbuf *m, int off) * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); - goto badheadlocked; + if (inp) + INP_RUNLOCK(inp); + INP_INFO_RUNLOCK(&V_udbinfo); + goto badunlocked; } udp_append(last, ip, m, iphlen, &udp_in); INP_RUNLOCK(last); @@ -553,8 +568,9 @@ udp_input(struct mbuf *m, int off) /* * Locate pcb for datagram. */ - inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, 1, ifp); + inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, + ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; @@ -568,36 +584,31 @@ udp_input(struct mbuf *m, int off) UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); - goto badheadlocked; + goto badunlocked; } if (V_udp_blackhole) - goto badheadlocked; + goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) - goto badheadlocked; + goto badunlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Check the minimum TTL for socket. 
*/ - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); - goto badunlocked; + m_freem(m); + return; } udp_append(inp, ip, m, iphlen, &udp_in); INP_RUNLOCK(inp); return; -badheadlocked: - if (inp) - INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); badunlocked: m_freem(m); } @@ -656,17 +667,15 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, 0, NULL); + inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); + INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); } else in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); @@ -756,9 +765,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); @@ -799,12 +808,11 @@ udp_getcred(SYSCTL_HANDLER_ARGS) error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, - addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, + INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -812,10 +820,8 @@ udp_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_udbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -924,6 +930,9 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) } #ifdef INET +#define UH_WLOCKED 2 +#define UH_RLOCKED 1 +#define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) @@ -1016,29 +1025,27 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. + * + * XXXRW: Check that hash locking update here is correct. 
*/ sin = (struct sockaddr_in *)addr; INP_RLOCK(inp); if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); - unlock_udbinfo = 2; + INP_HASH_WLOCK(&V_udbinfo); + unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) { - INP_RUNLOCK(inp); - INP_INFO_RLOCK(&V_udbinfo); - INP_RLOCK(inp); - } - unlock_udbinfo = 1; + INP_HASH_RLOCK(&V_udbinfo); + unlock_udbinfo = UH_RLOCKED; } else - unlock_udbinfo = 0; + unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the @@ -1048,7 +1055,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_INFO_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(&V_udbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { @@ -1099,7 +1106,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_INFO_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(&V_udbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); @@ -1113,8 +1120,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { - INP_INFO_WLOCK_ASSERT(&V_udbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(&V_udbinfo); /* * Remember addr if jailed, to prevent * rebinding. 
@@ -1209,25 +1216,25 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ UDPSTAT_INC(udps_opackets); - if (unlock_udbinfo == 2) - INP_INFO_WUNLOCK(&V_udbinfo); - else if (unlock_udbinfo == 1) - INP_INFO_RUNLOCK(&V_udbinfo); + if (unlock_udbinfo == UH_WLOCKED) + INP_HASH_WUNLOCK(&V_udbinfo); + else if (unlock_udbinfo == UH_RLOCKED) + INP_HASH_RUNLOCK(&V_udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); - if (unlock_udbinfo == 2) + if (unlock_udbinfo == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: - if (unlock_udbinfo == 2) { + if (unlock_udbinfo == UH_WLOCKED) { + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); - } else if (unlock_udbinfo == 1) { + } else if (unlock_udbinfo == UH_RLOCKED) { + INP_HASH_RUNLOCK(&V_udbinfo); INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); } else INP_RUNLOCK(inp); m_freem(m); @@ -1376,15 +1383,15 @@ udp_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -1453,11 +1460,11 @@ udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -1468,15 +1475,15 @@ udp_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -1488,25 +1495,23 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -1538,21 +1543,19 @@ udp_disconnect(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (ENOTCONN); } - + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (0); } |
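The consumer-visible pattern behind most of the hunks above is the replacement of in_pcblookup_hash()/in6_pcblookup_hash() calls bracketed by the global pcbinfo lock with in_pcblookup()/in6_pcblookup(), which return the inpcb already locked as requested by INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB. A before/after sketch using udp_ctlinput() as the model (condensed from the diff, not quoted verbatim):

	/*
	 * Old pattern, removed by this merge: global read lock around the
	 * hash lookup, then lock the returned inpcb separately.
	 */
	INP_INFO_RLOCK(&V_udbinfo);
	inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport,
	    ip->ip_src, uh->uh_sport, 0, NULL);
	if (inp != NULL) {
		INP_RLOCK(inp);
		if (inp->inp_socket != NULL)
			udp_notify(inp, inetctlerrmap[cmd]);
		INP_RUNLOCK(inp);
	}
	INP_INFO_RUNLOCK(&V_udbinfo);

	/*
	 * New pattern: the lookup returns a read-locked inpcb (or NULL),
	 * and the global pcbinfo lock is not taken at all.
	 */
	inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport,
	    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
	if (inp != NULL) {
		INP_RLOCK_ASSERT(inp);
		if (inp->inp_socket != NULL)
			udp_notify(inp, inetctlerrmap[cmd]);
		INP_RUNLOCK(inp);
	}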