diff options
author | rwatson <rwatson@FreeBSD.org> | 2008-07-15 15:38:47 +0000 |
---|---|---|
committer | rwatson <rwatson@FreeBSD.org> | 2008-07-15 15:38:47 +0000 |
commit | 6d9661b22428319f8d87375a3d6f69afda13469a (patch) | |
tree | 9e83efd11a5d307fd93a89b13bf38f242e705cf1 /sys/netinet | |
parent | 2d49f66781d711152d390cc98a26f9b781a5255a (diff) | |
download | FreeBSD-src-6d9661b22428319f8d87375a3d6f69afda13469a.zip FreeBSD-src-6d9661b22428319f8d87375a3d6f69afda13469a.tar.gz |
Merge last of a series of rwlock conversion changes to UDP, which
completes the move to a fully parallel UDP transmit path by using
global read, rather than write, locking of inpcbinfo in further
semi-connected cases:
- Add macros to allow try-locking of inpcb and inpcbinfo.
- Always acquire an incpcb read lock in udp_output(), which stablizes the
local inpcb address and port bindings in order to determine what further
locking is required:
- If the inpcb is currently not bound (at all) and are implicitly
connecting, we require inpcbinfo and inpcb write locks, so drop the
read lock and re-acquire.
- If the inpcb is bound for at least one of the port or address, but an
explicit source or destination is requested, trylock the inpcbinfo
lock, and if that fails, drop the inpcb lock, lock the global lock,
and relock the inpcb lock.
- Otherwise, no further locking is required (common case).
- Update comments.
In practice, this means that the vast majority of consumers of UDP sockets
will not acquire any exclusive locks at the socket or UDP levels of the
network stack. This leads to a marked performance improvement in several
important workloads, including BIND, nsd, and memcached over UDP, as well
as significant improvements in pps microbenchmarks.
The plan is to MFC all of the rwlock changes to RELENG_7 once they have
settled for a weeks in the tree.
Tested by: ps, kris (older revision), bde
MFC after: 3 weeks
Diffstat (limited to 'sys/netinet')
-rw-r--r-- | sys/netinet/in_pcb.h | 4 | ||||
-rw-r--r-- | sys/netinet/udp_usrreq.c | 119 |
2 files changed, 89 insertions, 34 deletions
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index d412843..d3939ee 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -315,6 +315,8 @@ struct inpcbinfo { #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) +#define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) +#define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) #define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) @@ -356,6 +358,8 @@ inp_unlock_assert(struct inpcb *inp __unused) #define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) #define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) #define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) #define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 1cce109..5c3698f 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -846,14 +846,42 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, return (error); } - if (src.sin_family == AF_INET || addr != NULL) { + /* + * Depending on whether or not the application has bound or connected + * the application, we may have to do varying levels of work. The + * optimal case is for a connected UDP socket, as a global lock isn't + * required. + * + * In order to decide which we need, we require stability of the + * inpcb binding, which we ensure by acquiring a read lock on the + * inpcb. This doesn't strictly follow the lock order, so we play + * the trylock and retry game; note that we may end up with more + * conservative locks than required the second time around, so later + * assertions have to accept that. Further analysis of the number of + * misses under contention is required. + */ + sin = (struct sockaddr_in *)addr; + INP_RLOCK(inp); + if (sin != NULL && + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + INP_RUNLOCK(inp); INP_INFO_WLOCK(&udbinfo); INP_WLOCK(inp); + unlock_udbinfo = 2; + } else if ((sin != NULL && ( + (sin->sin_addr.s_addr == INADDR_ANY) || + (sin->sin_addr.s_addr == INADDR_BROADCAST) || + (inp->inp_laddr.s_addr == INADDR_ANY) || + (inp->inp_lport == 0))) || + (src.sin_family == AF_INET)) { + if (!INP_INFO_TRY_RLOCK(&udbinfo)) { + INP_RUNLOCK(inp); + INP_INFO_RLOCK(&udbinfo); + INP_RLOCK(inp); + } unlock_udbinfo = 1; - } else { + } else unlock_udbinfo = 0; - INP_RLOCK(inp); - } /* * If the IP_SENDSRCADDR control message was specified, override the @@ -880,12 +908,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * - * If a UDP socket has not been connect to, then an explicit + * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ - if (addr) { - INP_INFO_LOCK_ASSERT(&udbinfo); + if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; @@ -896,39 +923,58 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * Jail may rewrite the destination address, so let it do * that before we use it. */ - sin = (struct sockaddr_in *)addr; if (jailed(td->td_ucred)) prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); /* - * If a local address or port hasn't yet been selected, do - * that now. Once a port is selected, we commit the binding - * back to the socket; we also commit the binding of the - * address if in jail. + * If a local address or port hasn't yet been selected, or if + * the destination address needs to be rewritten due to using + * a special INADDR_ constant, invoke in_pcbconnect_setup() + * to do the heavy lifting. Once a port is selected, we + * commit the binding back to the socket; we also commit the + * binding of the address if in jail. + * + * If we already have a valid binding and we're not + * requesting a destination address rewrite, use a fast path. */ - error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, - &faddr.s_addr, &fport, NULL, td->td_ucred); - if (error) - goto release; + if (inp->inp_laddr.s_addr == INADDR_ANY || + inp->inp_lport == 0 || + sin->sin_addr.s_addr == INADDR_ANY || + sin->sin_addr.s_addr == INADDR_BROADCAST) { + INP_INFO_LOCK_ASSERT(&udbinfo); + error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, + &lport, &faddr.s_addr, &fport, NULL, + td->td_ucred); + if (error) + goto release; - /* Commit the local port if newly assigned. */ - if (inp->inp_laddr.s_addr == INADDR_ANY && - inp->inp_lport == 0) { - INP_INFO_WLOCK_ASSERT(&udbinfo); - INP_WLOCK_ASSERT(inp); /* - * Remember addr if jailed, to prevent rebinding. + * XXXRW: Why not commit the port if the address is + * !INADDR_ANY? */ - if (jailed(td->td_ucred)) - inp->inp_laddr = laddr; - inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { - inp->inp_lport = 0; - error = EAGAIN; - goto release; + /* Commit the local port if newly assigned. */ + if (inp->inp_laddr.s_addr == INADDR_ANY && + inp->inp_lport == 0) { + INP_INFO_WLOCK_ASSERT(&udbinfo); + INP_WLOCK_ASSERT(inp); + /* + * Remember addr if jailed, to prevent + * rebinding. + */ + if (jailed(td->td_ucred)) + inp->inp_laddr = laddr; + inp->inp_lport = lport; + if (in_pcbinshash(inp) != 0) { + inp->inp_lport = 0; + error = EAGAIN; + goto release; + } + inp->inp_flags |= INP_ANONPORT; } - inp->inp_flags |= INP_ANONPORT; + } else { + faddr = sin->sin_addr; + fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); @@ -1006,20 +1052,25 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; - if (unlock_udbinfo) + if (unlock_udbinfo == 2) INP_INFO_WUNLOCK(&udbinfo); + else if (unlock_udbinfo == 1) + INP_INFO_RUNLOCK(&udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); - if (unlock_udbinfo) + if (unlock_udbinfo == 2) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: - if (unlock_udbinfo) { - INP_INFO_WUNLOCK(&udbinfo); + if (unlock_udbinfo == 2) { INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + } else if (unlock_udbinfo == 1) { + INP_RUNLOCK(inp); + INP_INFO_RUNLOCK(&udbinfo); } else INP_RUNLOCK(inp); m_freem(m); |