diff options
author | jch <jch@FreeBSD.org> | 2016-11-24 14:48:46 +0000 |
---|---|---|
committer | jch <jch@FreeBSD.org> | 2016-11-24 14:48:46 +0000 |
commit | 7f3fe3a98ec981ac0d1481bb5e4cb31ef9ac8a0a (patch) | |
tree | 3cfb7702bb0aeaf23d9b6562d3f81192fd2bc65b | |
parent | 16165b4c35ba2f47480383759cd08703a8515756 (diff) | |
download | FreeBSD-src-7f3fe3a98ec981ac0d1481bb5e4cb31ef9ac8a0a.zip FreeBSD-src-7f3fe3a98ec981ac0d1481bb5e4cb31ef9ac8a0a.tar.gz |
MFC r286227, r286443:
r286227:
Decompose TCP INP_INFO lock to increase short-lived TCP connections scalability:
- The existing TCP INP_INFO lock continues to protect the global inpcb list
stability during full list traversal (e.g. tcp_pcblist()).
- A new INP_LIST lock protects inpcb list actual modifications (inp allocation
and free) and inpcb global counters.
It allows to use TCP INP_INFO_RLOCK lock in critical paths (e.g. tcp_input())
and INP_INFO_WLOCK only in occasional operations that walk all connections.
PR: 183659
Differential Revision: https://reviews.freebsd.org/D2599
Reviewed by: jhb, adrian
Tested by: adrian, nitroboost-gmail.com
Sponsored by: Verisign, Inc.
r286443:
Fix a kernel assertion issue introduced with r286227:
Avoid too strict INP_INFO_RLOCK_ASSERT checks due to
tcp_notify() being called from in6_pcbnotify().
Reported by: Larry Rosenman <ler@lerctr.org>
Submitted by: markj, jch
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 30 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_listen.c | 28 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_connect.c | 4 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_cpl_io.c | 20 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_listen.c | 30 | ||||
-rw-r--r-- | sys/netinet/in_pcb.c | 43 | ||||
-rw-r--r-- | sys/netinet/in_pcb.h | 98 | ||||
-rw-r--r-- | sys/netinet/tcp_input.c | 118 | ||||
-rw-r--r-- | sys/netinet/tcp_subr.c | 50 | ||||
-rw-r--r-- | sys/netinet/tcp_syncache.c | 22 | ||||
-rw-r--r-- | sys/netinet/tcp_timer.c | 46 | ||||
-rw-r--r-- | sys/netinet/tcp_timewait.c | 39 | ||||
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 49 | ||||
-rw-r--r-- | sys/netinet/toecore.c | 6 | ||||
-rw-r--r-- | sys/netinet6/in6_pcb.c | 4 |
15 files changed, 340 insertions, 247 deletions
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c index e04bf20..48474da 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -638,7 +638,7 @@ t3_send_fin(struct toedev *tod, struct tcpcb *tp) unsigned int tid = toep->tp_tid; #endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep, @@ -924,12 +924,12 @@ do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) rc = act_open_rpl_status_to_errno(s); if (rc != EAGAIN) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, rc); toepcb_release(toep); /* unlocks inp */ if (rc != EAGAIN) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1060,7 +1060,7 @@ send_reset(struct toepcb *toep) struct adapter *sc = tod->tod_softc; struct mbuf *m; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep, @@ -1171,12 +1171,12 @@ do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) SOCKBUF_UNLOCK(so_rcv); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1221,7 +1221,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) struct tcpcb *tp; struct socket *so; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1249,7 +1249,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ @@ -1263,7 +1263,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1284,7 +1284,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) struct tcpcb *tp; struct socket *so; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1302,7 +1302,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ @@ -1327,7 +1327,7 @@ release: done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1488,7 +1488,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) return (do_abort_req_synqe(qs, r, m)); inp = toep->tp_inp; - INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ + INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1502,7 +1502,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) toep->tp_flags |= TP_ABORT_REQ_RCVD; toep->tp_flags |= TP_ABORT_SHUTDOWN; INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } @@ -1522,7 +1522,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) INP_WLOCK(inp); /* re-acquire */ toepcb_release(toep); /* no more CPLs expected */ } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); send_abort_rpl(tod, tid, qset); m_freem(m); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c index 8148ae4..c992be4 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -553,11 +553,11 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */ } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); REJECT_PASS_ACCEPT(); } @@ -570,7 +570,7 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) * resources tied to this listen context. */ INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); REJECT_PASS_ACCEPT(); } so = inp->inp_socket; @@ -698,7 +698,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) struct toepcb *toep; struct socket *so; struct listen_ctx *lctx = synqe->lctx; - struct inpcb *inp = lctx->inp; + struct inpcb *inp = lctx->inp, *new_inp; struct tcpopt to; struct tcphdr th; struct in_conninfo inc; @@ -712,7 +712,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) KASSERT(qs->idx == synqe->qset, ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset)); - INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */ + INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */ INP_WLOCK(inp); if (__predict_false(inp->inp_flags & INP_DROPPED)) { @@ -726,7 +726,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) ("%s: listen socket dropped but tid %u not aborted.", __func__, tid)); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } @@ -742,7 +742,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) reset: t3_send_reset_synqe(tod, synqe); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } @@ -760,21 +760,23 @@ reset: goto reset; } - if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) { - struct inpcb *new_inp = sotoinpcb(so); + /* New connection inpcb is already locked by syncache_expand(). */ + new_inp = sotoinpcb(so); + INP_WLOCK_ASSERT(new_inp); - INP_WLOCK(new_inp); + if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t3_offload_socket(tod, synqe, so); - INP_WUNLOCK(new_inp); } + INP_WUNLOCK(new_inp); + /* Remove the synq entry and release its reference on the lctx */ TAILQ_REMOVE(&lctx->synq, synqe, link); inp = release_lctx(td, lctx); if (inp) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); release_synqe(synqe); m_freem(m); @@ -1140,7 +1142,7 @@ t3_offload_socket(struct toedev *tod, void *arg, struct socket *so) struct cpl_pass_establish *cpl = synqe->cpl; struct toepcb *toep = synqe->toep; - INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); offload_socket(so, toep); diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index 80d2a20..85b9d97 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -179,12 +179,12 @@ act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) toep->tid = -1; if (status != EAGAIN) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } static int diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 328210c..8756880 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -1086,7 +1086,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1128,7 +1128,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); final_cpl_received(toep); @@ -1140,7 +1140,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1167,7 +1167,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1185,7 +1185,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); final_cpl_received(toep); /* no more CPLs expected */ @@ -1209,7 +1209,7 @@ release: } done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1368,7 +1368,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } inp = toep->inp; - INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ + INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1402,7 +1402,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) final_cpl_received(toep); done: - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } @@ -1520,12 +1520,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 317bf8a..11ad4e2 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -931,7 +931,7 @@ t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); struct toepcb *toep = *(struct toepcb **)(cpl + 1); - INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); @@ -1332,15 +1332,15 @@ found: REJECT_PASS_ACCEPT(); rpl = wrtod(wr); - INP_INFO_WLOCK(&V_tcbinfo); /* for 4-tuple check */ + INP_INFO_RLOCK(&V_tcbinfo); /* for 4-tuple check */ /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); free(wr, M_CXGBE); REJECT_PASS_ACCEPT(); } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); inp = lctx->inp; /* listening socket, not owned by TOE */ INP_WLOCK(inp); @@ -1398,7 +1398,7 @@ found: hold_synqe(synqe); /* hold for the duration it's in the synq */ hold_lctx(lctx); /* A synqe on the list has a ref on its lctx */ - /* + /* * If all goes well t4_syncache_respond will get called during * syncache_add. Note that syncache_add releases the pcb lock. */ @@ -1516,7 +1516,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; - struct inpcb *inp = lctx->inp; + struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; @@ -1534,7 +1534,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); - INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */ + INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, @@ -1550,7 +1550,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1575,7 +1575,7 @@ reset: */ send_reset_synqe(TOEDEV(ifp), synqe); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } toep->tid = tid; @@ -1609,6 +1609,10 @@ reset: goto reset; } + /* New connection inpcb is already locked by syncache_expand(). */ + new_inp = sotoinpcb(so); + INP_WLOCK_ASSERT(new_inp); + /* * This is for the unlikely case where the syncache entry that we added * has been evicted from the syncache, but the syncache_expand above @@ -1619,20 +1623,18 @@ reset: * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { - struct inpcb *new_inp = sotoinpcb(so); - - INP_WLOCK(new_inp); tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); - INP_WUNLOCK(new_inp); } + INP_WUNLOCK(new_inp); + /* Done with the synqe */ TAILQ_REMOVE(&lctx->synq, synqe, link); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); release_synqe(synqe); return (0); diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 0b296e0..387e866 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -215,6 +215,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ + INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif @@ -253,6 +254,7 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); + INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } @@ -267,7 +269,14 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) struct inpcb *inp; int error; - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif + error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) @@ -299,6 +308,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif + INP_WLOCK(inp); + INP_LIST_WLOCK(pcbinfo); LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; @@ -306,9 +317,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif - INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ + INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(MAC) out: if (error != 0) { @@ -1194,7 +1205,13 @@ in_pcbfree(struct inpcb *inp) KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_LOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ @@ -1202,8 +1219,10 @@ in_pcbfree(struct inpcb *inp) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif + INP_LIST_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); + INP_LIST_WUNLOCK(pcbinfo); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); @@ -1360,7 +1379,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) struct ip_moptions *imo; int i, gap; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; @@ -1390,7 +1409,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_WUNLOCK(pcbinfo); } /* @@ -2023,8 +2042,16 @@ in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif + INP_WLOCK_ASSERT(inp); + INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { @@ -2169,13 +2196,13 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 3292c6a..f23af58 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -128,23 +128,35 @@ struct in_conninfo { struct icmp6_filter; /*- - * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 - * and IPv6 sockets. In the case of TCP, further per-connection state is + * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and + * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A - * few fields also require the global pcbinfo lock for the inpcb to be held, - * when modified, such as the global connection lists and hashes, as well as - * binding information (which affects which hash a connection is on). This - * model means that connections can be looked up without holding the - * per-connection lock, which is important for performance when attempting to - * find the connection for a packet given its IP and port tuple. Writing to - * these fields that write locks be held on both the inpcb and global locks. + * few fields are protected by multiple locks as indicated in the locking notes + * below. For these fields, all of the listed locks must be write-locked for + * any modifications. However, these fields can be safely read while any one of + * the listed locks are read-locked. This model can permit greater concurrency + * for read operations. For example, connections can be looked up while only + * holding a read lock on the global pcblist lock. This is important for + * performance when attempting to find the connection for a packet given its IP + * and port tuple. + * + * One noteworthy exception is that the global pcbinfo lock follows a different + * set of rules in relation to the inp_list field. Rather than being + * write-locked for modifications and read-locked for list iterations, it must + * be read-locked during modifications and write-locked during list iterations. + * This ensures that the relatively rare global list iterations safely walk a + * stable snapshot of connections while allowing more common list modifications + * to safely grab the pcblist lock just while adding or removing a connection + * from the global list. * * Key: * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb + * (l) - Protected by the pcblist lock for the inpcb + * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * @@ -159,15 +171,21 @@ struct icmp6_filter; * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). + * + * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock + * read-lock usage during modification, this model can be applied to other + * protocols (especially SCTP). */ struct inpcb { - LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ - LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ + LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ + /* (p[w]) for list iteration */ + /* (p[r]/l) for addition/removal */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ - LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ + LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -185,7 +203,7 @@ struct inpcb { * general use */ /* Local and foreign ports, local and foreign addr. */ - struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ + struct in_conninfo inp_inc; /* (i) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ @@ -210,8 +228,8 @@ struct inpcb { int inp6_cksum; short inp6_hops; } inp_depend6; - LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ - struct inpcbport *inp_phd; /* (i/p) head of this list */ + LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ + struct inpcbport *inp_phd; /* (i/h) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ struct llentry *inp_lle; /* cached L2 information */ @@ -275,37 +293,46 @@ struct inpcbport { * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * - * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, - * the former covering mutable global fields (such as the global pcb list), - * and the latter covering the hashed lookup tables. The lock order is: + * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and + * ipi_list_lock: + * - ipi_lock covering the global pcb list stability during loop iteration, + * - ipi_hash_lock covering the hashed lookup tables, + * - ipi_list_lock covering mutable global fields (such as the global + * pcb list) + * + * The lock order is: * - * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} + * ipi_lock (before) + * inpcb locks (before) + * ipi_list locks (before) + * {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock + * (l) Locked by ipi_list_lock * (h) Read using either ipi_hash_lock or inpcb lock; write requires both * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting global inpcb list, inpcb count, etc. + * Global lock protecting full inpcb list traversal */ struct rwlock ipi_lock; /* * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_listhead; /* (g) */ - u_int ipi_count; /* (g) */ + struct inpcbhead *ipi_listhead; /* (g/l) */ + u_int ipi_count; /* (l) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ - u_quad_t ipi_gencnt; /* (g) */ + u_quad_t ipi_gencnt; /* (l) */ /* * Fields associated with port lookup and allocation. @@ -363,6 +390,11 @@ struct inpcbinfo { * general use 2 */ void *ipi_pspare[2]; + + /* + * Global lock protecting global inpcb list, inpcb count, etc. + */ + struct rwlock ipi_list_lock; }; #ifdef _KERNEL @@ -455,6 +487,7 @@ short inp_so_options(const struct inpcb *inp); #define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) +#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock) #define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) @@ -462,6 +495,25 @@ short inp_so_options(const struct inpcb *inp); #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_LIST_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) +#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock) +#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) +#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) +#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) +#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) +#define INP_LIST_LOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) +#define INP_LIST_RLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) +#define INP_LIST_WLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) +#define INP_LIST_UNLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) + #define INP_HASH_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) #define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 174670a..254454f 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -571,7 +571,7 @@ tcp_input(struct mbuf *m, int off0) char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 -#define TI_WLOCKED 2 +#define TI_RLOCKED 2 #ifdef TCPDEBUG /* @@ -757,8 +757,8 @@ tcp_input(struct mbuf *m, int off0) * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; @@ -780,8 +780,8 @@ tcp_input(struct mbuf *m, int off0) findpcb: #ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } @@ -941,11 +941,11 @@ findpcb: relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; @@ -956,9 +956,9 @@ relocked: goto findpcb; } } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); @@ -967,7 +967,7 @@ relocked: */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return; } /* @@ -998,17 +998,17 @@ relocked: */ #ifdef INVARIANTS if ((thflags & (TH_FIN | TH_RST)) != 0) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); #endif if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; @@ -1020,9 +1020,9 @@ relocked: } goto relocked; } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC @@ -1077,7 +1077,7 @@ relocked: */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1132,7 +1132,11 @@ new_tfo_socket: */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); - INP_WLOCK(inp); /* new connection */ + /* + * New connection inpcb is already locked by + * syncache_expand(). + */ + INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); @@ -1368,8 +1372,8 @@ new_tfo_socket: * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); @@ -1418,8 +1422,8 @@ new_tfo_socket: dropwithreset: TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1442,8 +1446,8 @@ dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1501,13 +1505,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " + KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); @@ -1676,8 +1680,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1780,8 +1784,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -2039,9 +2043,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " + KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2169,10 +2173,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: - KASSERT(ti_locked == TI_WLOCKED, + KASSERT(ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_RST 1 ti_locked %d", ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_state_change(tp, TCPS_CLOSED); TCPSTAT_INC(tcps_drops); @@ -2181,10 +2185,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, case TCPS_CLOSING: case TCPS_LAST_ACK: - KASSERT(ti_locked == TI_WLOCKED, + KASSERT(ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_RST 2 ti_locked %d", ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); break; @@ -2288,9 +2292,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " + KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " @@ -2364,9 +2368,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * error and we send an RST and drop the connection. */ if (thflags & TH_SYN) { - KASSERT(ti_locked == TI_WLOCKED, + KASSERT(ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; @@ -2902,9 +2906,9 @@ process_ACK: */ case TCPS_CLOSING: if (ourfinisacked) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } @@ -2918,7 +2922,7 @@ process_ACK: */ case TCPS_LAST_ACK: if (ourfinisacked) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } @@ -3137,18 +3141,18 @@ dodata: /* XXX */ * standard timers. */ case TCPS_FIN_WAIT_2: - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return; } } - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -3203,8 +3207,8 @@ dropafterack: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; @@ -3214,8 +3218,8 @@ dropafterack: return; dropwithreset: - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -3226,8 +3230,8 @@ dropwithreset: return; drop: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b3c7309..28af834 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -956,7 +956,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code @@ -986,7 +986,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -1004,7 +1004,7 @@ tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { @@ -1171,7 +1171,7 @@ tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type) struct inpcb *inp; CURVNET_SET(tp->t_vnet); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); @@ -1186,13 +1186,13 @@ tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type) tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -1206,7 +1206,7 @@ tcp_close(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD @@ -1265,7 +1265,7 @@ tcp_drain(void) * where we're really low on mbufs, this is potentially * useful. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; @@ -1276,7 +1276,7 @@ tcp_drain(void) } INP_WUNLOCK(inpb); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -1295,7 +1295,7 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -1359,10 +1359,10 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_LIST_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); @@ -1385,7 +1385,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); @@ -1410,7 +1410,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; @@ -1448,14 +1448,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* @@ -1465,11 +1465,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); @@ -1630,7 +1630,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { @@ -1690,7 +1690,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, th); } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } else in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } @@ -1763,9 +1763,9 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_flags |= INC_ISIPV6; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); @@ -1898,7 +1898,7 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -2422,7 +2422,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) default: return (EINVAL); } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: @@ -2461,7 +2461,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) INP_WUNLOCK(inp); } else error = ESRCH; - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 10cb03b..2ce36d9 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -665,6 +665,8 @@ done: /* * Build a new TCP socket structure from a syncache entry. + * + * On success return the newly created socket with its underlying inp locked. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) @@ -675,7 +677,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) int error; char *s; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up @@ -706,6 +708,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); + /* + * Exclusive pcbinfo lock is not required in syncache socket case even + * if two inpcb locks can be acquired simultaneously: + * - the inpcb in LISTEN state, + * - the newly created inp. + * + * In this case, an inp cannot be at same time in LISTEN state and + * just created by an accept() call. + */ INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ @@ -920,8 +931,6 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - INP_WUNLOCK(inp); - TCPSTAT_INC(tcps_accepts); return (so); @@ -939,6 +948,9 @@ abort2: * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. + * + * On syncache_socket() success the newly created socket + * has its underlying inp locked. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, @@ -953,7 +965,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); @@ -1105,7 +1117,7 @@ syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; *lsop = syncache_socket(sc, *lsop, m); diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 776ff4e..78f8571 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -266,7 +266,7 @@ tcp_timer_2msl(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); @@ -274,14 +274,14 @@ tcp_timer_2msl(void *xtp) if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -304,7 +304,7 @@ tcp_timer_2msl(void *xtp) */ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -330,7 +330,7 @@ tcp_timer_2msl(void *xtp) #endif if (tp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -346,21 +346,21 @@ tcp_timer_keep(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -414,7 +414,7 @@ tcp_timer_keep(void *xtp) PRU_SLOWTIMO); #endif INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; @@ -429,7 +429,7 @@ dropit: #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -444,21 +444,21 @@ tcp_timer_persist(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -507,7 +507,7 @@ out: #endif if (tp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -556,22 +556,6 @@ tcp_timer_rexmt(void * xtp) if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - in_pcbref(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); - INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); @@ -779,7 +763,7 @@ out: if (tp != NULL) INP_WUNLOCK(inp); if (headlocked) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 8139c6d..73d23d8 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -201,10 +201,10 @@ tcp_tw_destroy(void) { struct tcptw *tw; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); @@ -227,7 +227,7 @@ tcp_twstart(struct tcpcb *tp) int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* A dropped inp should never transition to TIME_WAIT state. */ @@ -268,8 +268,8 @@ tcp_twstart(struct tcpcb *tp) * allowed. Remove a connection from TIMEWAIT queue in LRU * fashion to make room for this connection. * - * pcbinfo lock is needed here to prevent deadlock as - * two inpcb locks can be acquired simultaneously. + * XXX: Check if it possible to always have enough room + * in advance based on guarantees provided by uma_zalloc(). */ tw = tcp_tw_2msl_scan(1); if (tw == NULL) { @@ -380,7 +380,7 @@ tcp_twrecycleable(struct tcptw *tw) tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); @@ -403,7 +403,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, int thflags; tcp_seq seq; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* @@ -504,7 +504,7 @@ tcp_twclose(struct tcptw *tw, int reuse) inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); tcp_tw_2msl_stop(tw, reuse); @@ -659,7 +659,7 @@ static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); TW_WLOCK(V_tw_lock); @@ -677,7 +677,7 @@ tcp_tw_2msl_stop(struct tcptw *tw, int reuse) struct inpcb *inp; int released; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TW_WLOCK(V_tw_lock); inp = tw->tw_inpcb; @@ -707,13 +707,18 @@ tcp_tw_2msl_scan(int reuse) #ifdef INVARIANTS if (reuse) { /* - * pcbinfo lock is needed in reuse case to prevent deadlock - * as two inpcb locks can be acquired simultaneously: + * Exclusive pcbinfo lock is not required in reuse case even if + * two inpcb locks can be acquired simultaneously: * - the inpcb transitioning to TIME_WAIT state in * tcp_tw_start(), * - the inpcb closed by tcp_twclose(). + * + * It is because only inpcbs in FIN_WAIT2 or CLOSING states can + * transition in TIME_WAIT state. Then a pcbcb cannot be in + * TIME_WAIT list and transitioning to TIME_WAIT state at same + * time. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #endif @@ -731,26 +736,26 @@ tcp_tw_2msl_scan(int reuse) in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { INP_WLOCK(inp); tw = intotw(inp); if (in_pcbrele_wlocked(inp)) { KASSERT(tw == NULL, ("%s: held last inp " "reference but tw not NULL", __func__)); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); continue; } if (tw == NULL) { /* tcp_twclose() has already been called */ INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); continue; } tcp_twclose(tw, reuse); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (reuse) return tw; } else { diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 131eb28..5a875ae 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -166,7 +166,7 @@ tcp_detach(struct socket *so, struct inpcb *inp) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); @@ -195,10 +195,10 @@ tcp_detach(struct socket *so, struct inpcb *inp) * and leave inpcb present until timewait ends * #2 tcp_detach is called at timewait end (or reuse) by * tcp_twclose, then the tcptw has already been discarded - * and inpcb is freed here + * (or reused) and inpcb is freed here * #3 tcp_detach is called() after timewait ends (or reuse) * (e.g. by soclose), then tcptw has already been discarded - * and inpcb is freed here + * (or reused) and inpcb is freed here * * In all three cases the tcptw should not be freed here. */ @@ -260,15 +260,20 @@ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; + int rlock = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + if (!INP_INFO_WLOCKED(&V_tcbinfo)) { + INP_INFO_RLOCK(&V_tcbinfo); + rlock = 1; + } INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + if (rlock) + INP_INFO_RUNLOCK(&V_tcbinfo); } #ifdef INET @@ -638,7 +643,7 @@ tcp_usr_disconnect(struct socket *so) int error = 0; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); @@ -654,7 +659,7 @@ tcp_usr_disconnect(struct socket *so) out: TCPDEBUG2(PRU_DISCONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -769,7 +774,7 @@ tcp_usr_shutdown(struct socket *so) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); @@ -787,7 +792,7 @@ tcp_usr_shutdown(struct socket *so) out: TCPDEBUG2(PRU_SHUTDOWN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -861,7 +866,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * this call. */ if (flags & PRUS_EOF) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -918,7 +923,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * Close the send side of the connection after * the data is sent. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } @@ -982,7 +987,7 @@ out: ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -999,7 +1004,7 @@ tcp_usr_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); @@ -1021,7 +1026,7 @@ tcp_usr_abort(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } /* @@ -1037,7 +1042,7 @@ tcp_usr_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); @@ -1060,7 +1065,7 @@ tcp_usr_close(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } /* @@ -1721,10 +1726,10 @@ tcp_attach(struct socket *so) } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } inp = sotoinpcb(so); @@ -1740,12 +1745,12 @@ tcp_attach(struct socket *so) if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1763,7 +1768,7 @@ tcp_disconnect(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* @@ -1801,7 +1806,7 @@ static void tcp_usrclosed(struct tcpcb *tp) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c index 1adaf4d..60e5fd4 100644 --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -338,7 +338,7 @@ toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); return (syncache_expand(inc, to, th, lsop, NULL)); } @@ -369,7 +369,7 @@ toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp) if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* for twcheck */ if (!tcp_twcheck(inp, NULL, th, NULL, 0)) return (EADDRINUSE); } else { @@ -573,7 +573,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err) (void) tcp_output(tp); } else { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_drop(tp, err); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 907496b..139a83d 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -768,7 +768,7 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) struct ip6_moptions *im6o; int i, gap; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); im6o = in6p->in6p_moptions; @@ -799,7 +799,7 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) } INP_WUNLOCK(in6p); } - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_WUNLOCK(pcbinfo); } /* |