summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjch <jch@FreeBSD.org>2016-11-24 14:48:46 +0000
committerjch <jch@FreeBSD.org>2016-11-24 14:48:46 +0000
commit7f3fe3a98ec981ac0d1481bb5e4cb31ef9ac8a0a (patch)
tree3cfb7702bb0aeaf23d9b6562d3f81192fd2bc65b
parent16165b4c35ba2f47480383759cd08703a8515756 (diff)
downloadFreeBSD-src-7f3fe3a98ec981ac0d1481bb5e4cb31ef9ac8a0a.zip
FreeBSD-src-7f3fe3a98ec981ac0d1481bb5e4cb31ef9ac8a0a.tar.gz
MFC r286227, r286443:
r286227: Decompose TCP INP_INFO lock to increase short-lived TCP connections scalability: - The existing TCP INP_INFO lock continues to protect the global inpcb list stability during full list traversal (e.g. tcp_pcblist()). - A new INP_LIST lock protects inpcb list actual modifications (inp allocation and free) and inpcb global counters. It allows to use TCP INP_INFO_RLOCK lock in critical paths (e.g. tcp_input()) and INP_INFO_WLOCK only in occasional operations that walk all connections. PR: 183659 Differential Revision: https://reviews.freebsd.org/D2599 Reviewed by: jhb, adrian Tested by: adrian, nitroboost-gmail.com Sponsored by: Verisign, Inc. r286443: Fix a kernel assertion issue introduced with r286227: Avoid too strict INP_INFO_RLOCK_ASSERT checks due to tcp_notify() being called from in6_pcbnotify(). Reported by: Larry Rosenman <ler@lerctr.org> Submitted by: markj, jch
-rw-r--r--sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c30
-rw-r--r--sys/dev/cxgb/ulp/tom/cxgb_listen.c28
-rw-r--r--sys/dev/cxgbe/tom/t4_connect.c4
-rw-r--r--sys/dev/cxgbe/tom/t4_cpl_io.c20
-rw-r--r--sys/dev/cxgbe/tom/t4_listen.c30
-rw-r--r--sys/netinet/in_pcb.c43
-rw-r--r--sys/netinet/in_pcb.h98
-rw-r--r--sys/netinet/tcp_input.c118
-rw-r--r--sys/netinet/tcp_subr.c50
-rw-r--r--sys/netinet/tcp_syncache.c22
-rw-r--r--sys/netinet/tcp_timer.c46
-rw-r--r--sys/netinet/tcp_timewait.c39
-rw-r--r--sys/netinet/tcp_usrreq.c49
-rw-r--r--sys/netinet/toecore.c6
-rw-r--r--sys/netinet6/in6_pcb.c4
15 files changed, 340 insertions, 247 deletions
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
index e04bf20..48474da 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -638,7 +638,7 @@ t3_send_fin(struct toedev *tod, struct tcpcb *tp)
unsigned int tid = toep->tp_tid;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
@@ -924,12 +924,12 @@ do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
rc = act_open_rpl_status_to_errno(s);
if (rc != EAGAIN)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toe_connect_failed(tod, inp, rc);
toepcb_release(toep); /* unlocks inp */
if (rc != EAGAIN)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1060,7 +1060,7 @@ send_reset(struct toepcb *toep)
struct adapter *sc = tod->tod_softc;
struct mbuf *m;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
@@ -1171,12 +1171,12 @@ do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
SOCKBUF_UNLOCK(so_rcv);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = tcp_drop(tp, ECONNRESET);
if (tp)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1221,7 +1221,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
struct tcpcb *tp;
struct socket *so;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1249,7 +1249,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
case TCPS_FIN_WAIT_2:
tcp_twstart(tp);
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toepcb_release(toep); /* no more CPLs expected */
@@ -1263,7 +1263,7 @@ do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1284,7 +1284,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
struct tcpcb *tp;
struct socket *so;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1302,7 +1302,7 @@ do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
tcp_twstart(tp);
release:
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toepcb_release(toep); /* no more CPLs expected */
@@ -1327,7 +1327,7 @@ release:
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
@@ -1488,7 +1488,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
return (do_abort_req_synqe(qs, r, m));
inp = toep->tp_inp;
- INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1502,7 +1502,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
toep->tp_flags |= TP_ABORT_REQ_RCVD;
toep->tp_flags |= TP_ABORT_SHUTDOWN;
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
}
@@ -1522,7 +1522,7 @@ do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
INP_WLOCK(inp); /* re-acquire */
toepcb_release(toep); /* no more CPLs expected */
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
send_abort_rpl(tod, tid, qset);
m_freem(m);
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
index 8148ae4..c992be4 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -553,11 +553,11 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
/* Don't offload if the 4-tuple is already in use */
if (toe_4tuple_check(&inc, &th, ifp) != 0) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
REJECT_PASS_ACCEPT();
}
@@ -570,7 +570,7 @@ do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
* resources tied to this listen context.
*/
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
REJECT_PASS_ACCEPT();
}
so = inp->inp_socket;
@@ -698,7 +698,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
struct toepcb *toep;
struct socket *so;
struct listen_ctx *lctx = synqe->lctx;
- struct inpcb *inp = lctx->inp;
+ struct inpcb *inp = lctx->inp, *new_inp;
struct tcpopt to;
struct tcphdr th;
struct in_conninfo inc;
@@ -712,7 +712,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
KASSERT(qs->idx == synqe->qset,
("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));
- INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */
INP_WLOCK(inp);
if (__predict_false(inp->inp_flags & INP_DROPPED)) {
@@ -726,7 +726,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
("%s: listen socket dropped but tid %u not aborted.",
__func__, tid));
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
}
@@ -742,7 +742,7 @@ do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
reset:
t3_send_reset_synqe(tod, synqe);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return (0);
}
@@ -760,21 +760,23 @@ reset:
goto reset;
}
- if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
- struct inpcb *new_inp = sotoinpcb(so);
+ /* New connection inpcb is already locked by syncache_expand(). */
+ new_inp = sotoinpcb(so);
+ INP_WLOCK_ASSERT(new_inp);
- INP_WLOCK(new_inp);
+ if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) {
tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
t3_offload_socket(tod, synqe, so);
- INP_WUNLOCK(new_inp);
}
+ INP_WUNLOCK(new_inp);
+
/* Remove the synq entry and release its reference on the lctx */
TAILQ_REMOVE(&lctx->synq, synqe, link);
inp = release_lctx(td, lctx);
if (inp)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
release_synqe(synqe);
m_freem(m);
@@ -1140,7 +1142,7 @@ t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
struct cpl_pass_establish *cpl = synqe->cpl;
struct toepcb *toep = synqe->toep;
- INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
INP_WLOCK_ASSERT(inp);
offload_socket(so, toep);
diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c
index 80d2a20..85b9d97 100644
--- a/sys/dev/cxgbe/tom/t4_connect.c
+++ b/sys/dev/cxgbe/tom/t4_connect.c
@@ -179,12 +179,12 @@ act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status)
toep->tid = -1;
if (status != EAGAIN)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
toe_connect_failed(tod, inp, status);
final_cpl_received(toep); /* unlocks inp */
if (status != EAGAIN)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
static int
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 328210c..8756880 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1086,7 +1086,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1128,7 +1128,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
case TCPS_FIN_WAIT_2:
tcp_twstart(tp);
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
final_cpl_received(toep);
@@ -1140,7 +1140,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
}
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1167,7 +1167,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1185,7 +1185,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
tcp_twstart(tp);
release:
INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
INP_WLOCK(inp);
final_cpl_received(toep); /* no more CPLs expected */
@@ -1209,7 +1209,7 @@ release:
}
done:
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1368,7 +1368,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
}
inp = toep->inp;
- INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */
INP_WLOCK(inp);
tp = intotcpcb(inp);
@@ -1402,7 +1402,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
final_cpl_received(toep);
done:
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
return (0);
}
@@ -1520,12 +1520,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
SOCKBUF_UNLOCK(sb);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
tp = tcp_drop(tp, ECONNRESET);
if (tp)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c
index 317bf8a..11ad4e2 100644
--- a/sys/dev/cxgbe/tom/t4_listen.c
+++ b/sys/dev/cxgbe/tom/t4_listen.c
@@ -931,7 +931,7 @@ t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
struct toepcb *toep = *(struct toepcb **)(cpl + 1);
- INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
INP_WLOCK_ASSERT(inp);
KASSERT(synqe->flags & TPF_SYNQE,
("%s: %p not a synq_entry?", __func__, arg));
@@ -1332,15 +1332,15 @@ found:
REJECT_PASS_ACCEPT();
rpl = wrtod(wr);
- INP_INFO_WLOCK(&V_tcbinfo); /* for 4-tuple check */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for 4-tuple check */
/* Don't offload if the 4-tuple is already in use */
if (toe_4tuple_check(&inc, &th, ifp) != 0) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
free(wr, M_CXGBE);
REJECT_PASS_ACCEPT();
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
inp = lctx->inp; /* listening socket, not owned by TOE */
INP_WLOCK(inp);
@@ -1398,7 +1398,7 @@ found:
hold_synqe(synqe); /* hold for the duration it's in the synq */
hold_lctx(lctx); /* A synqe on the list has a ref on its lctx */
- /*
+ /*
* If all goes well t4_syncache_respond will get called during
* syncache_add. Note that syncache_add releases the pcb lock.
*/
@@ -1516,7 +1516,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
unsigned int tid = GET_TID(cpl);
struct synq_entry *synqe = lookup_tid(sc, tid);
struct listen_ctx *lctx = synqe->lctx;
- struct inpcb *inp = lctx->inp;
+ struct inpcb *inp = lctx->inp, *new_inp;
struct socket *so;
struct tcphdr th;
struct tcpopt to;
@@ -1534,7 +1534,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
KASSERT(synqe->flags & TPF_SYNQE,
("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
- INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */
+ INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */
INP_WLOCK(inp);
CTR6(KTR_CXGBE,
@@ -1550,7 +1550,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1575,7 +1575,7 @@ reset:
*/
send_reset_synqe(TOEDEV(ifp), synqe);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
toep->tid = tid;
@@ -1609,6 +1609,10 @@ reset:
goto reset;
}
+ /* New connection inpcb is already locked by syncache_expand(). */
+ new_inp = sotoinpcb(so);
+ INP_WLOCK_ASSERT(new_inp);
+
/*
* This is for the unlikely case where the syncache entry that we added
* has been evicted from the syncache, but the syncache_expand above
@@ -1619,20 +1623,18 @@ reset:
* this somewhat defeats the purpose of having a tod_offload_socket :-(
*/
if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
- struct inpcb *new_inp = sotoinpcb(so);
-
- INP_WLOCK(new_inp);
tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
t4_offload_socket(TOEDEV(ifp), synqe, so);
- INP_WUNLOCK(new_inp);
}
+ INP_WUNLOCK(new_inp);
+
/* Done with the synqe */
TAILQ_REMOVE(&lctx->synq, synqe, link);
inp = release_lctx(sc, lctx);
if (inp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
release_synqe(synqe);
return (0);
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 0b296e0..387e866 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -215,6 +215,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
INP_INFO_LOCK_INIT(pcbinfo, name);
INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
+ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
@@ -253,6 +254,7 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
in_pcbgroup_destroy(pcbinfo);
#endif
uma_zdestroy(pcbinfo->ipi_zone);
+ INP_LIST_LOCK_DESTROY(pcbinfo);
INP_HASH_LOCK_DESTROY(pcbinfo);
INP_INFO_LOCK_DESTROY(pcbinfo);
}
@@ -267,7 +269,14 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
struct inpcb *inp;
int error;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
+
error = 0;
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
@@ -299,6 +308,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
+ INP_WLOCK(inp);
+ INP_LIST_WLOCK(pcbinfo);
LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
@@ -306,9 +317,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
- INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
+ INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
@@ -1194,7 +1205,13 @@ in_pcbfree(struct inpcb *inp)
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_LOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
INP_WLOCK_ASSERT(inp);
/* XXXRW: Do as much as possible here. */
@@ -1202,8 +1219,10 @@ in_pcbfree(struct inpcb *inp)
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
#endif
+ INP_LIST_WLOCK(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
in_pcbremlists(inp);
+ INP_LIST_WUNLOCK(pcbinfo);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
@@ -1360,7 +1379,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
struct ip_moptions *imo;
int i, gap;
- INP_INFO_RLOCK(pcbinfo);
+ INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
@@ -1390,7 +1409,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(pcbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
}
/*
@@ -2023,8 +2042,16 @@ in_pcbremlists(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
+#ifdef INVARIANTS
+ if (pcbinfo == &V_tcbinfo) {
+ INP_INFO_RLOCK_ASSERT(pcbinfo);
+ } else {
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
+ }
+#endif
+
INP_WLOCK_ASSERT(inp);
+ INP_LIST_WLOCK_ASSERT(pcbinfo);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
if (inp->inp_flags & INP_INHASHLIST) {
@@ -2169,13 +2196,13 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
struct inpcb *inp;
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
}
struct socket *
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 3292c6a..f23af58 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -128,23 +128,35 @@ struct in_conninfo {
struct icmp6_filter;
/*-
- * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4
- * and IPv6 sockets. In the case of TCP, further per-connection state is
+ * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
+ * IPv6 sockets. In the case of TCP and UDP, further per-connection state is
* hung off of inp_ppcb most of the time. Almost all fields of struct inpcb
* are static after creation or protected by a per-inpcb rwlock, inp_lock. A
- * few fields also require the global pcbinfo lock for the inpcb to be held,
- * when modified, such as the global connection lists and hashes, as well as
- * binding information (which affects which hash a connection is on). This
- * model means that connections can be looked up without holding the
- * per-connection lock, which is important for performance when attempting to
- * find the connection for a packet given its IP and port tuple. Writing to
- * these fields that write locks be held on both the inpcb and global locks.
+ * few fields are protected by multiple locks as indicated in the locking notes
+ * below. For these fields, all of the listed locks must be write-locked for
+ * any modifications. However, these fields can be safely read while any one of
+ * the listed locks are read-locked. This model can permit greater concurrency
+ * for read operations. For example, connections can be looked up while only
+ * holding a read lock on the global pcblist lock. This is important for
+ * performance when attempting to find the connection for a packet given its IP
+ * and port tuple.
+ *
+ * One noteworthy exception is that the global pcbinfo lock follows a different
+ * set of rules in relation to the inp_list field. Rather than being
+ * write-locked for modifications and read-locked for list iterations, it must
+ * be read-locked during modifications and write-locked during list iterations.
+ * This ensures that the relatively rare global list iterations safely walk a
+ * stable snapshot of connections while allowing more common list modifications
+ * to safely grab the pcblist lock just while adding or removing a connection
+ * from the global list.
*
* Key:
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
+ * (l) - Protected by the pcblist lock for the inpcb
+ * (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
*
@@ -159,15 +171,21 @@ struct icmp6_filter;
* socket has been freed), or there may be close(2)-related races.
*
* The inp_vflag field is overloaded, and would otherwise ideally be (c).
+ *
+ * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock
+ * read-lock usage during modification, this model can be applied to other
+ * protocols (especially SCTP).
*/
struct inpcb {
- LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */
+ LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
- LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */
+ LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
+ /* (p[w]) for list iteration */
+ /* (p[r]/l) for addition/removal */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
- LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */
+ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
struct socket *inp_socket; /* (i) back pointer to socket */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
@@ -185,7 +203,7 @@ struct inpcb {
* general use */
/* Local and foreign ports, local and foreign addr. */
- struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */
+ struct in_conninfo inp_inc; /* (i) list for PCB's local port */
/* MAC and IPSEC policy information. */
struct label *inp_label; /* (i) MAC label */
@@ -210,8 +228,8 @@ struct inpcb {
int inp6_cksum;
short inp6_hops;
} inp_depend6;
- LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */
- struct inpcbport *inp_phd; /* (i/p) head of this list */
+ LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */
+ struct inpcbport *inp_phd; /* (i/h) head of this list */
#define inp_zero_size offsetof(struct inpcb, inp_gencnt)
inp_gen_t inp_gencnt; /* (c) generation count */
struct llentry *inp_lle; /* cached L2 information */
@@ -275,37 +293,46 @@ struct inpcbport {
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
*
- * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock,
- * the former covering mutable global fields (such as the global pcb list),
- * and the latter covering the hashed lookup tables. The lock order is:
+ * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
+ * ipi_list_lock:
+ * - ipi_lock covering the global pcb list stability during loop iteration,
+ * - ipi_hash_lock covering the hashed lookup tables,
+ * - ipi_list_lock covering mutable global fields (such as the global
+ * pcb list)
+ *
+ * The lock order is:
*
- * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks}
+ * ipi_lock (before)
+ * inpcb locks (before)
+ * ipi_list locks (before)
+ * {ipi_hash_lock, pcbgroup locks}
*
* Locking key:
*
* (c) Constant or nearly constant after initialisation
* (g) Locked by ipi_lock
+ * (l) Locked by ipi_list_lock
* (h) Read using either ipi_hash_lock or inpcb lock; write requires both
* (p) Protected by one or more pcbgroup locks
* (x) Synchronisation properties poorly defined
*/
struct inpcbinfo {
/*
- * Global lock protecting global inpcb list, inpcb count, etc.
+ * Global lock protecting full inpcb list traversal
*/
struct rwlock ipi_lock;
/*
* Global list of inpcbs on the protocol.
*/
- struct inpcbhead *ipi_listhead; /* (g) */
- u_int ipi_count; /* (g) */
+ struct inpcbhead *ipi_listhead; /* (g/l) */
+ u_int ipi_count; /* (l) */
/*
* Generation count -- incremented each time a connection is allocated
* or freed.
*/
- u_quad_t ipi_gencnt; /* (g) */
+ u_quad_t ipi_gencnt; /* (l) */
/*
* Fields associated with port lookup and allocation.
@@ -363,6 +390,11 @@ struct inpcbinfo {
* general use 2
*/
void *ipi_pspare[2];
+
+ /*
+ * Global lock protecting global inpcb list, inpcb count, etc.
+ */
+ struct rwlock ipi_list_lock;
};
#ifdef _KERNEL
@@ -455,6 +487,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock)
#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
@@ -462,6 +495,25 @@ short inp_so_options(const struct inpcb *inp);
#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+#define INP_LIST_LOCK_INIT(ipi, d) \
+ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
+#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock)
+#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock)
+#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock)
+#define INP_LIST_LOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
+#define INP_LIST_RLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
+#define INP_LIST_WLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
+#define INP_LIST_UNLOCK_ASSERT(ipi) \
+ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
+
#define INP_HASH_LOCK_INIT(ipi, d) \
rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock)
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 174670a..254454f 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -571,7 +571,7 @@ tcp_input(struct mbuf *m, int off0)
char *s = NULL; /* address and port logging */
int ti_locked;
#define TI_UNLOCKED 1
-#define TI_WLOCKED 2
+#define TI_RLOCKED 2
#ifdef TCPDEBUG
/*
@@ -757,8 +757,8 @@ tcp_input(struct mbuf *m, int off0)
* connection in TIMEWAIT and SYNs not targeting a listening socket.
*/
if ((thflags & (TH_FIN | TH_RST)) != 0) {
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
} else
ti_locked = TI_UNLOCKED;
@@ -780,8 +780,8 @@ tcp_input(struct mbuf *m, int off0)
findpcb:
#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
} else {
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
}
@@ -941,11 +941,11 @@ findpcb:
relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
@@ -956,9 +956,9 @@ relocked:
goto findpcb;
}
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, TO_SYN);
@@ -967,7 +967,7 @@ relocked:
*/
if (tcp_twcheck(inp, &to, th, m, tlen))
goto findpcb;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return;
}
/*
@@ -998,17 +998,17 @@ relocked:
*/
#ifdef INVARIANTS
if ((thflags & (TH_FIN | TH_RST)) != 0)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
#endif
if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
(tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
!(tp->t_flags & TF_FASTOPEN)))) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
@@ -1020,9 +1020,9 @@ relocked:
}
goto relocked;
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
}
#ifdef MAC
@@ -1077,7 +1077,7 @@ relocked:
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1132,7 +1132,11 @@ new_tfo_socket:
*/
INP_WUNLOCK(inp); /* listen socket */
inp = sotoinpcb(so);
- INP_WLOCK(inp); /* new connection */
+ /*
+ * New connection inpcb is already locked by
+ * syncache_expand().
+ */
+ INP_WLOCK_ASSERT(inp);
tp = intotcpcb(inp);
KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
("%s: ", __func__));
@@ -1368,8 +1372,8 @@ new_tfo_socket:
* Entry added to syncache and mbuf consumed.
* Only the listen socket is unlocked by syncache_add().
*/
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
@@ -1418,8 +1422,8 @@ new_tfo_socket:
dropwithreset:
TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1442,8 +1446,8 @@ dropunlock:
if (m != NULL)
TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1501,13 +1505,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
tp->t_state != TCPS_ESTABLISHED) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
"SYN/FIN/RST/!EST", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
} else {
#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
else {
KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
"ti_locked: %d", __func__, ti_locked));
@@ -1676,8 +1680,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* This is a pure ack for outstanding data.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
TCPSTAT_INC(tcps_predack);
@@ -1780,8 +1784,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing on the reassembly queue and we have enough
* buffer space to take it.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
/* Clean receiver SACK report if present */
@@ -2039,9 +2043,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_state_change(tp, TCPS_SYN_RECEIVED);
}
- KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
"ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2169,10 +2173,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
case TCPS_CLOSE_WAIT:
so->so_error = ECONNRESET;
close:
- KASSERT(ti_locked == TI_WLOCKED,
+ KASSERT(ti_locked == TI_RLOCKED,
("tcp_do_segment: TH_RST 1 ti_locked %d",
ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tcp_state_change(tp, TCPS_CLOSED);
TCPSTAT_INC(tcps_drops);
@@ -2181,10 +2185,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
case TCPS_CLOSING:
case TCPS_LAST_ACK:
- KASSERT(ti_locked == TI_WLOCKED,
+ KASSERT(ti_locked == TI_RLOCKED,
("tcp_do_segment: TH_RST 2 ti_locked %d",
ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_close(tp);
break;
@@ -2288,9 +2292,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((so->so_state & SS_NOFDREF) &&
tp->t_state > TCPS_CLOSE_WAIT && tlen) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
"CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
@@ -2364,9 +2368,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* error and we send an RST and drop the connection.
*/
if (thflags & TH_SYN) {
- KASSERT(ti_locked == TI_WLOCKED,
+ KASSERT(ti_locked == TI_RLOCKED,
("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_drop(tp, ECONNRESET);
rstreason = BANDLIM_UNLIMITED;
@@ -2902,9 +2906,9 @@ process_ACK:
*/
case TCPS_CLOSING:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return;
}
@@ -2918,7 +2922,7 @@ process_ACK:
*/
case TCPS_LAST_ACK:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_close(tp);
goto drop;
}
@@ -3137,18 +3141,18 @@ dodata: /* XXX */
* standard timers.
*/
case TCPS_FIN_WAIT_2:
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
"TCP_FIN_WAIT_2 ti_locked: %d", __func__,
ti_locked));
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return;
}
}
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
#ifdef TCPDEBUG
@@ -3203,8 +3207,8 @@ dropafterack:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
@@ -3214,8 +3218,8 @@ dropafterack:
return;
dropwithreset:
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
if (tp != NULL) {
@@ -3226,8 +3230,8 @@ dropwithreset:
return;
drop:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index b3c7309..28af834 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -956,7 +956,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
/*
* New connections already part way through being initialised
* with the CC algo we're removing will not race with this code
@@ -986,7 +986,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
@@ -1004,7 +1004,7 @@ tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so = tp->t_inpcb->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (TCPS_HAVERCVDSYN(tp->t_state)) {
@@ -1171,7 +1171,7 @@ tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
struct inpcb *inp;
CURVNET_SET(tp->t_vnet);
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
__func__, tp));
@@ -1186,13 +1186,13 @@ tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
tp->t_inpcb = NULL;
uma_zfree(V_tcpcb_zone, tp);
if (in_pcbrele_wlocked(inp)) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -1206,7 +1206,7 @@ tcp_close(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
@@ -1265,7 +1265,7 @@ tcp_drain(void)
* where we're really low on mbufs, this is potentially
* useful.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
if (inpb->inp_flags & INP_TIMEWAIT)
continue;
@@ -1276,7 +1276,7 @@ tcp_drain(void)
}
INP_WUNLOCK(inpb);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
@@ -1295,7 +1295,7 @@ tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -1359,10 +1359,10 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
/*
* OK, now we're committed to doing something.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_LIST_RLOCK(&V_tcbinfo);
gencnt = V_tcbinfo.ipi_gencnt;
n = V_tcbinfo.ipi_count;
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_LIST_RUNLOCK(&V_tcbinfo);
m = syncache_pcbcount();
@@ -1385,7 +1385,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_INFO_WLOCK(&V_tcbinfo);
for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
INP_WLOCK(inp);
@@ -1410,7 +1410,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
}
INP_WUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
n = i;
error = 0;
@@ -1448,14 +1448,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (!in_pcbrele_rlocked(inp))
INP_RUNLOCK(inp);
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
if (!error) {
/*
@@ -1465,11 +1465,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
* while we were processing this request, and it
* might be necessary to retry.
*/
- INP_INFO_RLOCK(&V_tcbinfo);
+ INP_LIST_RLOCK(&V_tcbinfo);
xig.xig_gen = V_tcbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_LIST_RUNLOCK(&V_tcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
free(inp_list, M_TEMP);
@@ -1630,7 +1630,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
- offsetof(struct icmp, icmp_ip));
th = (struct tcphdr *)((caddr_t)ip
+ (ip->ip_hl << 2));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport,
ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
if (inp != NULL) {
@@ -1690,7 +1690,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
inc.inc_laddr = ip->ip_src;
syncache_unreach(&inc, th);
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
} else
in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
@@ -1763,9 +1763,9 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
inc.inc_flags |= INC_ISIPV6;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
syncache_unreach(&inc, &th);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
} else
in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
0, cmd, NULL, notify);
@@ -1898,7 +1898,7 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) ||
@@ -2422,7 +2422,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
default:
return (EINVAL);
}
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
switch (addrs[0].ss_family) {
#ifdef INET6
case AF_INET6:
@@ -2461,7 +2461,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
INP_WUNLOCK(inp);
} else
error = ESRCH;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 10cb03b..2ce36d9 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -665,6 +665,8 @@ done:
/*
* Build a new TCP socket structure from a syncache entry.
+ *
+ * On success return the newly created socket with its underlying inp locked.
*/
static struct socket *
syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
@@ -675,7 +677,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
int error;
char *s;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Ok, create the full blown connection, and set things up
@@ -706,6 +708,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
inp = sotoinpcb(so);
inp->inp_inc.inc_fibnum = so->so_fibnum;
INP_WLOCK(inp);
+ /*
+ * Exclusive pcbinfo lock is not required in syncache socket case even
+ * if two inpcb locks can be acquired simultaneously:
+ * - the inpcb in LISTEN state,
+ * - the newly created inp.
+ *
+ * In this case, an inp cannot be at same time in LISTEN state and
+ * just created by an accept() call.
+ */
INP_HASH_WLOCK(&V_tcbinfo);
/* Insert new socket into PCB hash list. */
@@ -920,8 +931,6 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- INP_WUNLOCK(inp);
-
TCPSTAT_INC(tcps_accepts);
return (so);
@@ -939,6 +948,9 @@ abort2:
* in the syncache, and if its there, we pull it out of
* the cache and turn it into a full-blown connection in
* the SYN-RECEIVED state.
+ *
+ * On syncache_socket() success the newly created socket
+ * has its underlying inp locked.
*/
int
syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
@@ -953,7 +965,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* Global TCP locks are held because we manipulate the PCB lists
* and create a new socket.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
("%s: can handle only ACK", __func__));
@@ -1105,7 +1117,7 @@ syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m,
* Global TCP locks are held because we manipulate the PCB lists
* and create a new socket.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending;
*lsop = syncache_socket(sc, *lsop, m);
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 776ff4e..78f8571 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -266,7 +266,7 @@ tcp_timer_2msl(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
@@ -274,14 +274,14 @@ tcp_timer_2msl(void *xtp)
if (callout_pending(&tp->t_timers->tt_2msl) ||
!callout_active(&tp->t_timers->tt_2msl)) {
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_2msl);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -304,7 +304,7 @@ tcp_timer_2msl(void *xtp)
*/
if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -330,7 +330,7 @@ tcp_timer_2msl(void *xtp)
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -346,21 +346,21 @@ tcp_timer_keep(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_keep) ||
!callout_active(&tp->t_timers->tt_keep)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_keep);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -414,7 +414,7 @@ tcp_timer_keep(void *xtp)
PRU_SLOWTIMO);
#endif
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
@@ -429,7 +429,7 @@ dropit:
#endif
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -444,21 +444,21 @@ tcp_timer_persist(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_persist) ||
!callout_active(&tp->t_timers->tt_persist)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_persist);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -507,7 +507,7 @@ out:
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -556,22 +556,6 @@ tcp_timer_rexmt(void * xtp)
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
- in_pcbref(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
- INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp)) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
@@ -779,7 +763,7 @@ out:
if (tp != NULL)
INP_WUNLOCK(inp);
if (headlocked)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 8139c6d..73d23d8 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -201,10 +201,10 @@ tcp_tw_destroy(void)
{
struct tcptw *tw;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
@@ -227,7 +227,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/* A dropped inp should never transition to TIME_WAIT state. */
@@ -268,8 +268,8 @@ tcp_twstart(struct tcpcb *tp)
* allowed. Remove a connection from TIMEWAIT queue in LRU
* fashion to make room for this connection.
*
- * pcbinfo lock is needed here to prevent deadlock as
- * two inpcb locks can be acquired simultaneously.
+ * XXX: Check if it possible to always have enough room
+ * in advance based on guarantees provided by uma_zalloc().
*/
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
@@ -380,7 +380,7 @@ tcp_twrecycleable(struct tcptw *tw)
tcp_seq new_iss = tw->iss;
tcp_seq new_irs = tw->irs;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
@@ -403,7 +403,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
int thflags;
tcp_seq seq;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -504,7 +504,7 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
tcp_tw_2msl_stop(tw, reuse);
@@ -659,7 +659,7 @@ static void
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
TW_WLOCK(V_tw_lock);
@@ -677,7 +677,7 @@ tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
struct inpcb *inp;
int released;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
inp = tw->tw_inpcb;
@@ -707,13 +707,18 @@ tcp_tw_2msl_scan(int reuse)
#ifdef INVARIANTS
if (reuse) {
/*
- * pcbinfo lock is needed in reuse case to prevent deadlock
- * as two inpcb locks can be acquired simultaneously:
+ * Exclusive pcbinfo lock is not required in reuse case even if
+ * two inpcb locks can be acquired simultaneously:
* - the inpcb transitioning to TIME_WAIT state in
* tcp_tw_start(),
* - the inpcb closed by tcp_twclose().
+ *
+ * It is because only inpcbs in FIN_WAIT2 or CLOSING states can
+ * transition in TIME_WAIT state. Then a pcbcb cannot be in
+ * TIME_WAIT list and transitioning to TIME_WAIT state at same
+ * time.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
}
#endif
@@ -731,26 +736,26 @@ tcp_tw_2msl_scan(int reuse)
in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
INP_WLOCK(inp);
tw = intotw(inp);
if (in_pcbrele_wlocked(inp)) {
KASSERT(tw == NULL, ("%s: held last inp "
"reference but tw not NULL", __func__));
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
continue;
}
if (tw == NULL) {
/* tcp_twclose() has already been called */
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
continue;
}
tcp_twclose(tw, reuse);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
if (reuse)
return tw;
} else {
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 131eb28..5a875ae 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -166,7 +166,7 @@ tcp_detach(struct socket *so, struct inpcb *inp)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
@@ -195,10 +195,10 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* and leave inpcb present until timewait ends
* #2 tcp_detach is called at timewait end (or reuse) by
* tcp_twclose, then the tcptw has already been discarded
- * and inpcb is freed here
+ * (or reused) and inpcb is freed here
* #3 tcp_detach is called() after timewait ends (or reuse)
* (e.g. by soclose), then tcptw has already been discarded
- * and inpcb is freed here
+ * (or reused) and inpcb is freed here
*
* In all three cases the tcptw should not be freed here.
*/
@@ -260,15 +260,20 @@ static void
tcp_usr_detach(struct socket *so)
{
struct inpcb *inp;
+ int rlock = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ if (!INP_INFO_WLOCKED(&V_tcbinfo)) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ rlock = 1;
+ }
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_detach: inp_socket == NULL"));
tcp_detach(so, inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (rlock)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#ifdef INET
@@ -638,7 +643,7 @@ tcp_usr_disconnect(struct socket *so)
int error = 0;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
INP_WLOCK(inp);
@@ -654,7 +659,7 @@ tcp_usr_disconnect(struct socket *so)
out:
TCPDEBUG2(PRU_DISCONNECT);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -769,7 +774,7 @@ tcp_usr_shutdown(struct socket *so)
struct tcpcb *tp = NULL;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("inp == NULL"));
INP_WLOCK(inp);
@@ -787,7 +792,7 @@ tcp_usr_shutdown(struct socket *so)
out:
TCPDEBUG2(PRU_SHUTDOWN);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -861,7 +866,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* this call.
*/
if (flags & PRUS_EOF)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
INP_WLOCK(inp);
@@ -918,7 +923,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* Close the send side of the connection after
* the data is sent.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
socantsendmore(so);
tcp_usrclosed(tp);
}
@@ -982,7 +987,7 @@ out:
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
INP_WUNLOCK(inp);
if (flags & PRUS_EOF)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -999,7 +1004,7 @@ tcp_usr_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_abort: inp_socket == NULL"));
@@ -1021,7 +1026,7 @@ tcp_usr_abort(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -1037,7 +1042,7 @@ tcp_usr_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_close: inp_socket == NULL"));
@@ -1060,7 +1065,7 @@ tcp_usr_close(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -1721,10 +1726,10 @@ tcp_attach(struct socket *so)
}
so->so_rcv.sb_flags |= SB_AUTOSIZE;
so->so_snd.sb_flags |= SB_AUTOSIZE;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
error = in_pcballoc(so, &V_tcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
inp = sotoinpcb(so);
@@ -1740,12 +1745,12 @@ tcp_attach(struct socket *so)
if (tp == NULL) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED;
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (0);
}
@@ -1763,7 +1768,7 @@ tcp_disconnect(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -1801,7 +1806,7 @@ static void
tcp_usrclosed(struct tcpcb *tp)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
switch (tp->t_state) {
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 1adaf4d..60e5fd4 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -338,7 +338,7 @@ toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
struct tcphdr *th, struct socket **lsop)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
return (syncache_expand(inc, to, th, lsop, NULL));
}
@@ -369,7 +369,7 @@ toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* for twcheck */
if (!tcp_twcheck(inp, NULL, th, NULL, 0))
return (EADDRINUSE);
} else {
@@ -573,7 +573,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
(void) tcp_output(tp);
} else {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_drop(tp, err);
if (tp == NULL)
INP_WLOCK(inp); /* re-acquire */
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 907496b..139a83d 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -768,7 +768,7 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
struct ip6_moptions *im6o;
int i, gap;
- INP_INFO_RLOCK(pcbinfo);
+ INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(in6p);
im6o = in6p->in6p_moptions;
@@ -799,7 +799,7 @@ in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
}
INP_WUNLOCK(in6p);
}
- INP_INFO_RUNLOCK(pcbinfo);
+ INP_INFO_WUNLOCK(pcbinfo);
}
/*
OpenPOWER on IntegriCloud