summaryrefslogtreecommitdiffstats
path: root/sys/netinet/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet/tcp_input.c')
-rw-r--r--sys/netinet/tcp_input.c238
1 files changed, 99 insertions, 139 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 1a94d0a..080b4da 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -5,6 +5,7 @@
* Swinburne University of Technology, Melbourne, Australia.
* Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
* Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
* Portions of this software were developed at the Centre for Advanced Internet
@@ -16,6 +17,9 @@
* Internet Architectures, Swinburne University of Technology, Melbourne,
* Australia by David Hayes under sponsorship from the FreeBSD Foundation.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -197,10 +201,6 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_max), 0,
"Max size of automatic receive buffer");
-int tcp_read_locking = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
- &tcp_read_locking, 0, "Enable read locking strategy");
-
VNET_DEFINE(struct inpcbhead, tcb);
#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
@@ -591,8 +591,7 @@ tcp_input(struct mbuf *m, int off0)
char *s = NULL; /* address and port logging */
int ti_locked;
#define TI_UNLOCKED 1
-#define TI_RLOCKED 2
-#define TI_WLOCKED 3
+#define TI_WLOCKED 2
#ifdef TCPDEBUG
/*
@@ -756,30 +755,25 @@ tcp_input(struct mbuf *m, int off0)
drop_hdrlen = off0 + off;
/*
- * Locate pcb for segment, which requires a lock on tcbinfo.
- * Optimisticaly acquire a global read lock rather than a write lock
- * unless header flags necessarily imply a state change. There are
- * two cases where we might discover later we need a write lock
- * despite the flags: ACKs moving a connection out of the syncache,
- * and ACKs for a connection in TIMEWAIT.
+ * Locate pcb for segment; if we're likely to add or remove a
+ * connection then first acquire pcbinfo lock. There are two cases
+ * where we might discover later we need a write lock despite the
+ * flags: ACKs moving a connection out of the syncache, and ACKs for
+ * a connection in TIMEWAIT.
*/
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tcp_read_locking == 0) {
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
INP_INFO_WLOCK(&V_tcbinfo);
ti_locked = TI_WLOCKED;
- } else {
- INP_INFO_RLOCK(&V_tcbinfo);
- ti_locked = TI_RLOCKED;
- }
+ } else
+ ti_locked = TI_UNLOCKED;
findpcb:
#ifdef INVARIANTS
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- else
- panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
+ } else {
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
#endif
#ifdef INET
@@ -797,20 +791,18 @@ findpcb:
* Transparently forwarded. Pretend to be the destination.
* already got one like this?
*/
- inp = in_pcblookup_hash(&V_tcbinfo,
- ip->ip_src, th->th_sport,
- ip->ip_dst, th->th_dport,
- 0, m->m_pkthdr.rcvif);
+ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
+ m->m_pkthdr.rcvif);
if (!inp) {
- /* It's new. Try to find the ambushing socket. */
- inp = in_pcblookup_hash(&V_tcbinfo,
- ip->ip_src, th->th_sport,
- next_hop->sin_addr,
- next_hop->sin_port ?
- ntohs(next_hop->sin_port) :
- th->th_dport,
- INPLOOKUP_WILDCARD,
- m->m_pkthdr.rcvif);
+ /*
+ * It's new. Try to find the ambushing socket.
+ */
+ inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
+ th->th_sport, next_hop->sin_addr,
+ next_hop->sin_port ? ntohs(next_hop->sin_port) :
+ th->th_dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
}
/* Remove the tag from the packet. We don't need it anymore. */
m_tag_delete(m, fwd_tag);
@@ -820,21 +812,19 @@ findpcb:
{
#ifdef INET6
if (isipv6)
- inp = in6_pcblookup_hash(&V_tcbinfo,
- &ip6->ip6_src, th->th_sport,
- &ip6->ip6_dst, th->th_dport,
- INPLOOKUP_WILDCARD,
- m->m_pkthdr.rcvif);
+ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
+ th->th_sport, &ip6->ip6_dst, th->th_dport,
+ INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
+ m->m_pkthdr.rcvif);
#endif
#if defined(INET) && defined(INET6)
else
#endif
#ifdef INET
- inp = in_pcblookup_hash(&V_tcbinfo,
- ip->ip_src, th->th_sport,
- ip->ip_dst, th->th_dport,
- INPLOOKUP_WILDCARD,
- m->m_pkthdr.rcvif);
+ inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
+ th->th_sport, ip->ip_dst, th->th_dport,
+ INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
+ m->m_pkthdr.rcvif);
#endif
}
@@ -865,7 +855,7 @@ findpcb:
rstreason = BANDLIM_RST_CLOSEDPORT;
goto dropwithreset;
}
- INP_WLOCK(inp);
+ INP_WLOCK_ASSERT(inp);
if (!(inp->inp_flags & INP_HW_FLOWID)
&& (m->m_flags & M_FLOWID)
&& ((inp->inp_socket == NULL)
@@ -906,28 +896,26 @@ findpcb:
* legitimate new connection attempt the old INPCB gets removed and
* we can try again to find a listening socket.
*
- * At this point, due to earlier optimism, we may hold a read lock on
- * the inpcbinfo, rather than a write lock. If so, we need to
- * upgrade, or if that fails, acquire a reference on the inpcb, drop
- * all locks, acquire a global write lock, and then re-acquire the
- * inpcb lock. We may at that point discover that another thread has
- * tried to free the inpcb, in which case we need to loop back and
- * try to find a new inpcb to deliver to.
+ * At this point, due to earlier optimism, we may hold only an inpcb
+ * lock, and not the inpcbinfo write lock. If so, we need to try to
+ * acquire it, or if that fails, acquire a reference on the inpcb,
+ * drop all locks, acquire a global write lock, and then re-acquire
+ * the inpcb lock. We may at that point discover that another thread
+ * has tried to free the inpcb, in which case we need to loop back
+ * and try to find a new inpcb to deliver to.
+ *
+ * XXXRW: It may be time to rethink timewait locking.
*/
relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
-
- if (ti_locked == TI_RLOCKED) {
- if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
+ if (ti_locked == TI_UNLOCKED) {
+ if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
INP_INFO_WLOCK(&V_tcbinfo);
ti_locked = TI_WLOCKED;
INP_WLOCK(inp);
- if (in_pcbrele(inp)) {
+ if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
@@ -975,26 +963,24 @@ relocked:
/*
* We've identified a valid inpcb, but it could be that we need an
- * inpcbinfo write lock and have only a read lock. In this case,
- * attempt to upgrade/relock using the same strategy as the TIMEWAIT
- * case above. If we relock, we have to jump back to 'relocked' as
- * the connection might now be in TIMEWAIT.
+ * inpcbinfo write lock but don't hold it. In this case, attempt to
+ * acquire using the same strategy as the TIMEWAIT case above. If we
+ * relock, we have to jump back to 'relocked' as the connection might
+ * now be in TIMEWAIT.
*/
- if (tp->t_state != TCPS_ESTABLISHED ||
- (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tcp_read_locking == 0) {
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("%s: upgrade check ti_locked %d", __func__, ti_locked));
-
- if (ti_locked == TI_RLOCKED) {
- if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
+#ifdef INVARIANTS
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+#endif
+ if (tp->t_state != TCPS_ESTABLISHED) {
+ if (ti_locked == TI_UNLOCKED) {
+ if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
INP_INFO_WLOCK(&V_tcbinfo);
ti_locked = TI_WLOCKED;
INP_WLOCK(inp);
- if (in_pcbrele(inp)) {
+ if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
@@ -1027,13 +1013,16 @@ relocked:
/*
* When the socket is accepting connections (the INPCB is in LISTEN
* state) we look into the SYN cache if this is a new connection
- * attempt or the completion of a previous one.
+ * attempt or the completion of a previous one. Because listen
+ * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
+ * held in this case.
*/
if (so->so_options & SO_ACCEPTCONN) {
struct in_conninfo inc;
KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
"tp not listening", __func__));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
bzero(&inc, sizeof(inc));
#ifdef INET6
@@ -1371,13 +1360,17 @@ relocked:
return;
dropwithreset:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
- ti_locked = TI_UNLOCKED;
+ ti_locked = TI_UNLOCKED;
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
if (inp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
@@ -1388,13 +1381,17 @@ dropwithreset:
goto drop;
dropunlock:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
- ti_locked = TI_UNLOCKED;
+ ti_locked = TI_UNLOCKED;
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
if (inp != NULL)
INP_WUNLOCK(inp);
@@ -1449,13 +1446,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
} else {
#ifdef INVARIANTS
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- else
- panic("%s: ti_locked %d for EST", __func__,
- ti_locked);
+ else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
#endif
}
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -1601,13 +1598,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* This is a pure ack for outstanding data.
*/
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: ti_locked %d on pure ACK",
- __func__, ti_locked);
ti_locked = TI_UNLOCKED;
TCPSTAT_INC(tcps_predack);
@@ -1708,13 +1700,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing on the reassembly queue and we have enough
* buffer space to take it.
*/
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: ti_locked %d on pure data "
- "segment", __func__, ti_locked);
ti_locked = TI_UNLOCKED;
/* Clean receiver SACK report if present */
@@ -2550,9 +2537,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
process_ACK:
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_input: process_ACK ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
@@ -2716,9 +2700,6 @@ process_ACK:
}
step6:
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_do_segment: step6 ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2804,9 +2785,6 @@ step6:
tp->rcv_up = tp->rcv_nxt;
}
dodata: /* XXX */
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_do_segment: dodata ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2938,13 +2916,8 @@ dodata: /* XXX */
return;
}
}
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dodata epilogue ti_locked %d", __func__,
- ti_locked);
ti_locked = TI_UNLOCKED;
#ifdef TCPDEBUG
@@ -2973,9 +2946,6 @@ check_delack:
return;
dropafterack:
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
-
/*
* Generate an ACK dropping incoming segment if it occupies
* sequence space, where the ACK reflects our state.
@@ -3002,13 +2972,8 @@ dropafterack:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropafterack epilogue ti_locked %d", __func__,
- ti_locked);
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
@@ -3018,12 +2983,8 @@ dropafterack:
return;
dropwithreset:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
ti_locked = TI_UNLOCKED;
if (tp != NULL) {
@@ -3034,15 +2995,14 @@ dropwithreset:
return;
drop:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
#ifdef INVARIANTS
else
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
#endif
- ti_locked = TI_UNLOCKED;
/*
* Drop space held by incoming segment and return.
OpenPOWER on IntegriCloud