summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrwatson <rwatson@FreeBSD.org>2006-04-01 16:36:36 +0000
committerrwatson <rwatson@FreeBSD.org>2006-04-01 16:36:36 +0000
commit5078a28ae8712cfa4dfde732bd6ce4b0c4844bb0 (patch)
treefbe5163bbedb12aefb046285a48eec04db08aa3d
parent7f0301fd6c390eefc92e5a70b2a44bc8285ccb62 (diff)
downloadFreeBSD-src-5078a28ae8712cfa4dfde732bd6ce4b0c4844bb0.zip
FreeBSD-src-5078a28ae8712cfa4dfde732bd6ce4b0c4844bb0.tar.gz
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach(): - Universally support and enforce the invariant that so_pcb is never NULL, converting dozens of unnecessary NULL checks into assertions, and eliminating dozens of unnecessary error handling cases in protocol code. - In some cases, eliminate unnecessary pcbinfo locking, as it is no longer required to ensure so_pcb != NULL. For example, the receive code no longer requires the pcbinfo lock, and the send code only requires it if building a new connection on an otherwise unconnected socket triggered via sendto() with an address. This should significnatly reduce tcbinfo lock contention in the receive and send cases. - In order to support the invariant that so_pcb != NULL, it is now necessary for the TCP code to not discard the tcpcb any time a connection is dropped, but instead leave the tcpcb until the socket is shutdown. This case is handled by setting INP_DROPPED, to substitute for using a NULL so_pcb to indicate that the connection has been dropped. This requires the inpcb lock, but not the pcbinfo lock. - Unlike all other protocols in the tree, TCP may need to retain access to the socket after the file descriptor has been closed. Set SS_PROTOREF in tcp_detach() in order to prevent the socket from being freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether or not it needs to free the socket when the connection finally does close. The typical case where this occurs is if close() is called on a TCP socket before all sent data in the send socket buffer has been transmitted or acknowledged. If INP_SOCKREF is found when the connection is dropped, we release the inpcb, tcpcb, and socket instead of flagging INP_DROPPED. - Abort and detach protocol switch methods no longer return failures, nor attempt to free sockets, as the socket layer does this. - Annotate the existence of a long-standing race in the TCP timer code, in which timers are stopped but not drained when the socket is freed, as waiting for drain may lead to deadlocks, or have to occur in a context where waiting is not permitted. This race has been handled by testing to see if the tcpcb pointer in the inpcb is NULL (and vice versa), which is not normally permitted, but may be true of a inpcb and tcpcb have been freed. Add a counter to test how often this race has actually occurred, and a large comment for each instance where we compare potentially freed memory with NULL. This will have to be fixed in the near future, but requires is to further address how to handle the timer shutdown shutdown issue. - Several TCP calls no longer potentially free the passed inpcb/tcpcb, so no longer need to return a pointer to indicate whether the argument passed in is still valid. - Un-macroize debugging and locking setup for various protocol switch methods for TCP, as it lead to more obscurity, and as locking becomes more customized to the methods, offers less benefit. - Assert copyright on tcp_usrreq.c due to significant modifications that have been made as part of this work. These changes significantly modify the memory management and connection logic of our TCP implementation, and are (as such) High Risk Changes, and likely to contain serious bugs. Please report problems to the current@ mailing list ASAP, ideally with simple test cases, and optionally, packet traces. MFC after: 3 months
-rw-r--r--sys/netinet/tcp_input.c2
-rw-r--r--sys/netinet/tcp_reass.c2
-rw-r--r--sys/netinet/tcp_subr.c156
-rw-r--r--sys/netinet/tcp_timer.c77
-rw-r--r--sys/netinet/tcp_timewait.c156
-rw-r--r--sys/netinet/tcp_usrreq.c569
-rw-r--r--sys/netinet/tcp_var.h4
7 files changed, 693 insertions, 273 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 18ccc34..c1ab6fc 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -3200,7 +3200,7 @@ tcp_timewait(tw, to, th, m, tlen)
* are above the previous ones.
*/
if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
- (void) tcp_twclose(tw, 0);
+ tcp_twclose(tw, 0);
return (1);
}
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index 18ccc34..c1ab6fc 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -3200,7 +3200,7 @@ tcp_timewait(tw, to, th, m, tlen)
* are above the previous ones.
*/
if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
- (void) tcp_twclose(tw, 0);
+ tcp_twclose(tw, 0);
return (1);
}
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index e61d541..e3d5f15 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -213,7 +213,6 @@ SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);
-static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
/*
@@ -665,7 +664,7 @@ tcp_drop(tp, errno)
return (tcp_close(tp));
}
-static void
+void
tcp_discardcb(tp)
struct tcpcb *tp;
{
@@ -676,6 +675,12 @@ tcp_discardcb(tp)
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ /*
+ * XXXRW: This is all very well and good, but actually, we might be
+ * discarding the tcpcb after the socket is gone, so we can't do
+ * this:
+ KASSERT(so != NULL, ("tcp_discardcb: so == NULL"));
+ */
INP_LOCK_ASSERT(inp);
/*
@@ -755,7 +760,12 @@ tcp_discardcb(tp)
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
- soisdisconnected(so);
+
+ /*
+ * XXXRW: This seems a bit unclean.
+ */
+ if (so != NULL)
+ soisdisconnected(so);
}
/*
@@ -769,22 +779,40 @@ tcp_close(tp)
struct tcpcb *tp;
{
struct inpcb *inp = tp->t_inpcb;
-#ifdef INET6
- struct socket *so = inp->inp_socket;
-#endif
+ struct socket *so;
INP_INFO_WLOCK_ASSERT(&tcbinfo);
INP_LOCK_ASSERT(inp);
- tcp_discardcb(tp);
+ inp->inp_vflag |= INP_DROPPED;
+
+ tcpstat.tcps_closed++;
+ KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
+ so = inp->inp_socket;
+ soisdisconnected(so);
+ if (inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_close: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
+ tcp_discardcb(tp);
#ifdef INET6
- if (INP_CHECK_SOCKAF(so, AF_INET6))
- in6_pcbdetach(inp);
- else
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
#endif
- in_pcbdetach(inp);
- tcpstat.tcps_closed++;
- return (NULL);
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ return (NULL);
+ }
+ return (tp);
}
void
@@ -857,8 +885,11 @@ tcp_notify(inp, error)
return (inp);
} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
tp->t_softerror) {
- tcp_drop(tp, error);
- return (struct inpcb *)0;
+ tp = tcp_drop(tp, error);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
} else {
tp->t_softerror = error;
return (inp);
@@ -1433,8 +1464,11 @@ tcp_drop_syn_sent(inp, errno)
INP_LOCK_ASSERT(inp);
if (tp != NULL && tp->t_state == TCPS_SYN_SENT) {
- tcp_drop(tp, errno);
- return (NULL);
+ tp = tcp_drop(tp, errno);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
}
return (inp);
}
@@ -1670,7 +1704,9 @@ tcp_twstart(tp)
if (tw == NULL) {
tw = tcp_timer_2msl_tw(1);
if (tw == NULL) {
- tcp_close(tp);
+ tp = tcp_close(tp);
+ if (tp != NULL)
+ INP_UNLOCK(tp->t_inpcb);
return;
}
}
@@ -1705,21 +1741,45 @@ tcp_twstart(tp)
*/
tw_time = 2 * tcp_msl;
acknow = tp->t_flags & TF_ACKNOW;
+
+ /*
+ * First, discard tcpcb state, which includes stopping its timers and
+ * freeing it. tcp_discardcb() used to also release the inpcb, but
+ * that work is now done in the caller.
+ */
tcp_discardcb(tp);
so = inp->inp_socket;
- ACCEPT_LOCK();
SOCK_LOCK(so);
- so->so_pcb = NULL;
tw->tw_cred = crhold(so->so_cred);
tw->tw_so_options = so->so_options;
- sotryfree(so);
- inp->inp_socket = NULL;
+ SOCK_UNLOCK(so);
if (acknow)
tcp_twrespond(tw, TH_ACK);
inp->inp_ppcb = (caddr_t)tw;
inp->inp_vflag |= INP_TIMEWAIT;
tcp_timer_2msl_reset(tw, tw_time);
- INP_UNLOCK(inp);
+
+ /*
+ * If the inpcb owns the sole reference to the socket, then we can
+ * detach and free the socket as it is not needed in time wait.
+ */
+ if (inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_twstart: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ INP_UNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ } else
+ INP_UNLOCK(inp);
}
/*
@@ -1756,31 +1816,62 @@ tcp_twrecycleable(struct tcptw *tw)
return (0);
}
-struct tcptw *
+void
tcp_twclose(struct tcptw *tw, int reuse)
{
+ struct socket *so;
struct inpcb *inp;
+ /*
+ * At this point, we should have an inpcb<->twtcp pair, with no
+ * associated socket. Validate that this is the case.
+ *
+ * XXXRW: This comment stale -- could still have socket ...?
+ */
inp = tw->tw_inpcb;
+ KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
+ KASSERT(inp->inp_ppcb == (void *)tw, ("tcp_twclose: inp_ppcb != tw"));
INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_timer_2msl_stop(). */
INP_LOCK_ASSERT(inp);
tw->tw_inpcb = NULL;
tcp_timer_2msl_stop(tw);
inp->inp_ppcb = NULL;
+ inp->inp_vflag |= INP_DROPPED;
+
+ so = inp->inp_socket;
+ if (so != NULL && inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_twclose: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
#ifdef INET6
- if (inp->inp_vflag & INP_IPV6PROTO)
- in6_pcbdetach(inp);
- else
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ }
+#endif
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ } else if (so == NULL) {
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ in6_pcbfree(inp);
+ else
#endif
- in_pcbdetach(inp);
+ in_pcbfree(inp);
+ } else
+ printf("tcp_twclose: so != NULL but !INP_SOCKREF");
tcpstat.tcps_closed++;
crfree(tw->tw_cred);
tw->tw_cred = NULL;
if (reuse)
- return (tw);
+ return;
uma_zfree(tcptw_zone, tw);
- return (NULL);
}
int
@@ -2233,11 +2324,10 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
INP_LOCK(inp);
if ((tw = intotw(inp)) &&
(inp->inp_vflag & INP_TIMEWAIT) != 0) {
- (void) tcp_twclose(tw, 0);
+ tcp_twclose(tw, 0);
} else if ((tp = intotcpcb(inp)) &&
((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) {
- tp = tcp_drop(tp, ECONNABORTED);
- if (tp != NULL)
+ if (tcp_drop(tp, ECONNABORTED) != NULL)
INP_UNLOCK(inp);
} else
INP_UNLOCK(inp);
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 45808af..db2e618 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -125,6 +125,10 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
+static int tcp_timer_race;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
+ 0, "Count of t_inpcb races on tcp_discardcb");
+
/*
* TCP timer processing.
*/
@@ -138,7 +142,15 @@ tcp_timer_delack(xtp)
INP_INFO_RLOCK(&tcbinfo);
inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
+ */
if (inp == NULL) {
+ tcp_timer_race++;
INP_INFO_RUNLOCK(&tcbinfo);
return;
}
@@ -167,10 +179,21 @@ tcp_timer_2msl(xtp)
ostate = tp->t_state;
#endif
+ /*
+ * XXXRW: Does this actually happen?
+ */
INP_INFO_WLOCK(&tcbinfo);
inp = tp->t_inpcb;
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
+ */
if (inp == NULL) {
- INP_INFO_WUNLOCK(&tcbinfo);
+ tcp_timer_race++;
+ INP_INFO_RUNLOCK(&tcbinfo);
return;
}
INP_LOCK(inp);
@@ -267,14 +290,12 @@ tcp_timer_2msl_tw(int reuse)
for (i = 0; i < 2; i++) {
twl = tw_2msl_list[i];
tw_tail = &twl->tw_tail;
- for (;;) {
- tw = LIST_FIRST(&twl->tw_list);
- if (tw == tw_tail || (!reuse && tw->tw_time > ticks))
- break;
- INP_LOCK(tw->tw_inpcb);
- if (tcp_twclose(tw, reuse) != NULL)
- return (tw);
- }
+ tw = LIST_FIRST(&twl->tw_list);
+ if (tw == tw_tail || (!reuse && tw->tw_time > ticks))
+ continue;
+ INP_LOCK(tw->tw_inpcb);
+ tcp_twclose(tw, reuse);
+ return (tw);
}
return (NULL);
}
@@ -293,8 +314,16 @@ tcp_timer_keep(xtp)
#endif
INP_INFO_WLOCK(&tcbinfo);
inp = tp->t_inpcb;
- if (!inp) {
- INP_INFO_WUNLOCK(&tcbinfo);
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_RUNLOCK(&tcbinfo);
return;
}
INP_LOCK(inp);
@@ -375,8 +404,16 @@ tcp_timer_persist(xtp)
#endif
INP_INFO_WLOCK(&tcbinfo);
inp = tp->t_inpcb;
- if (!inp) {
- INP_INFO_WUNLOCK(&tcbinfo);
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_RUNLOCK(&tcbinfo);
return;
}
INP_LOCK(inp);
@@ -412,7 +449,7 @@ tcp_timer_persist(xtp)
out:
#ifdef TCPDEBUG
- if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
+ if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
@@ -437,8 +474,16 @@ tcp_timer_rexmt(xtp)
INP_INFO_WLOCK(&tcbinfo);
headlocked = 1;
inp = tp->t_inpcb;
- if (!inp) {
- INP_INFO_WUNLOCK(&tcbinfo);
+ /*
+ * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+ * tear-down mean we need it as a work-around for races between
+ * timers and tcp_discardcb().
+ *
+ * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
+ */
+ if (inp == NULL) {
+ tcp_timer_race++;
+ INP_INFO_RUNLOCK(&tcbinfo);
return;
}
INP_LOCK(inp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index e61d541..e3d5f15 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -213,7 +213,6 @@ SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);
-static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
/*
@@ -665,7 +664,7 @@ tcp_drop(tp, errno)
return (tcp_close(tp));
}
-static void
+void
tcp_discardcb(tp)
struct tcpcb *tp;
{
@@ -676,6 +675,12 @@ tcp_discardcb(tp)
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ /*
+ * XXXRW: This is all very well and good, but actually, we might be
+ * discarding the tcpcb after the socket is gone, so we can't do
+ * this:
+ KASSERT(so != NULL, ("tcp_discardcb: so == NULL"));
+ */
INP_LOCK_ASSERT(inp);
/*
@@ -755,7 +760,12 @@ tcp_discardcb(tp)
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
- soisdisconnected(so);
+
+ /*
+ * XXXRW: This seems a bit unclean.
+ */
+ if (so != NULL)
+ soisdisconnected(so);
}
/*
@@ -769,22 +779,40 @@ tcp_close(tp)
struct tcpcb *tp;
{
struct inpcb *inp = tp->t_inpcb;
-#ifdef INET6
- struct socket *so = inp->inp_socket;
-#endif
+ struct socket *so;
INP_INFO_WLOCK_ASSERT(&tcbinfo);
INP_LOCK_ASSERT(inp);
- tcp_discardcb(tp);
+ inp->inp_vflag |= INP_DROPPED;
+
+ tcpstat.tcps_closed++;
+ KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
+ so = inp->inp_socket;
+ soisdisconnected(so);
+ if (inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_close: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
+ tcp_discardcb(tp);
#ifdef INET6
- if (INP_CHECK_SOCKAF(so, AF_INET6))
- in6_pcbdetach(inp);
- else
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
#endif
- in_pcbdetach(inp);
- tcpstat.tcps_closed++;
- return (NULL);
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ return (NULL);
+ }
+ return (tp);
}
void
@@ -857,8 +885,11 @@ tcp_notify(inp, error)
return (inp);
} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
tp->t_softerror) {
- tcp_drop(tp, error);
- return (struct inpcb *)0;
+ tp = tcp_drop(tp, error);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
} else {
tp->t_softerror = error;
return (inp);
@@ -1433,8 +1464,11 @@ tcp_drop_syn_sent(inp, errno)
INP_LOCK_ASSERT(inp);
if (tp != NULL && tp->t_state == TCPS_SYN_SENT) {
- tcp_drop(tp, errno);
- return (NULL);
+ tp = tcp_drop(tp, errno);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
}
return (inp);
}
@@ -1670,7 +1704,9 @@ tcp_twstart(tp)
if (tw == NULL) {
tw = tcp_timer_2msl_tw(1);
if (tw == NULL) {
- tcp_close(tp);
+ tp = tcp_close(tp);
+ if (tp != NULL)
+ INP_UNLOCK(tp->t_inpcb);
return;
}
}
@@ -1705,21 +1741,45 @@ tcp_twstart(tp)
*/
tw_time = 2 * tcp_msl;
acknow = tp->t_flags & TF_ACKNOW;
+
+ /*
+ * First, discard tcpcb state, which includes stopping its timers and
+ * freeing it. tcp_discardcb() used to also release the inpcb, but
+ * that work is now done in the caller.
+ */
tcp_discardcb(tp);
so = inp->inp_socket;
- ACCEPT_LOCK();
SOCK_LOCK(so);
- so->so_pcb = NULL;
tw->tw_cred = crhold(so->so_cred);
tw->tw_so_options = so->so_options;
- sotryfree(so);
- inp->inp_socket = NULL;
+ SOCK_UNLOCK(so);
if (acknow)
tcp_twrespond(tw, TH_ACK);
inp->inp_ppcb = (caddr_t)tw;
inp->inp_vflag |= INP_TIMEWAIT;
tcp_timer_2msl_reset(tw, tw_time);
- INP_UNLOCK(inp);
+
+ /*
+ * If the inpcb owns the sole reference to the socket, then we can
+ * detach and free the socket as it is not needed in time wait.
+ */
+ if (inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_twstart: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ INP_UNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ } else
+ INP_UNLOCK(inp);
}
/*
@@ -1756,31 +1816,62 @@ tcp_twrecycleable(struct tcptw *tw)
return (0);
}
-struct tcptw *
+void
tcp_twclose(struct tcptw *tw, int reuse)
{
+ struct socket *so;
struct inpcb *inp;
+ /*
+ * At this point, we should have an inpcb<->twtcp pair, with no
+ * associated socket. Validate that this is the case.
+ *
+ * XXXRW: This comment stale -- could still have socket ...?
+ */
inp = tw->tw_inpcb;
+ KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
+ KASSERT(inp->inp_ppcb == (void *)tw, ("tcp_twclose: inp_ppcb != tw"));
INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_timer_2msl_stop(). */
INP_LOCK_ASSERT(inp);
tw->tw_inpcb = NULL;
tcp_timer_2msl_stop(tw);
inp->inp_ppcb = NULL;
+ inp->inp_vflag |= INP_DROPPED;
+
+ so = inp->inp_socket;
+ if (so != NULL && inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_twclose: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
#ifdef INET6
- if (inp->inp_vflag & INP_IPV6PROTO)
- in6_pcbdetach(inp);
- else
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ }
+#endif
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ } else if (so == NULL) {
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ in6_pcbfree(inp);
+ else
#endif
- in_pcbdetach(inp);
+ in_pcbfree(inp);
+ } else
+ printf("tcp_twclose: so != NULL but !INP_SOCKREF");
tcpstat.tcps_closed++;
crfree(tw->tw_cred);
tw->tw_cred = NULL;
if (reuse)
- return (tw);
+ return;
uma_zfree(tcptw_zone, tw);
- return (NULL);
}
int
@@ -2233,11 +2324,10 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
INP_LOCK(inp);
if ((tw = intotw(inp)) &&
(inp->inp_vflag & INP_TIMEWAIT) != 0) {
- (void) tcp_twclose(tw, 0);
+ tcp_twclose(tw, 0);
} else if ((tp = intotcpcb(inp)) &&
((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) {
- tp = tcp_drop(tp, ECONNABORTED);
- if (tp != NULL)
+ if (tcp_drop(tp, ECONNABORTED) != NULL)
INP_UNLOCK(inp);
} else
INP_UNLOCK(inp);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 902cab6..99b5e17 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1,6 +1,8 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
- * The Regents of the University of California. All rights reserved.
+ * The Regents of the University of California.
+ * Copyright (c) 2006 Robert N. M. Watson
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -89,10 +91,8 @@ static int tcp_connect(struct tcpcb *, struct sockaddr *,
static int tcp6_connect(struct tcpcb *, struct sockaddr *,
struct thread *td);
#endif /* INET6 */
-static struct tcpcb *
- tcp_disconnect(struct tcpcb *);
-static struct tcpcb *
- tcp_usrclosed(struct tcpcb *);
+static void tcp_disconnect(struct tcpcb *);
+static void tcp_usrclosed(struct tcpcb *);
static void tcp_fill_info(struct tcpcb *, struct tcp_info *);
#ifdef TCPDEBUG
@@ -113,18 +113,15 @@ static void tcp_fill_info(struct tcpcb *, struct tcp_info *);
static int
tcp_usr_attach(struct socket *so, int proto, struct thread *td)
{
- int error;
struct inpcb *inp;
- struct tcpcb *tp = 0;
+ struct tcpcb *tp = NULL;
+ int error;
TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
INP_INFO_WLOCK(&tcbinfo);
TCPDEBUG1();
- inp = sotoinpcb(so);
- if (inp) {
- error = EISCONN;
- goto out;
- }
error = tcp_attach(so);
if (error)
@@ -153,62 +150,94 @@ tcp_usr_detach(struct socket *so)
{
struct inpcb *inp;
struct tcpcb *tp;
+#ifdef INET6
+ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
inp = sotoinpcb(so);
- if (inp == NULL) {
- INP_INFO_WUNLOCK(&tcbinfo);
- return;
- }
+ KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
+ INP_INFO_WLOCK(&tcbinfo);
INP_LOCK(inp);
- tp = intotcpcb(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_detach: inp_socket == NULL"));
+
TCPDEBUG1();
- tp = tcp_disconnect(tp);
+ tp = intotcpcb(inp);
+ if (inp->inp_vflag & INP_TIMEWAIT) {
+ if (inp->inp_vflag & INP_DROPPED) {
+ /*
+ * Connection was in time wait and has been dropped;
+ * the calling path is via tcp_twclose(), which will
+ * free the tcptw, so we can discard the remainder.
+ *
+ * XXXRW: Would it be cleaner to free the tcptw
+ * here?
+ */
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ } else {
+ /*
+ * Connection is in time wait and has not yet been
+ * dropped; allow the socket to be discarded, but
+ * need to keep inpcb until end of time wait.
+ */
+#ifdef INET6
+ if (isipv6)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ INP_UNLOCK(inp);
+ }
+ } else {
+ tp = intotcpcb(inp);
+ if (inp->inp_vflag & INP_DROPPED ||
+ tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Connection has been dropped or is a listen socket,
+ * tear down all pcb state and allow socket to be
+ * freed.
+ */
+ tcp_discardcb(tp);
+#ifdef INET6
+ if (isipv6) {
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ } else {
+ /*
+ * Connection state still required, as is socket, so
+ * mark socket for TCP to free later.
+ */
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_vflag |= INP_SOCKREF;
+ INP_UNLOCK(inp);
+ }
+ }
+ tp = NULL;
TCPDEBUG2(PRU_DETACH);
- if (tp)
- INP_UNLOCK(inp);
INP_INFO_WUNLOCK(&tcbinfo);
}
-#define INI_NOLOCK 0
-#define INI_READ 1
-#define INI_WRITE 2
-
-#define COMMON_START() \
- TCPDEBUG0; \
- do { \
- if (inirw == INI_READ) \
- INP_INFO_RLOCK(&tcbinfo); \
- else if (inirw == INI_WRITE) \
- INP_INFO_WLOCK(&tcbinfo); \
- inp = sotoinpcb(so); \
- if (inp == 0) { \
- if (inirw == INI_READ) \
- INP_INFO_RUNLOCK(&tcbinfo); \
- else if (inirw == INI_WRITE) \
- INP_INFO_WUNLOCK(&tcbinfo); \
- return EINVAL; \
- } \
- INP_LOCK(inp); \
- if (inirw == INI_READ) \
- INP_INFO_RUNLOCK(&tcbinfo); \
- tp = intotcpcb(inp); \
- TCPDEBUG1(); \
-} while(0)
-
-#define COMMON_END(req) \
-out: TCPDEBUG2(req); \
- do { \
- if (tp) \
- INP_UNLOCK(inp); \
- if (inirw == INI_WRITE) \
- INP_INFO_WUNLOCK(&tcbinfo); \
- return error; \
- goto out; \
-} while(0)
-
/*
* Give the socket an address.
*/
@@ -217,9 +246,8 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
+ struct tcpcb *tp = NULL;
struct sockaddr_in *sinp;
- const int inirw = INI_WRITE;
sinp = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sinp))
@@ -232,11 +260,24 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
return (EAFNOSUPPORT);
- COMMON_START();
- error = in_pcbbind(inp, nam, td->td_ucred);
- if (error)
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
goto out;
- COMMON_END(PRU_BIND);
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ error = in_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ return (error);
}
#ifdef INET6
@@ -245,9 +286,8 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
+ struct tcpcb *tp = NULL;
struct sockaddr_in6 *sin6p;
- const int inirw = INI_WRITE;
sin6p = (struct sockaddr_in6 *)nam;
if (nam->sa_len != sizeof (*sin6p))
@@ -260,7 +300,17 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
return (EAFNOSUPPORT);
- COMMON_START();
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
@@ -278,9 +328,11 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
}
}
error = in6_pcbbind(inp, nam, td->td_ucred);
- if (error)
- goto out;
- COMMON_END(PRU_BIND);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
}
#endif /* INET6 */
@@ -292,10 +344,19 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- const int inirw = INI_WRITE;
+ struct tcpcb *tp = NULL;
- COMMON_START();
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
SOCK_LOCK(so);
error = solisten_proto_check(so);
if (error == 0 && inp->inp_lport == 0)
@@ -305,7 +366,12 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
solisten_proto(so, backlog);
}
SOCK_UNLOCK(so);
- COMMON_END(PRU_LISTEN);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
}
#ifdef INET6
@@ -314,10 +380,19 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- const int inirw = INI_WRITE;
+ struct tcpcb *tp = NULL;
- COMMON_START();
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
SOCK_LOCK(so);
error = solisten_proto_check(so);
if (error == 0 && inp->inp_lport == 0) {
@@ -331,7 +406,12 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
solisten_proto(so, backlog);
}
SOCK_UNLOCK(so);
- COMMON_END(PRU_LISTEN);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
}
#endif /* INET6 */
@@ -347,9 +427,8 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
+ struct tcpcb *tp = NULL;
struct sockaddr_in *sinp;
- const int inirw = INI_WRITE;
sinp = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sinp))
@@ -363,11 +442,25 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
if (jailed(td->td_ucred))
prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
- COMMON_START();
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
if ((error = tcp_connect(tp, nam, td)) != 0)
goto out;
error = tcp_output(tp);
- COMMON_END(PRU_CONNECT);
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
}
#ifdef INET6
@@ -376,9 +469,10 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
+ struct tcpcb *tp = NULL;
struct sockaddr_in6 *sin6p;
- const int inirw = INI_WRITE;
+
+ TCPDEBUG0;
sin6p = (struct sockaddr_in6 *)nam;
if (nam->sa_len != sizeof (*sin6p))
@@ -390,7 +484,16 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
&& IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
return (EAFNOSUPPORT);
- COMMON_START();
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
struct sockaddr_in sin;
@@ -413,7 +516,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
if ((error = tcp6_connect(tp, nam, td)) != 0)
goto out;
error = tcp_output(tp);
- COMMON_END(PRU_CONNECT);
+
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
}
#endif /* INET6 */
@@ -431,14 +539,27 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
static int
tcp_usr_disconnect(struct socket *so)
{
- int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- const int inirw = INI_WRITE;
+ struct tcpcb *tp = NULL;
+ int error = 0;
- COMMON_START();
- tp = tcp_disconnect(tp);
- COMMON_END(PRU_DISCONNECT);
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+out:
+ TCPDEBUG2(PRU_DISCONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
}
/*
@@ -461,14 +582,13 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
goto out;
}
- INP_INFO_RLOCK(&tcbinfo);
inp = sotoinpcb(so);
- if (!inp) {
- INP_INFO_RUNLOCK(&tcbinfo);
- return (EINVAL);
- }
+ KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
INP_LOCK(inp);
- INP_INFO_RUNLOCK(&tcbinfo);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
tp = intotcpcb(inp);
TCPDEBUG1();
@@ -480,9 +600,9 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
port = inp->inp_fport;
addr = inp->inp_faddr;
-out: TCPDEBUG2(PRU_ACCEPT);
- if (tp)
- INP_UNLOCK(inp);
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_UNLOCK(inp);
if (error == 0)
*nam = in_sockaddr(port, &addr);
return error;
@@ -506,16 +626,16 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
goto out;
}
- INP_INFO_RLOCK(&tcbinfo);
inp = sotoinpcb(so);
- if (inp == 0) {
- INP_INFO_RUNLOCK(&tcbinfo);
- return (EINVAL);
- }
+ KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
INP_LOCK(inp);
- INP_INFO_RUNLOCK(&tcbinfo);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
tp = intotcpcb(inp);
TCPDEBUG1();
+
/*
* We inline in6_mapped_peeraddr and COMMON_END here, so that we can
* copy the data of interest and defer the malloc until after we
@@ -530,9 +650,9 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
addr6 = inp->in6p_faddr;
}
-out: TCPDEBUG2(PRU_ACCEPT);
- if (tp)
- INP_UNLOCK(inp);
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_UNLOCK(inp);
if (error == 0) {
if (v4)
*nam = in6_v4mapsin6_sockaddr(port, &addr);
@@ -572,15 +692,29 @@ tcp_usr_shutdown(struct socket *so)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- const int inirw = INI_WRITE;
+ struct tcpcb *tp = NULL;
- COMMON_START();
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
socantsendmore(so);
- tp = tcp_usrclosed(tp);
- if (tp)
- error = tcp_output(tp);
- COMMON_END(PRU_SHUTDOWN);
+ tcp_usrclosed(tp);
+ error = tcp_output(tp);
+
+out:
+ TCPDEBUG2(PRU_SHUTDOWN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ return (error);
}
/*
@@ -589,14 +723,26 @@ tcp_usr_shutdown(struct socket *so)
static int
tcp_usr_rcvd(struct socket *so, int flags)
{
- int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- const int inirw = INI_READ;
+ struct tcpcb *tp = NULL;
+ int error = 0;
- COMMON_START();
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
tcp_output(tp);
- COMMON_END(PRU_RCVD);
+
+out:
+ TCPDEBUG2(PRU_RCVD);
+ INP_UNLOCK(inp);
+ return (error);
}
/*
@@ -612,37 +758,33 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- int unlocked = 0;
+ struct tcpcb *tp = NULL;
+ int headlocked = 0;
#ifdef INET6
int isipv6;
#endif
TCPDEBUG0;
/*
- * Need write lock here because this function might call
- * tcp_connect or tcp_usrclosed.
- * We really want to have to this function upgrade from read lock
- * to write lock. XXX
+ * We require the pcbinfo lock in two cases:
+ *
+ * (1) An implied connect is taking place, which can result in
+ * binding IPs and ports and hence modification of the pcb hash
+ * chains.
+ *
+ * (2) PRUS_EOF is set, resulting in explicit close on the send.
*/
- INP_INFO_WLOCK(&tcbinfo);
+ if ((nam != NULL) || (flags & PRUS_EOF)) {
+ INP_INFO_WLOCK(&tcbinfo);
+ headlocked = 1;
+ }
inp = sotoinpcb(so);
- if (inp == NULL) {
- /*
- * OOPS! we lost a race, the TCP session got reset after
- * we checked SBS_CANTSENDMORE, eg: while doing uiomove or a
- * network interrupt in the non-splnet() section of sosend().
- */
- if (m)
- m_freem(m);
- if (control)
- m_freem(control);
- error = ECONNRESET; /* XXX EPIPE? */
- tp = NULL;
- TCPDEBUG1();
+ KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
goto out;
}
- INP_LOCK(inp);
#ifdef INET6
isipv6 = nam && nam->sa_family == AF_INET6;
#endif /* INET6 */
@@ -668,6 +810,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* initialize maxseg/maxopd using peer's cached
* MSS.
*/
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
#ifdef INET6
if (isipv6)
error = tcp6_connect(tp, nam, td);
@@ -679,17 +822,19 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
}
-
if (flags & PRUS_EOF) {
/*
* Close the send side of the connection after
* the data is sent.
*/
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
socantsendmore(so);
- tp = tcp_usrclosed(tp);
+ tcp_usrclosed(tp);
+ }
+ if (headlocked) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
}
- INP_INFO_WUNLOCK(&tcbinfo);
- unlocked = 1;
if (tp != NULL) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
@@ -698,6 +843,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->t_flags &= ~TF_MORETOCOME;
}
} else {
+ /*
+ * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
+ */
SOCKBUF_LOCK(&so->so_snd);
if (sbspace(&so->so_snd) < -512) {
SOCKBUF_UNLOCK(&so->so_snd);
@@ -722,6 +870,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* initialize maxseg/maxopd using peer's cached
* MSS.
*/
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
#ifdef INET6
if (isipv6)
error = tcp6_connect(tp, nam, td);
@@ -732,9 +881,12 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
goto out;
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ } else if (nam) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
}
- INP_INFO_WUNLOCK(&tcbinfo);
- unlocked = 1;
tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
tp->t_flags |= TF_FORCEDATA;
error = tcp_output(tp);
@@ -743,9 +895,8 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
- if (tp)
- INP_UNLOCK(inp);
- if (!unlocked)
+ INP_UNLOCK(inp);
+ if (headlocked)
INP_INFO_WUNLOCK(&tcbinfo);
return (error);
}
@@ -756,22 +907,37 @@ out:
static void
tcp_usr_abort(struct socket *so)
{
+#if 0
struct inpcb *inp;
struct tcpcb *tp;
- TCPDEBUG0;
+#endif
+
+ /*
+ * XXXRW: This is not really quite the same, as we want to tcp_drop()
+ * rather than tcp_disconnect(), I think, but for now I'll avoid
+ * replicating all the tear-down logic here.
+ */
+ tcp_usr_detach(so);
+#if 0
+ TCPDEBUG0;
INP_INFO_WLOCK(&tcbinfo);
inp = sotoinpcb(so);
- if (inp == NULL)
- return;
INP_LOCK(inp);
- tp = intotcpcb(inp);
+ /*
+ * Do we need to handle timewait here? Aborted connections should
+ * never generate a FIN?
+ */
+ KASSERT((inp->inp_vflag & INP_TIMEWAIT) == 0,
+ ("tcp_usr_abort: timewait"));
+ tp = intotcpcb(inp);
TCPDEBUG1();
tp = tcp_drop(tp, ECONNABORTED);
- TCPDEBUG2(PRU_ABORT);
- if (tp)
+ TCPDEBUG2(PRU_ABORT);
+ if (tp != NULL)
INP_UNLOCK(inp);
INP_INFO_WUNLOCK(&tcbinfo);
+#endif
}
/*
@@ -782,10 +948,18 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
int error = 0;
struct inpcb *inp;
- struct tcpcb *tp;
- const int inirw = INI_READ;
+ struct tcpcb *tp = NULL;
- COMMON_START();
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
if ((so->so_oobmark == 0 &&
(so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
so->so_options & SO_OOBINLINE ||
@@ -801,7 +975,11 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
*mtod(m, caddr_t) = tp->t_iobc;
if ((flags & MSG_PEEK) == 0)
tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
- COMMON_END(PRU_RCVOOB);
+
+out:
+ TCPDEBUG2(PRU_RCVOOB);
+ INP_UNLOCK(inp);
+ return (error);
}
struct pr_usrreqs tcp_usrreqs = {
@@ -866,6 +1044,9 @@ tcp_connect(tp, nam, td)
u_short lport;
int error;
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error)
@@ -917,6 +1098,9 @@ tcp6_connect(tp, nam, td)
struct in6_addr *addr6;
int error;
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error)
@@ -1026,14 +1210,9 @@ tcp_ctloutput(so, sopt)
struct tcp_info ti;
error = 0;
- INP_INFO_RLOCK(&tcbinfo);
inp = sotoinpcb(so);
- if (inp == NULL) {
- INP_INFO_RUNLOCK(&tcbinfo);
- return (ECONNRESET);
- }
+ KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
INP_LOCK(inp);
- INP_INFO_RUNLOCK(&tcbinfo);
if (sopt->sopt_level != IPPROTO_TCP) {
INP_UNLOCK(inp);
#ifdef INET6
@@ -1044,6 +1223,10 @@ tcp_ctloutput(so, sopt)
error = ip_ctloutput(so, sopt);
return (error);
}
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
tp = intotcpcb(inp);
switch (sopt->sopt_dir) {
@@ -1158,6 +1341,7 @@ tcp_ctloutput(so, sopt)
}
break;
}
+out:
INP_UNLOCK(inp);
return (error);
}
@@ -1210,19 +1394,19 @@ tcp_attach(so)
#endif
inp->inp_vflag |= INP_IPV4;
tp = tcp_newtcpcb(inp);
- if (tp == 0) {
- int nofd = so->so_state & SS_NOFDREF; /* XXX */
-
- so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
-
+ if (tp == NULL) {
INP_LOCK(inp);
#ifdef INET6
- if (isipv6)
+ if (isipv6) {
in6_pcbdetach(inp);
- else
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
#endif
- in_pcbdetach(inp);
- so->so_state |= nofd;
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED;
@@ -1237,7 +1421,7 @@ tcp_attach(so)
* current input data; switch states based on user close, and
* send segment to peer (with FIN).
*/
-static struct tcpcb *
+static void
tcp_disconnect(tp)
register struct tcpcb *tp;
{
@@ -1247,18 +1431,24 @@ tcp_disconnect(tp)
INP_INFO_WLOCK_ASSERT(&tcbinfo);
INP_LOCK_ASSERT(inp);
- if (tp->t_state < TCPS_ESTABLISHED)
+ /*
+ * Neither tcp_close() nor tcp_drop() should return NULL, as the
+ * socket is still open.
+ */
+ if (tp->t_state < TCPS_ESTABLISHED) {
tp = tcp_close(tp);
- else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_close() returned NULL"));
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
tp = tcp_drop(tp, 0);
- else {
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_drop() returned NULL"));
+ } else {
soisdisconnecting(so);
sbflush(&so->so_rcv);
- tp = tcp_usrclosed(tp);
- if (tp)
- (void) tcp_output(tp);
+ tcp_usrclosed(tp);
+ tcp_output(tp);
}
- return (tp);
}
/*
@@ -1271,7 +1461,7 @@ tcp_disconnect(tp)
* for peer to send FIN or not respond to keep-alives, etc.
* We can let the user exit from the close as soon as the FIN is acked.
*/
-static struct tcpcb *
+static void
tcp_usrclosed(tp)
register struct tcpcb *tp;
{
@@ -1285,6 +1475,12 @@ tcp_usrclosed(tp)
case TCPS_LISTEN:
tp->t_state = TCPS_CLOSED;
tp = tcp_close(tp);
+ /*
+ * tcp_close() should never return NULL here as the socket is
+ * still open.
+ */
+ KASSERT(tp != NULL,
+ ("tcp_usrclosed: tcp_close() returned NULL"));
break;
case TCPS_SYN_SENT:
@@ -1307,5 +1503,4 @@ tcp_usrclosed(tp)
callout_reset(tp->tt_2msl, tcp_maxidle,
tcp_timer_2msl, tp);
}
- return (tp);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 0ef12aa..e3e4819 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -518,10 +518,10 @@ extern int tcp_do_sack; /* SACK enabled/disabled */
struct tcpcb *
tcp_close(struct tcpcb *);
+void tcp_discardcb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
int tcp_twrecycleable(struct tcptw *tw);
-struct tcptw *
- tcp_twclose(struct tcptw *_tw, int _reuse);
+void tcp_twclose(struct tcptw *_tw, int _reuse);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *
OpenPOWER on IntegriCloud