summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--sys/netinet/tcp_subr.c109
-rw-r--r--sys/netinet/tcp_timer.c174
-rw-r--r--sys/netinet/tcp_timer.h24
-rw-r--r--sys/netinet/tcp_var.h5
4 files changed, 209 insertions, 103 deletions
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index c492de7..cbc6239 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -230,6 +230,7 @@ static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
void *ip4hdr, const void *ip6hdr);
+static void tcp_timer_discard(struct tcpcb *, uint32_t);
/*
* Target size of TCP PCB hash tables. Must be a power of two.
@@ -790,7 +791,13 @@ tcp_newtcpcb(struct inpcb *inp)
if (V_tcp_do_sack)
tp->t_flags |= TF_SACK_PERMIT;
TAILQ_INIT(&tp->snd_holes);
- tp->t_inpcb = inp; /* XXX */
+ /*
+ * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
+ * is called.
+ */
+ in_pcbref(inp); /* Reference for tcpcb */
+ tp->t_inpcb = inp;
+
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
@@ -909,6 +916,7 @@ tcp_discardcb(struct tcpcb *tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ int released;
INP_WLOCK_ASSERT(inp);
@@ -916,22 +924,15 @@ tcp_discardcb(struct tcpcb *tp)
* Make sure that all of our timers are stopped before we delete the
* PCB.
*
- * XXXRW: Really, we would like to use callout_drain() here in order
- * to avoid races experienced in tcp_timer.c where a timer is already
- * executing at this point. However, we can't, both because we're
- * running in a context where we can't sleep, and also because we
- * hold locks required by the timers. What we instead need to do is
- * test to see if callout_drain() is required, and if so, defer some
- * portion of the remainder of tcp_discardcb() to an asynchronous
- * context that can callout_drain() and then continue. Some care
- * will be required to ensure that no further processing takes place
- * on the tcpcb, even though it hasn't been freed (a flag?).
+ * If stopping a timer fails, we schedule a discard function in same
+ * callout, and the last discard function called will take care of
+ * deleting the tcpcb.
*/
- callout_stop(&tp->t_timers->tt_rexmt);
- callout_stop(&tp->t_timers->tt_persist);
- callout_stop(&tp->t_timers->tt_keep);
- callout_stop(&tp->t_timers->tt_2msl);
- callout_stop(&tp->t_timers->tt_delack);
+ tcp_timer_stop(tp, TT_REXMT);
+ tcp_timer_stop(tp, TT_PERSIST);
+ tcp_timer_stop(tp, TT_KEEP);
+ tcp_timer_stop(tp, TT_2MSL);
+ tcp_timer_stop(tp, TT_DELACK);
/*
* If we got enough samples through the srtt filter,
@@ -1008,8 +1009,80 @@ tcp_discardcb(struct tcpcb *tp)
CC_ALGO(tp) = NULL;
inp->inp_ppcb = NULL;
- tp->t_inpcb = NULL;
- uma_zfree(V_tcpcb_zone, tp);
+ if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
+ /* We own the last reference on tcpcb, let's free it. */
+ tp->t_inpcb = NULL;
+ uma_zfree(V_tcpcb_zone, tp);
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp %p should not have been released "
+ "here", __func__, inp));
+ }
+}
+
+void
+tcp_timer_2msl_discard(void *xtp)
+{
+
+ tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL);
+}
+
+void
+tcp_timer_keep_discard(void *xtp)
+{
+
+ tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP);
+}
+
+void
+tcp_timer_persist_discard(void *xtp)
+{
+
+ tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST);
+}
+
+void
+tcp_timer_rexmt_discard(void *xtp)
+{
+
+ tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT);
+}
+
+void
+tcp_timer_delack_discard(void *xtp)
+{
+
+ tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK);
+}
+
+void
+tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
+{
+ struct inpcb *inp;
+
+ CURVNET_SET(tp->t_vnet);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ inp = tp->t_inpcb;
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
+ __func__, tp));
+ INP_WLOCK(inp);
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
+ ("%s: tcpcb has to be stopped here", __func__));
+ KASSERT((tp->t_timers->tt_flags & timer_type) != 0,
+ ("%s: discard callout should be running", __func__));
+ tp->t_timers->tt_flags &= ~timer_type;
+ if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
+ /* We own the last reference on this tcpcb, let's free it. */
+ tp->t_inpcb = NULL;
+ uma_zfree(V_tcpcb_zone, tp);
+ if (in_pcbrele_wlocked(inp)) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ }
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
}
/*
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 80ec61c..d5295b4 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -209,10 +209,6 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
-static int tcp_timer_race;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
- 0, "Count of t_inpcb races on tcp_discardcb");
-
/*
* TCP timer processing.
*/
@@ -225,18 +221,7 @@ tcp_timer_delack(void *xtp)
CURVNET_SET(tp->t_vnet);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_delack) ||
!callout_active(&tp->t_timers->tt_delack)) {
@@ -250,6 +235,10 @@ tcp_timer_delack(void *xtp)
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+ KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
+ ("%s: tp %p delack callout should be running", __func__, tp));
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
@@ -269,24 +258,9 @@ tcp_timer_2msl(void *xtp)
ostate = tp->t_state;
#endif
- /*
- * XXXRW: Does this actually happen?
- */
INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
tcp_free_sackholes(tp);
if (callout_pending(&tp->t_timers->tt_2msl) ||
@@ -303,6 +277,10 @@ tcp_timer_2msl(void *xtp)
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+ KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
+ ("%s: tp %p 2msl callout should be running", __func__, tp));
/*
* 2 MSL timeout in shutdown went off. If we're closed but
* still waiting for peer to close and connection has been idle
@@ -352,19 +330,7 @@ tcp_timer_keep(void *xtp)
#endif
INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_keep) ||
!callout_active(&tp->t_timers->tt_keep)) {
@@ -380,6 +346,10 @@ tcp_timer_keep(void *xtp)
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+ KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
+ ("%s: tp %p keep callout should be running", __func__, tp));
/*
* Keep-alive timer went off; send something
* or drop connection if idle for too long.
@@ -455,19 +425,7 @@ tcp_timer_persist(void *xtp)
#endif
INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_persist) ||
!callout_active(&tp->t_timers->tt_persist)) {
@@ -483,6 +441,10 @@ tcp_timer_persist(void *xtp)
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+ KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
+ ("%s: tp %p persist callout should be running", __func__, tp));
/*
* Persistance timer into zero window.
* Force a byte to be output, if possible.
@@ -544,19 +506,7 @@ tcp_timer_rexmt(void * xtp)
INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_RUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_rexmt) ||
!callout_active(&tp->t_timers->tt_rexmt)) {
@@ -572,6 +522,10 @@ tcp_timer_rexmt(void * xtp)
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+ KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
+ ("%s: tp %p rexmt callout should be running", __func__, tp));
tcp_free_sackholes(tp);
/*
* Retransmission timer went off. Message has not
@@ -800,10 +754,10 @@ out:
}
void
-tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
struct callout *t_callout;
- void *f_callout;
+ timeout_t *f_callout;
struct inpcb *inp = tp->t_inpcb;
int cpu = INP_CPU(inp);
@@ -812,6 +766,9 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
return;
#endif
+ if (tp->t_timers->tt_flags & TT_STOPPED)
+ return;
+
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
@@ -834,17 +791,26 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
f_callout = tcp_timer_2msl;
break;
default:
- panic("bad timer_type");
+ panic("tp %p bad timer_type %#x", tp, timer_type);
}
if (delta == 0) {
- callout_stop(t_callout);
+ if ((tp->t_timers->tt_flags & timer_type) &&
+ callout_stop(t_callout)) {
+ tp->t_timers->tt_flags &= ~timer_type;
+ }
} else {
- callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+ if ((tp->t_timers->tt_flags & timer_type) == 0) {
+ tp->t_timers->tt_flags |= timer_type;
+ callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+ } else {
+ /* Reset already running callout on the same CPU. */
+ callout_reset(t_callout, delta, f_callout, tp);
+ }
}
}
int
-tcp_timer_active(struct tcpcb *tp, int timer_type)
+tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
struct callout *t_callout;
@@ -865,11 +831,63 @@ tcp_timer_active(struct tcpcb *tp, int timer_type)
t_callout = &tp->t_timers->tt_2msl;
break;
default:
- panic("bad timer_type");
+ panic("tp %p bad timer_type %#x", tp, timer_type);
}
return callout_active(t_callout);
}
+void
+tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+{
+ struct callout *t_callout;
+ timeout_t *f_callout;
+
+ tp->t_timers->tt_flags |= TT_STOPPED;
+
+ switch (timer_type) {
+ case TT_DELACK:
+ t_callout = &tp->t_timers->tt_delack;
+ f_callout = tcp_timer_delack_discard;
+ break;
+ case TT_REXMT:
+ t_callout = &tp->t_timers->tt_rexmt;
+ f_callout = tcp_timer_rexmt_discard;
+ break;
+ case TT_PERSIST:
+ t_callout = &tp->t_timers->tt_persist;
+ f_callout = tcp_timer_persist_discard;
+ break;
+ case TT_KEEP:
+ t_callout = &tp->t_timers->tt_keep;
+ f_callout = tcp_timer_keep_discard;
+ break;
+ case TT_2MSL:
+ t_callout = &tp->t_timers->tt_2msl;
+ f_callout = tcp_timer_2msl_discard;
+ break;
+ default:
+ panic("tp %p bad timer_type %#x", tp, timer_type);
+ }
+
+ if (tp->t_timers->tt_flags & timer_type) {
+ if (callout_stop(t_callout)) {
+ tp->t_timers->tt_flags &= ~timer_type;
+ } else {
+ /*
+ * Can't stop the callout, defer tcpcb actual deletion
+ * to the last tcp timer discard callout.
+ * The TT_STOPPED flag will ensure that no tcp timer
+ * callouts can be restarted on our behalf, and
+ * past this point currently running callouts waiting
+ * on inp lock will return right away after the
+ * classical check for callout reset/stop events:
+ * callout_pending() || !callout_active()
+ */
+ callout_reset(t_callout, 1, f_callout, tp);
+ }
+ }
+}
+
#define ticks_to_msecs(t) (1000*(t) / hz)
void
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index f80e744..02a7782 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -146,12 +146,21 @@ struct tcp_timer {
struct callout tt_keep; /* keepalive */
struct callout tt_2msl; /* 2*msl TIME_WAIT timer */
struct callout tt_delack; /* delayed ACK timer */
+ uint32_t tt_flags; /* Timers flags */
+ uint32_t tt_spare; /* TDB */
};
-#define TT_DELACK 0x01
-#define TT_REXMT 0x02
-#define TT_PERSIST 0x04
-#define TT_KEEP 0x08
-#define TT_2MSL 0x10
+
+/*
+ * Flags for the tt_flags field.
+ */
+#define TT_DELACK 0x0001
+#define TT_REXMT 0x0002
+#define TT_PERSIST 0x0004
+#define TT_KEEP 0x0008
+#define TT_2MSL 0x0010
+#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
+
+#define TT_STOPPED 0x00010000
#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
@@ -183,6 +192,11 @@ void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
void tcp_timer_delack(void *xtp);
+void tcp_timer_2msl_discard(void *xtp);
+void tcp_timer_keep_discard(void *xtp);
+void tcp_timer_persist_discard(void *xtp);
+void tcp_timer_rexmt_discard(void *xtp);
+void tcp_timer_delack_discard(void *xtp);
void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
struct xtcp_timer *xtimer);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 1919146..61f01c6 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -718,8 +718,9 @@ void tcp_slowtimo(void);
struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
-void tcp_timer_activate(struct tcpcb *, int, u_int);
-int tcp_timer_active(struct tcpcb *, int);
+void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+int tcp_timer_active(struct tcpcb *, uint32_t);
+void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
OpenPOWER on IntegriCloud