summaryrefslogtreecommitdiffstats
path: root/sys/netinet
diff options
context:
space:
mode:
authorjch <jch@FreeBSD.org>2014-10-30 08:53:56 +0000
committerjch <jch@FreeBSD.org>2014-10-30 08:53:56 +0000
commit5630210a7f1dbbd903b77b2aef939cd47c63da58 (patch)
tree2136568ca3c9519f2148245ea43ee938f0b3bfea /sys/netinet
parentd140ecd680057af241267a4e27144a9c980fc8cf (diff)
downloadFreeBSD-src-5630210a7f1dbbd903b77b2aef939cd47c63da58.zip
FreeBSD-src-5630210a7f1dbbd903b77b2aef939cd47c63da58.tar.gz
Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and
tcp_tw_2msl_scan(). This race condition drives unplanned timewait timeout cancellation. Also simplify implementation by holding inpcb reference and removing tcptw reference counting. Differential Revision: https://reviews.freebsd.org/D826 Submitted by: Marc De la Gueronniere <mdelagueronniere@verisign.com> Submitted by: jch Reviewed By: jhb (mentor), adrian, rwatson Sponsored by: Verisign, Inc. MFC after: 2 weeks X-MFC-With: r264321
Diffstat (limited to 'sys/netinet')
-rw-r--r--sys/netinet/tcp_timer.c2
-rw-r--r--sys/netinet/tcp_timer.h3
-rw-r--r--sys/netinet/tcp_timewait.c138
-rw-r--r--sys/netinet/tcp_usrreq.c15
-rw-r--r--sys/netinet/tcp_var.h3
5 files changed, 93 insertions, 68 deletions
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 634c676..5d379ea 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -243,7 +243,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- tcp_tw_2msl_scan();
+ (void) tcp_tw_2msl_scan(0);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index c04723a..f80e744 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle;
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
- tcp_tw_2msl_reuse(void); /* XXX temporary? */
-void tcp_tw_2msl_scan(void);
+ tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 33555d9..2bdc107 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
-#include <sys/refcount.h>
#include <vm/uma.h>
@@ -101,6 +100,11 @@ static int maxtcptw;
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ * - a inpcb is always freed _after_ its tcptw
+ * - a tcptw relies on its inpcb reference counting for memory stability
+ * - a tcptw is dereferenceable only while its inpcb is locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
@@ -124,32 +128,6 @@ static void tcp_tw_2msl_reset(struct tcptw *, int);
static void tcp_tw_2msl_stop(struct tcptw *, int);
static int tcp_twrespond(struct tcptw *, int);
-/*
- * tw_pcbref() bumps the reference count on an tw in order to maintain
- * stability of an tw pointer despite the tw lock being released.
- */
-static void
-tw_pcbref(struct tcptw *tw)
-{
-
- KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
- refcount_acquire(&tw->tw_refcount);
-}
-
-/*
- * Drop a refcount on an tw elevated using tw_pcbref().
- */
-static int
-tw_pcbrele(struct tcptw *tw)
-{
-
- KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
- if (!refcount_release(&tw->tw_refcount))
- return (0);
- uma_zfree(V_tcptw_zone, tw);
- return (1);
-}
-
static int
tcptw_auto_size(void)
{
@@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp)
* Reached limit on total number of TIMEWAIT connections
* allowed. Remove a connection from TIMEWAIT queue in LRU
* fashion to make room for this connection.
+ *
+ * pcbinfo lock is needed here to prevent deadlock as
+ * two inpcb locks can be acquired simultaneously.
*/
- tw = tcp_tw_2msl_reuse();
+ tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
if (tp != NULL)
@@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp)
return;
}
}
+ /*
+ * The tcptw will hold a reference on its inpcb until tcp_twclose
+ * is called
+ */
tw->tw_inpcb = inp;
- refcount_init(&tw->tw_refcount, 1);
+ in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
- tw->tw_inpcb = NULL;
tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
- } else
+ } else {
+ /*
+ * The socket has been already cleaned-up for us, only free the
+ * inpcb.
+ */
in_pcbfree(inp);
+ }
TCPSTAT_INC(tcps_closed);
}
@@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
static void
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
+ struct ucred *cred;
+ struct inpcb *inp;
+ int released;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
+
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
- crfree(tw->tw_cred);
+ cred = tw->tw_cred;
tw->tw_cred = NULL;
TW_WUNLOCK(V_tw_lock);
+ if (cred != NULL)
+ crfree(cred);
+
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp should not be released here", __func__));
+
if (!reuse)
- tw_pcbrele(tw);
+ uma_zfree(V_tcptw_zone, tw);
}
struct tcptw *
-tcp_tw_2msl_reuse(void)
+tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
+ struct inpcb *inp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
- TW_WLOCK(V_tw_lock);
- tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL) {
- TW_WUNLOCK(V_tw_lock);
- return NULL;
+#ifdef INVARIANTS
+ if (reuse) {
+ /*
+ * pcbinfo lock is needed in reuse case to prevent deadlock
+ * as two inpcb locks can be acquired simultaneously:
+ * - the inpcb transitioning to TIME_WAIT state in
+ * tcp_tw_start(),
+ * - the inpcb closed by tcp_twclose().
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
}
- TW_WUNLOCK(V_tw_lock);
-
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, 1);
-
- return (tw);
-}
-
-void
-tcp_tw_2msl_scan(void)
-{
- struct tcptw *tw;
+#endif
for (;;) {
TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL || tw->tw_time - ticks > 0) {
+ if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
TW_RUNLOCK(V_tw_lock);
break;
}
- tw_pcbref(tw);
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
+
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
- /* Close timewait state */
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
- if (tw_pcbrele(tw)) {
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp "
+ "reference but tw not NULL", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
continue;
}
- KASSERT(tw->tw_inpcb != NULL,
- ("%s: tw->tw_inpcb == NULL", __func__));
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, 0);
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ tcp_twclose(tw, reuse);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (reuse)
+ return tw;
} else {
- /* INP_INFO lock is busy; continue later. */
- tw_pcbrele(tw);
+ /* INP_INFO lock is busy, continue later. */
+ INP_WLOCK(inp);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
break;
}
}
+
+ return NULL;
}
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 17efea4..2820ffb 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
+ *
+ * Astute question indeed, from twtcp perspective there are
+ * three cases to consider:
+ *
+ * #1 tcp_detach is called at tcptw creation time by
+ * tcp_twstart, then do not discard the newly created tcptw
+ * and leave inpcb present until timewait ends
+ * #2 tcp_detach is called at timewait end (or reuse) by
+ * tcp_twclose, then the tcptw has already been discarded
+ * and inpcb is freed here
+ * #3 tcp_detach is called() after timewait ends (or reuse)
+ * (e.g. by soclose), then tcptw has already been discarded
+ * and inpcb is freed here
+ *
+ * In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 9fe6447..84ac8f0 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -358,7 +358,6 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
- u_int tw_refcount; /* refcount */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
@@ -651,7 +650,7 @@ struct tcpcb *
tcp_close(struct tcpcb *);
void tcp_discardcb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
-void tcp_twclose(struct tcptw *_tw, int _reuse);
+void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *
OpenPOWER on IntegriCloud