author     jch <jch@FreeBSD.org>	2014-12-02 11:47:26 +0000
committer  jch <jch@FreeBSD.org>	2014-12-02 11:47:26 +0000
commit     0e426a7bb8fede30cf641e097dd3bd96e9f78970 (patch)
tree       3fca138675f4baf228082838ff54a37bcaf712e7 /sys/netinet
parent     f66f59b79490706a63f09914c0ef4ea4d649c419 (diff)
MFC r264321, r264342, r264351, r264356, r273850, r274629:
Currently, the TCP slow timer can starve TCP input processing while it
walks the list of connections in TIME_WAIT closing expired connections
due to contention on the global TCP pcbinfo lock.

To remediate, introduce a new global lock to protect the list of
connections in TIME_WAIT. Only acquire the TCP pcbinfo lock when closing
an expired connection. This limits the window of time when TCP input
processing is stopped to the amount of time needed to close a single
connection.

Approved by:	jhb (mentor)
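The change is easiest to see as a locking pattern: the TIME_WAIT queue gets its own cheap lock, and the heavyweight pcbinfo lock is taken only for the brief moment a single expired connection is torn down. Below is a minimal userspace sketch of that pattern, assuming pthread locks in place of the kernel's rwlock and INP_INFO primitives and invented names (tw_scan, tw_entry, pcbinfo_lock); it is an illustration of the idea, not the kernel code itself.

/*
 * Userspace sketch: a dedicated lock for the TIME_WAIT queue so the
 * heavyweight "pcbinfo" lock is only held while one expired entry is
 * closed.  All names here are hypothetical stand-ins.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>
#include <time.h>

struct tw_entry {
	TAILQ_ENTRY(tw_entry) link;
	time_t expire;			/* absolute expiry time */
};

static TAILQ_HEAD(twhead, tw_entry) twq = TAILQ_HEAD_INITIALIZER(twq);
static pthread_rwlock_t tw_lock = PTHREAD_RWLOCK_INITIALIZER;	/* protects twq only */
static pthread_mutex_t pcbinfo_lock = PTHREAD_MUTEX_INITIALIZER; /* the heavyweight lock */

/* Reap expired entries, taking the heavy lock one entry at a time. */
static void
tw_scan(void)
{
	struct tw_entry *e;

	for (;;) {
		/* Peek at the oldest entry under the cheap queue lock. */
		pthread_rwlock_rdlock(&tw_lock);
		e = TAILQ_FIRST(&twq);
		if (e == NULL || e->expire > time(NULL)) {
			pthread_rwlock_unlock(&tw_lock);
			break;
		}
		pthread_rwlock_unlock(&tw_lock);

		/*
		 * The heavyweight lock is held only while this single
		 * entry is removed and freed, not across the whole walk.
		 */
		pthread_mutex_lock(&pcbinfo_lock);
		pthread_rwlock_wrlock(&tw_lock);
		TAILQ_REMOVE(&twq, e, link);
		pthread_rwlock_unlock(&tw_lock);
		pthread_mutex_unlock(&pcbinfo_lock);
		free(e);
	}
}

int
main(void)
{
	struct tw_entry *e = calloc(1, sizeof(*e));

	e->expire = time(NULL) - 1;	/* already expired */
	TAILQ_INSERT_TAIL(&twq, e, link);
	tw_scan();
	printf("queue empty: %d\n", TAILQ_EMPTY(&twq));
	return (0);
}

In the sketch the queue is walked only under tw_lock, so work that needs pcbinfo_lock is blocked for at most one removal at a time, which is the window-narrowing effect the commit message describes. The real tcp_tw_2msl_scan() additionally pins the entry with an inpcb reference before dropping the timewait lock; this single-threaded sketch omits that step.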
Diffstat (limited to 'sys/netinet')
-rw-r--r--  sys/netinet/tcp_timer.c       2
-rw-r--r--  sys/netinet/tcp_timer.h       2
-rw-r--r--  sys/netinet/tcp_timewait.c  155
-rw-r--r--  sys/netinet/tcp_usrreq.c     15
-rw-r--r--  sys/netinet/tcp_var.h         2
5 files changed, 147 insertions, 29 deletions
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 00fcb22..80ec61c 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -195,9 +195,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_WLOCK(&V_tcbinfo);
(void) tcp_tw_2msl_scan(0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index 3115fb3..f80e744 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -178,7 +178,7 @@ extern int tcp_fast_finwait2_recycle;
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
- tcp_tw_2msl_scan(int _reuse); /* XXX temporary */
+ tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index b3d7ae6..e82f6f7 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -91,20 +91,40 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
static VNET_DEFINE(uma_zone_t, tcptw_zone);
-#define V_tcptw_zone VNET(tcptw_zone)
+#define V_tcptw_zone VNET(tcptw_zone)
static int maxtcptw;
/*
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
- * tcbinfo lock, which must be held over queue iteration and modification.
+ * timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ * - a inpcb is always freed _after_ its tcptw
+ * - a tcptw relies on its inpcb reference counting for memory stability
+ * - a tcptw is dereferenceable only while its inpcb is locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
-#define V_twq_2msl VNET(twq_2msl)
+#define V_twq_2msl VNET(twq_2msl)
+
+/* Global timewait lock */
+static VNET_DEFINE(struct rwlock, tw_lock);
+#define V_tw_lock VNET(tw_lock)
+
+#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
+#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
+#define TW_RLOCK(tw) rw_rlock(&(tw))
+#define TW_WLOCK(tw) rw_wlock(&(tw))
+#define TW_RUNLOCK(tw) rw_runlock(&(tw))
+#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
+#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
+#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
+#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
+#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
static void tcp_tw_2msl_reset(struct tcptw *, int);
-static void tcp_tw_2msl_stop(struct tcptw *);
+static void tcp_tw_2msl_stop(struct tcptw *, int);
static int tcp_twrespond(struct tcptw *, int);
static int
@@ -172,6 +192,7 @@ tcp_tw_init(void)
else
uma_zone_set_max(V_tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
+ TW_LOCK_INIT(V_tw_lock, "tcptw");
}
#ifdef VIMAGE
@@ -181,10 +202,11 @@ tcp_tw_destroy(void)
struct tcptw *tw;
INP_INFO_WLOCK(&V_tcbinfo);
- while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
+ while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
}
#endif
@@ -205,7 +227,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait) {
@@ -230,6 +252,14 @@ tcp_twstart(struct tcpcb *tp)
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
if (tw == NULL) {
+ /*
+ * Reached limit on total number of TIMEWAIT connections
+ * allowed. Remove a connection from TIMEWAIT queue in LRU
+ * fashion to make room for this connection.
+ *
+ * pcbinfo lock is needed here to prevent deadlock as
+ * two inpcb locks can be acquired simultaneously.
+ */
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
@@ -238,7 +268,12 @@ tcp_twstart(struct tcpcb *tp)
return;
}
}
+ /*
+ * The tcptw will hold a reference on its inpcb until tcp_twclose
+ * is called
+ */
tw->tw_inpcb = inp;
+ in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@@ -321,7 +356,7 @@ tcp_twstart(struct tcpcb *tp)
* Most other new OSes use semi-randomized ISN values, so we
* do not need to worry about them.
*/
-#define MS_ISN_BYTES_PER_SECOND 250000
+#define MS_ISN_BYTES_PER_SECOND 250000
/*
* Determine if the ISN we will generate has advanced beyond the last
@@ -357,7 +392,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
int thflags;
tcp_seq seq;
- /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
@@ -459,11 +493,10 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
- tw->tw_inpcb = NULL;
- tcp_tw_2msl_stop(tw);
+ tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -492,14 +525,14 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
- } else
+ } else {
+ /*
+ * The socket has been already cleaned-up for us, only free the
+ * inpcb.
+ */
in_pcbfree(inp);
+ }
TCPSTAT_INC(tcps_closed);
- crfree(tw->tw_cred);
- tw->tw_cred = NULL;
- if (reuse)
- return;
- uma_zfree(V_tcptw_zone, tw);
}
static int
@@ -617,34 +650,106 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
+
+ TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+ TW_WUNLOCK(V_tw_lock);
}
static void
-tcp_tw_2msl_stop(struct tcptw *tw)
+tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
+ struct ucred *cred;
+ struct inpcb *inp;
+ int released;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ TW_WLOCK(V_tw_lock);
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
+
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+ cred = tw->tw_cred;
+ tw->tw_cred = NULL;
+ TW_WUNLOCK(V_tw_lock);
+
+ if (cred != NULL)
+ crfree(cred);
+
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp should not be released here", __func__));
+
+ if (!reuse)
+ uma_zfree(V_tcptw_zone, tw);
}
struct tcptw *
tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
+ struct inpcb *inp;
+
+#ifdef INVARIANTS
+ if (reuse) {
+ /*
+ * pcbinfo lock is needed in reuse case to prevent deadlock
+ * as two inpcb locks can be acquired simultaneously:
+ * - the inpcb transitioning to TIME_WAIT state in
+ * tcp_tw_start(),
+ * - the inpcb closed by tcp_twclose().
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
for (;;) {
+ TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+ if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
+ TW_RUNLOCK(V_tw_lock);
break;
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, reuse);
- if (reuse)
- return (tw);
+ }
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
+
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
+ TW_RUNLOCK(V_tw_lock);
+
+ if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp "
+ "reference but tw not NULL", __func__));
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ tcp_twclose(tw, reuse);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (reuse)
+ return tw;
+ } else {
+ /* INP_INFO lock is busy, continue later. */
+ INP_WLOCK(inp);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
+ break;
+ }
}
- return (NULL);
+
+ return NULL;
}
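The new tcp_tw_2msl_scan() above also changes how contention is handled: it takes a reference on the inpcb while holding only the timewait lock, drops that lock, and then merely try-locks the pcbinfo lock, backing off until the next timer run if it is busy instead of blocking. A rough userspace analogue of that reference-then-trylock back-off, again with hypothetical names and pthread mutexes standing in for the kernel locks:

/*
 * Userspace sketch of the back-off pattern: pin the entry with a
 * reference, then only *try* to take the heavyweight lock, giving up
 * on contention rather than stalling the timer.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
	pthread_mutex_t lock;		/* per-entry lock, like an inpcb lock */
	int refcount;			/* modeled after in_pcbref()/in_pcbrele() */
	bool closed;
};

static pthread_mutex_t heavy_lock = PTHREAD_MUTEX_INITIALIZER;	/* the "pcbinfo" lock */

/* Try to close one expired entry; return false if we must retry later. */
static bool
try_expire(struct entry *e)
{
	/* The reference keeps the entry stable once the queue lock is dropped. */
	pthread_mutex_lock(&e->lock);
	e->refcount++;
	pthread_mutex_unlock(&e->lock);

	if (pthread_mutex_trylock(&heavy_lock) != 0) {
		/* Heavy lock is busy: drop the reference and retry later. */
		pthread_mutex_lock(&e->lock);
		e->refcount--;
		pthread_mutex_unlock(&e->lock);
		return (false);
	}

	/* Heavy lock first, then the per-entry lock: fixed lock order. */
	pthread_mutex_lock(&e->lock);
	e->closed = true;		/* stands in for tcp_twclose() */
	e->refcount--;
	pthread_mutex_unlock(&e->lock);
	pthread_mutex_unlock(&heavy_lock);
	return (true);
}

int
main(void)
{
	struct entry e = { .lock = PTHREAD_MUTEX_INITIALIZER, .refcount = 1, .closed = false };

	printf("closed: %d\n", try_expire(&e));
	return (0);
}

Acquiring the heavy lock before the per-entry lock mirrors the pcbinfo-before-inpcb ordering that the in-tree comments give as the reason the reuse path still asserts the pcbinfo lock.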
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 8435791..8d92803 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -182,6 +182,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
+ *
+ * Astute question indeed, from twtcp perspective there are
+ * three cases to consider:
+ *
+ * #1 tcp_detach is called at tcptw creation time by
+ * tcp_twstart, then do not discard the newly created tcptw
+ * and leave inpcb present until timewait ends
+ * #2 tcp_detach is called at timewait end (or reuse) by
+ * tcp_twclose, then the tcptw has already been discarded
+ * and inpcb is freed here
+ * #3 tcp_detach is called() after timewait ends (or reuse)
+ * (e.g. by soclose), then tcptw has already been discarded
+ * and inpcb is freed here
+ *
+ * In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 72bb7ae..1919146 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -663,7 +663,7 @@ void tcp_twstart(struct tcpcb *);
#if 0
int tcp_twrecycleable(struct tcptw *tw);
#endif
-void tcp_twclose(struct tcptw *_tw, int _reuse);
+void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *