summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--sys/netinet/tcp_input.c15
-rw-r--r--sys/netinet/tcp_reass.c15
-rw-r--r--sys/netinet/tcp_syncache.c539
-rw-r--r--sys/netinet/tcp_var.h15
4 files changed, 312 insertions, 272 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 90957ce..70b524c 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -973,18 +973,18 @@ findpcb:
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
tcp_dooptions(&to, optp, optlen, 1);
- if (!syncache_add(&inc, &to, th, &so, m))
- goto drop;
+ if (!syncache_add(&inc, &to, th, inp, &so, m))
+ goto drop; /* XXX: does not happen */
if (so == NULL) {
/*
* Entry added to syncache, mbuf used to
- * send SYN,ACK packet.
+ * send SYN,ACK packet. Everything unlocked
+ * already.
*/
- KASSERT(headlocked, ("headlocked"));
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
return;
}
+ panic("T/TCP not supported at the moment");
+#if 0 /* T/TCP */
/*
* Segment passed TAO tests.
* XXX: Can't happen at the moment.
@@ -1011,6 +1011,7 @@ findpcb:
tcpstat.tcps_connects++;
soisconnected(so);
goto trimthenstep6;
+#endif /* T/TCP */
}
goto drop;
}
@@ -1437,7 +1438,9 @@ after_listen:
tp->t_state = TCPS_SYN_RECEIVED;
}
+#if 0 /* T/TCP */
trimthenstep6:
+#endif
KASSERT(headlocked, ("tcp_input: trimthenstep6: head not "
"locked"));
INP_LOCK_ASSERT(inp);
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index 90957ce..70b524c 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -973,18 +973,18 @@ findpcb:
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
tcp_dooptions(&to, optp, optlen, 1);
- if (!syncache_add(&inc, &to, th, &so, m))
- goto drop;
+ if (!syncache_add(&inc, &to, th, inp, &so, m))
+ goto drop; /* XXX: does not happen */
if (so == NULL) {
/*
* Entry added to syncache, mbuf used to
- * send SYN,ACK packet.
+ * send SYN,ACK packet. Everything unlocked
+ * already.
*/
- KASSERT(headlocked, ("headlocked"));
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
return;
}
+ panic("T/TCP not supported at the moment");
+#if 0 /* T/TCP */
/*
* Segment passed TAO tests.
* XXX: Can't happen at the moment.
@@ -1011,6 +1011,7 @@ findpcb:
tcpstat.tcps_connects++;
soisconnected(so);
goto trimthenstep6;
+#endif /* T/TCP */
}
goto drop;
}
@@ -1437,7 +1438,9 @@ after_listen:
tp->t_state = TCPS_SYN_RECEIVED;
}
+#if 0 /* T/TCP */
trimthenstep6:
+#endif
KASSERT(headlocked, ("tcp_input: trimthenstep6: head not "
"locked"));
INP_LOCK_ASSERT(inp);
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 2ba6808..a15464b 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2001 McAfee, Inc.
+ * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jonathan Lemon
@@ -42,12 +43,15 @@
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mac.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/random.h>
+#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -110,14 +114,11 @@ static void syncache_drop(struct syncache *, struct syncache_head *);
static void syncache_free(struct syncache *);
static void syncache_insert(struct syncache *, struct syncache_head *);
struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
-#ifdef TCPDEBUG
-static int syncache_respond(struct syncache *, struct mbuf *, struct socket *);
-#else
static int syncache_respond(struct syncache *, struct mbuf *);
-#endif
static struct socket *syncache_socket(struct syncache *, struct socket *,
struct mbuf *m);
static void syncache_timer(void *);
+static void syncookie_init(void);
static u_int32_t syncookie_generate(struct syncache *, u_int32_t *);
static struct syncache *syncookie_lookup(struct in_conninfo *,
struct tcphdr *, struct socket *);
@@ -139,12 +140,10 @@ struct tcp_syncache {
u_int hashsize;
u_int hashmask;
u_int bucket_limit;
- u_int cache_count;
+ u_int cache_count; /* XXX: unprotected */
u_int cache_limit;
u_int rexmt_limit;
u_int hash_secret;
- TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
- struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
};
static struct tcp_syncache tcp_syncache;
@@ -188,16 +187,25 @@ static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
-#define SYNCACHE_TIMEOUT(sc, slot) do { \
- sc->sc_rxtslot = (slot); \
- sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[(slot)]; \
- TAILQ_INSERT_TAIL(&tcp_syncache.timerq[(slot)], sc, sc_timerq); \
- if (!callout_active(&tcp_syncache.tt_timerq[(slot)])) \
- callout_reset(&tcp_syncache.tt_timerq[(slot)], \
- TCPTV_RTOBASE * tcp_backoff[(slot)], \
- syncache_timer, (void *)((intptr_t)(slot))); \
+#define SYNCACHE_TIMEOUT(sc, sch, co) do { \
+ (sc)->sc_rxmits++; \
+ (sc)->sc_rxttime = ticks + \
+ TCPTV_RTOBASE * tcp_backoff[(sc)->sc_rxmits - 1]; \
+ if ((sch)->sch_nextc > (sc)->sc_rxttime) \
+ (sch)->sch_nextc = (sc)->sc_rxttime; \
+ if (!TAILQ_EMPTY(&(sch)->sch_bucket) && !(co)) \
+ callout_reset(&(sch)->sch_timer, \
+ (sch)->sch_nextc - ticks, \
+ syncache_timer, (void *)(sch)); \
} while (0)
+#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx)
+#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx)
+#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED)
+
+/*
+ * Requires the syncache entry to be already removed from the bucket list.
+ */
static void
syncache_free(struct syncache *sc)
{
@@ -215,15 +223,11 @@ syncache_init(void)
tcp_syncache.cache_count = 0;
tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
- tcp_syncache.cache_limit =
- tcp_syncache.hashsize * tcp_syncache.bucket_limit;
tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
tcp_syncache.hash_secret = arc4random();
TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
&tcp_syncache.hashsize);
- TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
- &tcp_syncache.cache_limit);
TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
&tcp_syncache.bucket_limit);
if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) {
@@ -232,6 +236,12 @@ syncache_init(void)
}
tcp_syncache.hashmask = tcp_syncache.hashsize - 1;
+ /* Set limits. */
+ tcp_syncache.cache_limit =
+ tcp_syncache.hashsize * tcp_syncache.bucket_limit;
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
+ &tcp_syncache.cache_limit);
+
/* Allocate the hash table. */
MALLOC(tcp_syncache.hashbase, struct syncache_head *,
tcp_syncache.hashsize * sizeof(struct syncache_head),
@@ -240,164 +250,127 @@ syncache_init(void)
/* Initialize the hash buckets. */
for (i = 0; i < tcp_syncache.hashsize; i++) {
TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
+ mtx_init(&tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
+ NULL, MTX_DEF);
+ callout_init_mtx(&tcp_syncache.hashbase[i].sch_timer,
+ &tcp_syncache.hashbase[i].sch_mtx, 0);
tcp_syncache.hashbase[i].sch_length = 0;
}
- /* Initialize the timer queues. */
- for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
- TAILQ_INIT(&tcp_syncache.timerq[i]);
- callout_init(&tcp_syncache.tt_timerq[i], NET_CALLOUT_MPSAFE);
- }
+ syncookie_init();
- /*
- * Allocate the syncache entries. Allow the zone to allocate one
- * more entry than cache limit, so a new entry can bump out an
- * older one.
- */
+ /* Create the syncache entry zone. */
tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
- tcp_syncache.cache_limit -= 1;
}
+/*
+ * Inserts a syncache entry into the specified bucket row.
+ * Locks and unlocks the syncache_head autonomously.
+ */
static void
syncache_insert(sc, sch)
struct syncache *sc;
struct syncache_head *sch;
{
struct syncache *sc2;
- int i;
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ SCH_LOCK(sch);
/*
- * Make sure that we don't overflow the per-bucket
- * limit or the total cache size limit.
+ * Make sure that we don't overflow the per-bucket limit.
+ * If the bucket is full, toss the oldest element.
*/
if (sch->sch_length >= tcp_syncache.bucket_limit) {
- /*
- * The bucket is full, toss the oldest element.
- */
- sc2 = TAILQ_FIRST(&sch->sch_bucket);
- sc2->sc_tp->ts_recent = ticks;
+ KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
+ ("sch->sch_length incorrect"));
+ sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
syncache_drop(sc2, sch);
tcpstat.tcps_sc_bucketoverflow++;
- } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
- /*
- * The cache is full. Toss the oldest entry in the
- * entire cache. This is the front entry in the
- * first non-empty timer queue with the largest
- * timeout value.
- */
- for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
- sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
- if (sc2 != NULL)
- break;
- }
- sc2->sc_tp->ts_recent = ticks;
- syncache_drop(sc2, NULL);
- tcpstat.tcps_sc_cacheoverflow++;
}
- /* Initialize the entry's timer. */
- SYNCACHE_TIMEOUT(sc, 0);
-
/* Put it into the bucket. */
- TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
+ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
sch->sch_length++;
+
+ /* Reinitialize the bucket row's timer. */
+ SYNCACHE_TIMEOUT(sc, sch, 1);
+
+ SCH_UNLOCK(sch);
+
tcp_syncache.cache_count++;
tcpstat.tcps_sc_added++;
}
+/*
+ * Remove and free entry from syncache bucket row.
+ * Expects locked syncache head.
+ */
static void
syncache_drop(sc, sch)
struct syncache *sc;
struct syncache_head *sch;
{
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- if (sch == NULL) {
-#ifdef INET6
- if (sc->sc_inc.inc_isipv6) {
- sch = &tcp_syncache.hashbase[
- SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
- } else
-#endif
- {
- sch = &tcp_syncache.hashbase[
- SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
- }
- }
+ SCH_LOCK_ASSERT(sch);
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
- tcp_syncache.cache_count--;
-
- TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
- if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
- callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
syncache_free(sc);
+ tcp_syncache.cache_count--;
}
/*
* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
* If we have retransmitted an entry the maximum number of times, expire it.
+ * One separate timer for each bucket row.
*/
static void
-syncache_timer(xslot)
- void *xslot;
+syncache_timer(xsch)
+ void *xsch;
{
- intptr_t slot = (intptr_t)xslot;
+ struct syncache_head *sch = (struct syncache_head *)xsch;
struct syncache *sc, *nsc;
- struct inpcb *inp;
+ int tick = ticks;
- INP_INFO_WLOCK(&tcbinfo);
- if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
- !callout_active(&tcp_syncache.tt_timerq[slot])) {
- /* XXX can this happen? */
- INP_INFO_WUNLOCK(&tcbinfo);
- return;
- }
- callout_deactivate(&tcp_syncache.tt_timerq[slot]);
+ /* NB: syncache_head has already been locked by the callout. */
+ SCH_LOCK_ASSERT(sch);
- nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
- while (nsc != NULL) {
- if (ticks < nsc->sc_rxttime)
- break;
- sc = nsc;
- inp = sc->sc_tp->t_inpcb;
- if (slot == SYNCACHE_MAXREXMTS ||
- slot >= tcp_syncache.rexmt_limit ||
- inp == NULL || inp->inp_gencnt != sc->sc_inp_gencnt) {
- nsc = TAILQ_NEXT(sc, sc_timerq);
- syncache_drop(sc, NULL);
+ TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
+ /*
+ * We do not check if the listen socket still exists
+ * and accept the case where the listen socket may be
+ * gone by the time we resend the SYN/ACK. We do
+ * not expect this to happen often. If it does,
+ * then the RST will be sent by the time the remote
+ * host does the SYN/ACK->ACK.
+ */
+ if (sc->sc_rxttime >= tick) {
+ if (sc->sc_rxttime < sch->sch_nextc)
+ sch->sch_nextc = sc->sc_rxttime;
+ continue;
+ }
+
+ if (sc->sc_rxmits > tcp_syncache.rexmt_limit) {
+ syncache_drop(sc, sch);
tcpstat.tcps_sc_stale++;
continue;
}
- /*
- * syncache_respond() may call back into the syncache to
- * to modify another entry, so do not obtain the next
- * entry on the timer chain until it has completed.
- */
-#ifdef TCPDEBUG
- (void) syncache_respond(sc, NULL, NULL);
-#else
+
(void) syncache_respond(sc, NULL);
-#endif
- nsc = TAILQ_NEXT(sc, sc_timerq);
tcpstat.tcps_sc_retransmitted++;
- TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
- SYNCACHE_TIMEOUT(sc, slot + 1);
+ SYNCACHE_TIMEOUT(sc, sch, 0);
}
- if (nsc != NULL)
- callout_reset(&tcp_syncache.tt_timerq[slot],
- nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
- INP_INFO_WUNLOCK(&tcbinfo);
+ if (!TAILQ_EMPTY(&(sch)->sch_bucket))
+ callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
+ syncache_timer, (void *)(sch));
}
/*
* Find an entry in the syncache.
+ * Returns always with locked syncache_head plus a matching entry or NULL.
*/
struct syncache *
syncache_lookup(inc, schp)
@@ -407,13 +380,15 @@ syncache_lookup(inc, schp)
struct syncache *sc;
struct syncache_head *sch;
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
-
#ifdef INET6
if (inc->inc_isipv6) {
sch = &tcp_syncache.hashbase[
SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
*schp = sch;
+
+ SCH_LOCK(sch);
+
+ /* Circle through bucket row to find matching entry. */
TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
return (sc);
@@ -424,6 +399,10 @@ syncache_lookup(inc, schp)
sch = &tcp_syncache.hashbase[
SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
*schp = sch;
+
+ SCH_LOCK(sch);
+
+ /* Circle through bucket row to find matching entry. */
TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
#ifdef INET6
if (sc->sc_inc.inc_isipv6)
@@ -433,7 +412,8 @@ syncache_lookup(inc, schp)
return (sc);
}
}
- return (NULL);
+ SCH_LOCK_ASSERT(*schp);
+ return (NULL); /* always returns with locked sch */
}
/*
@@ -449,11 +429,11 @@ syncache_chkrst(inc, th)
struct syncache *sc;
struct syncache_head *sch;
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
-
- sc = syncache_lookup(inc, &sch);
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
if (sc == NULL)
- return;
+ goto done;
+
/*
* If the RST bit is set, check the sequence number to see
* if this is a valid reset segment.
@@ -472,6 +452,8 @@ syncache_chkrst(inc, th)
syncache_drop(sc, sch);
tcpstat.tcps_sc_reset++;
}
+done:
+ SCH_UNLOCK(sch);
}
void
@@ -481,13 +463,13 @@ syncache_badack(inc)
struct syncache *sc;
struct syncache_head *sch;
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
-
- sc = syncache_lookup(inc, &sch);
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
if (sc != NULL) {
syncache_drop(sc, sch);
tcpstat.tcps_sc_badack++;
}
+ SCH_UNLOCK(sch);
}
void
@@ -498,15 +480,14 @@ syncache_unreach(inc, th)
struct syncache *sc;
struct syncache_head *sch;
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
-
- sc = syncache_lookup(inc, &sch);
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
if (sc == NULL)
- return;
+ goto done;
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl(th->th_seq) != sc->sc_iss)
- return;
+ goto done;
/*
 * If we've retransmitted 3 times and this is our second error,
@@ -516,12 +497,14 @@ syncache_unreach(inc, th)
*
* See tcp_notify().
*/
- if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
+ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
sc->sc_flags |= SCF_UNREACH;
- return;
+ goto done;
}
syncache_drop(sc, sch);
tcpstat.tcps_sc_unreach++;
+done:
+ SCH_UNLOCK(sch);
}
/*
@@ -564,9 +547,7 @@ syncache_socket(sc, lso, m)
inp = sotoinpcb(so);
INP_LOCK(inp);
- /*
- * Insert new socket into hash list.
- */
+ /* Insert new socket into PCB hash list. */
inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
@@ -665,7 +646,6 @@ syncache_socket(sc, lso, m)
goto abort;
}
}
-
tp = intotcpcb(inp);
tp->t_state = TCPS_SYN_RECEIVED;
tp->iss = sc->sc_iss;
@@ -698,6 +678,7 @@ syncache_socket(sc, lso, m)
tp->sack_enable = 1;
tp->t_flags |= TF_SACK_PERMIT;
}
+
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
@@ -707,7 +688,7 @@ syncache_socket(sc, lso, m)
/*
* If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
*/
- if (sc->sc_rxtslot != 0)
+ if (sc->sc_rxmits > 1)
tp->snd_cwnd = tp->t_maxseg;
callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
@@ -732,19 +713,24 @@ abort2:
* the SYN-RECEIVED state.
*/
int
-syncache_expand(inc, th, sop, m)
+syncache_expand(inc, th, lsop, m)
struct in_conninfo *inc;
struct tcphdr *th;
- struct socket **sop;
+ struct socket **lsop;
struct mbuf *m;
{
struct syncache *sc;
struct syncache_head *sch;
struct socket *so;
+ /*
+ * Global TCP locks are held because we manipulate the PCB lists
+ * and create a new socket.
+ */
INP_INFO_WLOCK_ASSERT(&tcbinfo);
- sc = syncache_lookup(inc, &sch);
+ sc = syncache_lookup(inc, &sch); /* returns locked sch */
+ SCH_LOCK_ASSERT(sch);
if (sc == NULL) {
/*
* There is no syncache entry, so see if this ACK is
@@ -755,25 +741,30 @@ syncache_expand(inc, th, sop, m)
* B. check that the syncookie is valid. If it is, then
* cobble up a fake syncache entry, and return.
*/
+ SCH_UNLOCK(sch);
+ sch = NULL;
+
if (!tcp_syncookies)
- return (0);
- sc = syncookie_lookup(inc, th, *sop);
+ goto failed;
+ sc = syncookie_lookup(inc, th, *lsop);
if (sc == NULL)
- return (0);
- sch = NULL;
+ goto failed;
tcpstat.tcps_sc_recvcookie++;
+ } else {
+ /* Pull out the entry to unlock the bucket row. */
+ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
+ sch->sch_length--;
+ SCH_UNLOCK(sch);
}
/*
* If seg contains an ACK, but not for our SYN/ACK, send a RST.
*/
- if (th->th_ack != sc->sc_iss + 1) {
- if (sch == NULL)
- syncache_free(sc);
- return (0);
- }
+ if (th->th_ack != sc->sc_iss + 1)
+ goto failed;
+
+ so = syncache_socket(sc, *lsop, m);
- so = syncache_socket(sc, *sop, m);
if (so == NULL) {
#if 0
resetandabort:
@@ -781,17 +772,23 @@ resetandabort:
(void) tcp_respond(NULL, m, m, th,
th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
#endif
- m_freem(m); /* XXX only needed for above */
+ m_freem(m); /* XXX: only needed for above */
tcpstat.tcps_sc_aborted++;
+ if (sch != NULL) {
+ syncache_insert(sc, sch); /* try again later */
+ sc = NULL;
+ }
+ goto failed;
} else
tcpstat.tcps_sc_completed++;
+ *lsop = so;
- if (sch == NULL)
- syncache_free(sc);
- else
- syncache_drop(sc, sch);
- *sop = so;
+ syncache_free(sc);
return (1);
+failed:
+ if (sc != NULL)
+ syncache_free(sc);
+ return (0);
}
/*
@@ -808,11 +805,12 @@ resetandabort:
* the data, we avoid this DoS scenario.
*/
int
-syncache_add(inc, to, th, sop, m)
+syncache_add(inc, to, th, inp, lsop, m)
struct in_conninfo *inc;
struct tcpopt *to;
struct tcphdr *th;
- struct socket **sop;
+ struct inpcb *inp;
+ struct socket **lsop;
struct mbuf *m;
{
struct tcpcb *tp;
@@ -821,13 +819,37 @@ syncache_add(inc, to, th, sop, m)
struct syncache_head *sch;
struct mbuf *ipopts = NULL;
u_int32_t flowtmp;
- int i, win;
+ int win, autoflowlabel = 0;
+ int sb_hiwat, ip_ttl, ip_tos;
INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp); /* listen socket */
- so = *sop;
+ /*
+ * Combine all so/tp operations very early to drop the INP lock as
+ * soon as possible.
+ */
+ so = *lsop;
tp = sototcpcb(so);
+#ifdef INET6
+ if (inc->inc_isipv6 &&
+ (inp->in6p_flags & IN6P_AUTOFLOWLABEL))
+ autoflowlabel = 1;
+#endif
+ ip_ttl = inp->inp_ip_ttl;
+ ip_tos = inp->inp_ip_tos;
+ win = sbspace(&so->so_rcv);
+ sb_hiwat = so->so_rcv.sb_hiwat;
+ if (tp->t_flags & TF_NOOPT)
+ sc->sc_flags = SCF_NOOPT;
+
+ so = NULL;
+ tp = NULL;
+
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
/*
* Remember the IP options, if any.
*/
@@ -844,7 +866,8 @@ syncache_add(inc, to, th, sop, m)
* should the syncache be re-initialized with the contents
* of the new SYN here (which may have different options?)
*/
- sc = syncache_lookup(inc, &sch);
+ sc = syncache_lookup(inc, &sch); /* returns locked entry */
+ SCH_LOCK_ASSERT(sch);
if (sc != NULL) {
tcpstat.tcps_sc_dupsyn++;
if (ipopts) {
@@ -861,25 +884,13 @@ syncache_add(inc, to, th, sop, m)
*/
if (sc->sc_flags & SCF_TIMESTAMP)
sc->sc_tsrecent = to->to_tsval;
- /*
- * PCB may have changed, pick up new values.
- */
- sc->sc_tp = tp;
- sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
-#ifdef TCPDEBUG
- if (syncache_respond(sc, m, so) == 0) {
-#else
if (syncache_respond(sc, m) == 0) {
-#endif
- /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
- TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
- sc, sc_timerq);
- SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
+ SYNCACHE_TIMEOUT(sc, sch, 1);
tcpstat.tcps_sndacks++;
tcpstat.tcps_sndtotal++;
}
- *sop = NULL;
- return (1);
+ SCH_UNLOCK(sch);
+ goto done;
}
sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
@@ -890,33 +901,21 @@ syncache_add(inc, to, th, sop, m)
* entry and insert the new one.
*/
tcpstat.tcps_sc_zonefail++;
- /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
- for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
- sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
- if (sc != NULL)
- break;
- }
- if (sc == NULL) {
- /* Generic memory failure. */
- if (ipopts)
- (void) m_free(ipopts);
- return (0);
- }
- sc->sc_tp->ts_recent = ticks;
- syncache_drop(sc, NULL);
+ sc = TAILQ_LAST(&sch->sch_bucket, sch_head);
+ syncache_drop(sc, sch);
+ SCH_UNLOCK(sch);
sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
if (sc == NULL) {
if (ipopts)
(void) m_free(ipopts);
- return (0);
+ goto done;
}
- }
+ } else
+ SCH_UNLOCK(sch);
/*
* Fill in the syncache values.
*/
- sc->sc_tp = tp;
- sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
sc->sc_ipopts = ipopts;
sc->sc_inc.inc_fport = inc->inc_fport;
sc->sc_inc.inc_lport = inc->inc_lport;
@@ -930,6 +929,8 @@ syncache_add(inc, to, th, sop, m)
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
+ sc->sc_ip_tos = ip_tos;
+ sc->sc_ip_ttl = ip_ttl;
}
sc->sc_irs = th->th_seq;
sc->sc_flags = 0;
@@ -938,24 +939,22 @@ syncache_add(inc, to, th, sop, m)
if (tcp_syncookies) {
sc->sc_iss = syncookie_generate(sc, &flowtmp);
#ifdef INET6
- if (inc->inc_isipv6 &&
- (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
+ if (autoflowlabel)
sc->sc_flowlabel = flowtmp & IPV6_FLOWLABEL_MASK;
- }
#endif
} else {
sc->sc_iss = arc4random();
#ifdef INET6
- if (inc->inc_isipv6 &&
- (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
+ if (autoflowlabel)
sc->sc_flowlabel =
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
- }
#endif
}
- /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
- win = sbspace(&so->so_rcv);
+ /*
+ * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
+ * win was derived from socket earlier in the function.
+ */
win = imax(win, 0);
win = imin(win, TCP_MAXWIN);
sc->sc_wnd = win;
@@ -974,15 +973,13 @@ syncache_add(inc, to, th, sop, m)
/* Compute proper scaling value from buffer space */
while (wscale < TCP_MAX_WINSHIFT &&
- (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
+ (TCP_MAXWIN << wscale) < sb_hiwat)
wscale++;
sc->sc_request_r_scale = wscale;
sc->sc_requested_s_scale = to->to_requested_s_scale;
sc->sc_flags |= SCF_WINSCALE;
}
}
- if (tp->t_flags & TF_NOOPT)
- sc->sc_flags = SCF_NOOPT;
#ifdef TCP_SIGNATURE
/*
* If listening socket requested TCP digests, and received SYN
@@ -1001,44 +998,36 @@ syncache_add(inc, to, th, sop, m)
/*
* Do a standard 3-way handshake.
*/
-#ifdef TCPDEBUG
- if (syncache_respond(sc, m, so) == 0) {
-#else
if (syncache_respond(sc, m) == 0) {
-#endif
- syncache_insert(sc, sch);
+ syncache_insert(sc, sch); /* locks and unlocks sch */
tcpstat.tcps_sndacks++;
tcpstat.tcps_sndtotal++;
} else {
syncache_free(sc);
tcpstat.tcps_sc_dropped++;
}
- *sop = NULL;
+
+done:
+ *lsop = NULL;
return (1);
}
-#ifdef TCPDEBUG
-static int
-syncache_respond(sc, m, so)
- struct syncache *sc;
- struct mbuf *m;
- struct socket *so;
-#else
static int
syncache_respond(sc, m)
struct syncache *sc;
struct mbuf *m;
-#endif
{
u_int8_t *optp;
int optlen, error;
u_int16_t tlen, hlen, mssopt;
struct ip *ip = NULL;
struct tcphdr *th;
- struct inpcb *inp;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
+#ifdef MAC
+ struct inpcb *inp = NULL;
+#endif
hlen =
#ifdef INET6
@@ -1074,10 +1063,7 @@ syncache_respond(sc, m)
*/
KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));
- /*
- * XXX shouldn't this reuse the mbuf if possible ?
- * Create the IP+TCP header from scratch.
- */
+ /* Create the IP+TCP header from scratch. */
if (m)
m_freem(m);
@@ -1088,11 +1074,45 @@ syncache_respond(sc, m)
m->m_len = tlen;
m->m_pkthdr.len = tlen;
m->m_pkthdr.rcvif = NULL;
- inp = sc->sc_tp->t_inpcb;
- INP_LOCK(inp);
+
#ifdef MAC
+ /*
+ * For MAC look up the inpcb to get access to the label information.
+ * We don't store the inpcb pointer in struct syncache to make locking
+ * less complicated and to save locking operations. However for MAC
+ * this gives a slight overhead as we have to do a full pcblookup here.
+ */
+ INP_INFO_RLOCK(&tcbinfo);
+ if (inp == NULL) {
+#ifdef INET6 /* && MAC */
+ if (sc->sc_inc.inc_isipv6)
+ inp = in6_pcblookup_hash(&tcbinfo,
+ &sc->sc_inc.inc6_laddr, sc->sc_inc.inc_lport,
+ &sc->sc_inc.inc6_faddr, sc->sc_inc.inc_fport,
+ 1, NULL);
+ else
+#endif /* INET6 */
+ inp = in_pcblookup_hash(&tcbinfo,
+ sc->sc_inc.inc_laddr, sc->sc_inc.inc_lport,
+ sc->sc_inc.inc_faddr, sc->sc_inc.inc_fport,
+ 1, NULL);
+ if (inp == NULL) {
+ m_freem(m);
+ INP_INFO_RUNLOCK(&tcbinfo);
+ return (ESHUTDOWN);
+ }
+ }
+ INP_LOCK(inp);
+ if (!(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
+ m_freem(m);
+ INP_UNLOCK(inp);
+ INP_INFO_RUNLOCK(&tcbinfo);
+ return (ESHUTDOWN);
+ }
mac_create_mbuf_from_inpcb(inp, m);
-#endif
+ INP_UNLOCK(inp);
+ INP_INFO_RUNLOCK(&tcbinfo);
+#endif /* MAC */
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
@@ -1120,8 +1140,8 @@ syncache_respond(sc, m)
ip->ip_p = IPPROTO_TCP;
ip->ip_src = sc->sc_inc.inc_laddr;
ip->ip_dst = sc->sc_inc.inc_faddr;
- ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
- ip->ip_tos = inp->inp_ip_tos; /* XXX */
+ ip->ip_ttl = sc->sc_ip_ttl;
+ ip->ip_tos = sc->sc_ip_tos;
/*
* See if we should do MTU discovery. Route lookups are
@@ -1207,7 +1227,7 @@ syncache_respond(sc, m)
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
- error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+ error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
} else
#endif
{
@@ -1215,19 +1235,8 @@ syncache_respond(sc, m)
htons(tlen - hlen + IPPROTO_TCP));
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
-#ifdef TCPDEBUG
- /*
- * Trace.
- */
- if (so != NULL && so->so_options & SO_DEBUG) {
- struct tcpcb *tp = sototcpcb(so);
- tcp_trace(TA_OUTPUT, tp->t_state, tp,
- mtod(m, void *), th, 0);
- }
-#endif
- error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
+ error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
}
- INP_UNLOCK(inp);
return (error);
}
@@ -1255,9 +1264,15 @@ syncache_respond(sc, m)
(hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))
#define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK)
+#define SYNCOOKIE_RLOCK(ts) (rw_rlock(&(ts).ts_rwmtx))
+#define SYNCOOKIE_RUNLOCK(ts) (rw_runlock(&(ts).ts_rwmtx))
+#define SYNCOOKIE_TRY_UPGRADE(ts) (rw_try_upgrade(&(ts).ts_rwmtx))
+#define SYNCOOKIE_DOWNGRADE(ts) (rw_downgrade(&(ts).ts_rwmtx))
+
static struct {
+ struct rwlock ts_rwmtx;
+ u_int ts_expire; /* ticks */
u_int32_t ts_secbits[4];
- u_int ts_expire;
} tcp_secret[SYNCOOKIE_NSECRETS];
static int tcp_msstab[] = { 0, 536, 1460, 8960 };
@@ -1286,6 +1301,15 @@ CTASSERT(sizeof(struct md5_add) == 28);
* worrying about.
*/
+static void
+syncookie_init(void) {
+ int idx;
+
+ for (idx = 0; idx < SYNCOOKIE_NSECRETS; idx++) {
+ rw_init(&(tcp_secret[idx].ts_rwmtx), "tcp_secret");
+ }
+}
+
static u_int32_t
syncookie_generate(struct syncache *sc, u_int32_t *flowid)
{
@@ -1294,13 +1318,15 @@ syncookie_generate(struct syncache *sc, u_int32_t *flowid)
int idx, i;
struct md5_add add;
- /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
-
idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
- if (tcp_secret[idx].ts_expire < ticks) {
+ SYNCOOKIE_RLOCK(tcp_secret[idx]);
+ if (tcp_secret[idx].ts_expire < ticks &&
+ SYNCOOKIE_TRY_UPGRADE(tcp_secret[idx])) {
+ /* need write access */
for (i = 0; i < 4; i++)
tcp_secret[idx].ts_secbits[i] = arc4random();
tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
+ SYNCOOKIE_DOWNGRADE(tcp_secret[idx]);
}
for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
if (tcp_msstab[data] <= sc->sc_peer_mss)
@@ -1326,6 +1352,7 @@ syncookie_generate(struct syncache *sc, u_int32_t *flowid)
add.secbits[1] = tcp_secret[idx].ts_secbits[1];
add.secbits[2] = tcp_secret[idx].ts_secbits[2];
add.secbits[3] = tcp_secret[idx].ts_secbits[3];
+ SYNCOOKIE_RUNLOCK(tcp_secret[idx]);
MD5Add(add);
MD5Final((u_char *)&md5_buffer, &syn_ctx);
data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK);
@@ -1345,13 +1372,15 @@ syncookie_lookup(inc, th, so)
int wnd, idx;
struct md5_add add;
- /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
-
data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */
idx = data & SYNCOOKIE_WNDMASK;
+ SYNCOOKIE_RLOCK(tcp_secret[idx]);
if (tcp_secret[idx].ts_expire < ticks ||
- sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks)
+ sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks) {
+ SYNCOOKIE_RUNLOCK(tcp_secret[idx]);
return (NULL);
+ }
MD5Init(&syn_ctx);
#ifdef INET6
if (inc->inc_isipv6) {
@@ -1371,6 +1400,7 @@ syncookie_lookup(inc, th, so)
add.secbits[1] = tcp_secret[idx].ts_secbits[1];
add.secbits[2] = tcp_secret[idx].ts_secbits[2];
add.secbits[3] = tcp_secret[idx].ts_secbits[3];
+ SYNCOOKIE_RUNLOCK(tcp_secret[idx]);
MD5Add(add);
MD5Final((u_char *)&md5_buffer, &syn_ctx);
data ^= md5_buffer[0];
@@ -1383,24 +1413,25 @@ syncookie_lookup(inc, th, so)
return (NULL);
/*
* Fill in the syncache values.
- * XXX duplicate code from syncache_add
+ * XXX: duplicate code from syncache_add
*/
sc->sc_ipopts = NULL;
sc->sc_inc.inc_fport = inc->inc_fport;
sc->sc_inc.inc_lport = inc->inc_lport;
- sc->sc_tp = sototcpcb(so);
#ifdef INET6
sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
- if (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)
+ if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL)
sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
+ sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
+ sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
}
sc->sc_irs = th->th_seq - 1;
sc->sc_iss = th->th_ack - 1;
@@ -1409,7 +1440,7 @@ syncookie_lookup(inc, th, so)
wnd = imin(wnd, TCP_MAXWIN);
sc->sc_wnd = wnd;
sc->sc_flags = 0;
- sc->sc_rxtslot = 0;
+ sc->sc_rxmits = 0;
sc->sc_peer_mss = tcp_msstab[data];
return (sc);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index e3e4819..b48dbdf 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -245,8 +245,6 @@ struct tcpopt {
#ifdef _NETINET_IN_PCB_H_
struct syncache {
- inp_gen_t sc_inp_gencnt; /* pointer check */
- struct tcpcb *sc_tp; /* tcb for listening socket */
struct mbuf *sc_ipopts; /* source route */
struct in_conninfo sc_inc; /* addresses */
u_int32_t sc_tsrecent;
@@ -254,9 +252,12 @@ struct syncache {
tcp_seq sc_irs; /* seq from peer */
tcp_seq sc_iss; /* our ISS */
u_long sc_rxttime; /* retransmit time */
- u_int16_t sc_rxtslot; /* retransmit counter */
+ u_int16_t sc_rxmits; /* retransmit counter */
+
u_int16_t sc_peer_mss; /* peer's MSS */
u_int16_t sc_wnd; /* advertised window */
+ u_int8_t sc_ip_ttl; /* IPv4 TTL */
+ u_int8_t sc_ip_tos; /* IPv4 TOS */
u_int8_t sc_requested_s_scale:4,
sc_request_r_scale:4;
u_int8_t sc_flags;
@@ -267,11 +268,13 @@ struct syncache {
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
#define SCF_SACK 0x80 /* send SACK option */
TAILQ_ENTRY(syncache) sc_hash;
- TAILQ_ENTRY(syncache) sc_timerq;
};
struct syncache_head {
- TAILQ_HEAD(, syncache) sch_bucket;
+ TAILQ_HEAD(sch_head, syncache) sch_bucket;
+ struct mtx sch_mtx;
+ struct callout sch_timer;
+ int sch_nextc;
u_int sch_length;
};
#else
@@ -563,7 +566,7 @@ void syncache_unreach(struct in_conninfo *, struct tcphdr *);
int syncache_expand(struct in_conninfo *, struct tcphdr *,
struct socket **, struct mbuf *);
int syncache_add(struct in_conninfo *, struct tcpopt *,
- struct tcphdr *, struct socket **, struct mbuf *);
+ struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *);
void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
void syncache_badack(struct in_conninfo *);
/*
OpenPOWER on IntegriCloud