author		glebius <glebius@FreeBSD.org>	2014-05-04 23:25:32 +0000
committer	glebius <glebius@FreeBSD.org>	2014-05-04 23:25:32 +0000
commit		d2bbb6646bad4efa2c8bac48ec5a0c33f7482d11 (patch)
tree		d7e7b758425a201d0984001d2f9d3f019d60b8d7
parent		9b1896e64f52fa639d8bea0b9ba42b8064af94a4 (diff)
download	FreeBSD-src-d2bbb6646bad4efa2c8bac48ec5a0c33f7482d11.zip
		FreeBSD-src-d2bbb6646bad4efa2c8bac48ec5a0c33f7482d11.tar.gz
The FreeBSD-SA-14:08.tcp advisory was a lesson in not doing acrobatics
with mixing on-stack memory and UMA memory in one linked list. Thus,
rewrite the TCP reassembly code in terms of memory usage. The algorithm
remains unchanged.

We actually do not need extra memory to build a reassembly queue.
Arriving mbufs are always packet header mbufs, so we get the data
length from pkthdr.len and have m_nextpkt for linkage. We need only
one extra pointer, to the tcphdr; use PH_loc for that.

In the tcpcb, the t_segq field becomes an mbuf pointer. The t_segqlen
field now counts bytes in the queue rather than packets, which gives
more precision when comparing against socket buffer limits.

Sponsored by:	Netflix
Sponsored by:	Nginx, Inc.
-rw-r--r--	sys/netinet/tcp_input.c		|   6
-rw-r--r--	sys/netinet/tcp_reass.c		| 224
-rw-r--r--	sys/netinet/tcp_subr.c		|   2
-rw-r--r--	sys/netinet/tcp_usrreq.c	|   2
-rw-r--r--	sys/netinet/tcp_var.h		|  15
-rw-r--r--	sys/sys/mbuf.h			|   1
6 files changed, 77 insertions(+), 173 deletions(-)
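Before the diff itself: the rewrite drops the separate UMA-allocated
struct tseg_qent entries and instead chains the queued packet-header
mbufs directly through m_nextpkt, with the TCP header reachable via the
new M_TCPHDR() accessor. Below is a minimal user-space sketch of the
ordered-insert step, assuming simplified illustrative names (struct pkt
and seg_insert() are not kernel identifiers, and the wraparound-safe
SEQ_GT() comparison is approximated with a plain compare):

/*
 * Illustrative model of the new queue: packets linked through a next
 * pointer (m_nextpkt in the kernel), kept sorted by sequence number.
 */
#include <stddef.h>
#include <stdint.h>

struct pkt {
	struct pkt	*nextpkt;	/* models m->m_nextpkt */
	uint32_t	 seq;		/* models M_TCPHDR(m)->th_seq */
	int		 len;		/* models m->m_pkthdr.len */
};

static void
seg_insert(struct pkt **head, struct pkt *p)
{
	struct pkt *mq, *mp = NULL;

	/* Find the first segment that begins after this one does. */
	for (mq = *head; mq != NULL; mq = mq->nextpkt) {
		if (mq->seq > p->seq)
			break;
		mp = mq;
	}
	/* Link in after the predecessor, or at the head of the queue. */
	if (mp != NULL) {
		p->nextpkt = mp->nextpkt;
		mp->nextpkt = p;
	} else {
		p->nextpkt = *head;
		*head = p;
	}
}

The overlap trimming that tcp_reass() performs before and after this
insert is omitted here; the point is only that the list needs no
allocation beyond the mbufs already in flight.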
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index f01da00..aac430d 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1639,8 +1639,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_nxt == tp->snd_max &&
tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
- LIST_EMPTY(&tp->t_segq) &&
- ((to.to_flags & TOF_TS) == 0 ||
+ tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
/*
@@ -2908,8 +2907,7 @@ dodata: /* XXX */
* immediately when segments are out of order (so
* fast retransmit can work).
*/
- if (th->th_seq == tp->rcv_nxt &&
- LIST_EMPTY(&tp->t_segq) &&
+ if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
TCPS_HAVEESTABLISHED(tp->t_state)) {
if (DELAY_ACK(tp, tlen))
tp->t_flags |= TF_DELACK;
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index a8ec39c..b0366b2 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -71,19 +71,10 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif /* TCPDEBUG */
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
-static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
-#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg)
-SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
- &VNET_NAME(tcp_reass_maxseg), 0,
- "Global maximum number of TCP Segments in Reassembly Queue");
-
static VNET_DEFINE(int, tcp_reass_overflows) = 0;
#define V_tcp_reass_overflows VNET(tcp_reass_overflows)
SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
@@ -91,78 +82,32 @@ SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
&VNET_NAME(tcp_reass_overflows), 0,
"Global number of TCP Segment Reassembly Queue Overflows");
-static VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-#define V_tcp_reass_zone VNET(tcp_reass_zone)
-SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
- &VNET_NAME(tcp_reass_zone),
- "Global number of TCP Segments currently in Reassembly Queue");
-
-/* Initialize TCP reassembly queue */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
- /* Set the zone limit and read back the effective value. */
- V_tcp_reass_maxseg = nmbclusters / 16;
- V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
- V_tcp_reass_maxseg);
-}
-
-void
-tcp_reass_init(void)
-{
-
- V_tcp_reass_maxseg = nmbclusters / 16;
- TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
- &V_tcp_reass_maxseg);
- V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- /* Set the zone limit and read back the effective value. */
- V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
- V_tcp_reass_maxseg);
- EVENTHANDLER_REGISTER(nmbclusters_change,
- tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-#ifdef VIMAGE
-void
-tcp_reass_destroy(void)
-{
-
- uma_zdestroy(V_tcp_reass_zone);
-}
-#endif
-
void
tcp_reass_flush(struct tcpcb *tp)
{
- struct tseg_qent *qe;
+ struct mbuf *m;
INP_WLOCK_ASSERT(tp->t_inpcb);
- while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
- LIST_REMOVE(qe, tqe_q);
- m_freem(qe->tqe_m);
- uma_zfree(V_tcp_reass_zone, qe);
- tp->t_segqlen--;
+ while ((m = tp->t_segq) != NULL) {
+ tp->t_segq = m->m_nextpkt;
+ tp->t_segqlen -= m->m_pkthdr.len;
+ m_freem(m);
}
KASSERT((tp->t_segqlen == 0),
- ("TCP reass queue %p segment count is %d instead of 0 after flush.",
+ ("TCP reass queue %p length is %d instead of 0 after flush.",
tp, tp->t_segqlen));
}
+#define M_TCPHDR(m) ((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
+
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
- struct tseg_qent *q;
- struct tseg_qent *p = NULL;
- struct tseg_qent *nq;
- struct tseg_qent *te = NULL;
struct socket *so = tp->t_inpcb->inp_socket;
- char *s = NULL;
- int flags;
- struct tseg_qent tqs;
+ struct mbuf *mq, *mp;
+ int flags, wakeup;
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -178,6 +123,10 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
if (th == NULL)
goto present;
+ M_ASSERTPKTHDR(m);
+ KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
+ *tlenp, m->m_pkthdr.len));
+
/*
* Limit the number of segments that can be queued to reduce the
* potential for mbuf exhaustion. For best performance, we want to be
@@ -188,19 +137,17 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* Always let the missing segment through which caused this queue.
* NB: Access to the socket buffer is left intentionally unlocked as we
* can tolerate stale information here.
- *
- * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
- * should work but causes packets to be dropped when they shouldn't.
- * Investigate why and re-evaluate the below limit after the behaviour
- * is understood.
*/
if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
- tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
+ tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
+ char *s;
+
V_tcp_reass_overflows++;
TCPSTAT_INC(tcps_rcvmemdrop);
m_freem(m);
*tlenp = 0;
- if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
+ if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+ NULL))) {
log(LOG_DEBUG, "%s; %s: queue limit reached, "
"segment dropped\n", s, __func__);
free(s, M_TCPLOG);
@@ -209,46 +156,13 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
}
/*
- * Allocate a new queue entry. If we can't, or hit the zone limit
- * just drop the pkt.
- *
- * Use a temporary structure on the stack for the missing segment
- * when the zone is exhausted. Otherwise we may get stuck.
- */
- te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT);
- if (te == NULL) {
- if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) {
- TCPSTAT_INC(tcps_rcvmemdrop);
- m_freem(m);
- *tlenp = 0;
- if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
- NULL))) {
- log(LOG_DEBUG, "%s; %s: global zone limit "
- "reached, segment dropped\n", s, __func__);
- free(s, M_TCPLOG);
- }
- return (0);
- } else {
- bzero(&tqs, sizeof(struct tseg_qent));
- te = &tqs;
- if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
- NULL))) {
- log(LOG_DEBUG,
- "%s; %s: global zone limit reached, using "
- "stack for missing segment\n", s, __func__);
- free(s, M_TCPLOG);
- }
- }
- }
- tp->t_segqlen++;
-
- /*
* Find a segment which begins after this one does.
*/
- LIST_FOREACH(q, &tp->t_segq, tqe_q) {
- if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
+ mp = NULL;
+ for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
+ if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
break;
- p = q;
+ mp = mq;
}
/*
@@ -256,18 +170,16 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
- if (p != NULL) {
+ if (mp != NULL) {
int i;
+
/* conversion to int (in i) handles seq wraparound */
- i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
+ i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
if (i > 0) {
if (i >= *tlenp) {
TCPSTAT_INC(tcps_rcvduppack);
TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
m_freem(m);
- if (te != &tqs)
- uma_zfree(V_tcp_reass_zone, te);
- tp->t_segqlen--;
/*
* Try to present any queued data
* at the left window edge to the user.
@@ -289,37 +201,40 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
- while (q) {
- int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
+ while (mq) {
+ struct mbuf *nq;
+ int i;
+
+ i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
if (i <= 0)
break;
- if (i < q->tqe_len) {
- q->tqe_th->th_seq += i;
- q->tqe_len -= i;
- m_adj(q->tqe_m, i);
+ if (i < mq->m_pkthdr.len) {
+ M_TCPHDR(mq)->th_seq += i;
+ m_adj(mq, i);
+ tp->t_segqlen -= i;
break;
}
- nq = LIST_NEXT(q, tqe_q);
- LIST_REMOVE(q, tqe_q);
- m_freem(q->tqe_m);
- uma_zfree(V_tcp_reass_zone, q);
- tp->t_segqlen--;
- q = nq;
+ nq = mq->m_nextpkt;
+ tp->t_segqlen -= mq->m_pkthdr.len;
+ m_freem(mq);
+ if (mp)
+ mp->m_nextpkt = nq;
+ else
+ tp->t_segq = nq;
+ mq = nq;
}
/* Insert the new segment queue entry into place. */
- te->tqe_m = m;
- te->tqe_th = th;
- te->tqe_len = *tlenp;
-
- if (p == NULL) {
- LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+ if (mp) {
+ m->m_nextpkt = mp->m_nextpkt;
+ mp->m_nextpkt = m;
} else {
- KASSERT(te != &tqs, ("%s: temporary stack based entry not "
- "first element in queue", __func__));
- LIST_INSERT_AFTER(p, te, tqe_q);
+ m->m_nextpkt = tp->t_segq;
+ tp->t_segq = m;
}
+ m->m_pkthdr.pkt_tcphdr = th;
+ tp->t_segqlen += m->m_pkthdr.len;
present:
/*
@@ -328,25 +243,30 @@ present:
*/
if (!TCPS_HAVEESTABLISHED(tp->t_state))
return (0);
- q = LIST_FIRST(&tp->t_segq);
- if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
- return (0);
+
+ flags = 0;
+ wakeup = 0;
SOCKBUF_LOCK(&so->so_rcv);
- do {
- tp->rcv_nxt += q->tqe_len;
- flags = q->tqe_th->th_flags & TH_FIN;
- nq = LIST_NEXT(q, tqe_q);
- LIST_REMOVE(q, tqe_q);
+ while ((mq = tp->t_segq) != NULL &&
+ M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
+ tp->t_segq = mq->m_nextpkt;
+
+ tp->rcv_nxt += mq->m_pkthdr.len;
+ tp->t_segqlen -= mq->m_pkthdr.len;
+ flags = M_TCPHDR(mq)->th_flags & TH_FIN;
+
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
- m_freem(q->tqe_m);
- else
- sbappendstream_locked(&so->so_rcv, q->tqe_m);
- if (q != &tqs)
- uma_zfree(V_tcp_reass_zone, q);
- tp->t_segqlen--;
- q = nq;
- } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
+ m_freem(mq);
+ else {
+ mq->m_nextpkt = NULL;
+ sbappendstream_locked(&so->so_rcv, mq);
+ wakeup = 1;
+ }
+ }
ND6_HINT(tp);
- sorwakeup_locked(so);
+ if (wakeup)
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
return (flags);
}
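A companion sketch of the rewritten "present" loop above, in the same
simplified user-space model (struct pkt as in the earlier sketch;
deliver() stands in for sbappendstream_locked() and is an assumption of
this model, as is leaving the actual socket wakeup to the caller):

static int
seg_present(struct pkt **head, uint32_t *rcv_nxt, int *qlen,
    void (*deliver)(struct pkt *))
{
	struct pkt *mq;
	int wakeup = 0;

	/* Drain in-order segments from the head of the queue. */
	while ((mq = *head) != NULL && mq->seq == *rcv_nxt) {
		*head = mq->nextpkt;
		*rcv_nxt += mq->len;
		*qlen -= mq->len;	/* t_segqlen now counts bytes */
		mq->nextpkt = NULL;
		deliver(mq);		/* sbappendstream_locked() analog */
		wakeup = 1;
	}
	return (wakeup);
}

As in the committed code, the socket is woken only when something was
actually appended, which is why the new loop tracks a wakeup flag
instead of unconditionally calling sorwakeup_locked().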
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index c73cb97..fb2c415 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -375,7 +375,6 @@ tcp_init(void)
tcp_tw_init();
syncache_init();
tcp_hc_init();
- tcp_reass_init();
TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
@@ -433,7 +432,6 @@ tcp_destroy(void)
{
int error;
- tcp_reass_destroy();
tcp_hc_destroy();
syncache_destroy();
tcp_tw_destroy();
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index a8dcb32..42c1e1d 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1949,7 +1949,7 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
db_print_indent(indent);
db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
- LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
+ tp->t_segq, tp->t_segqlen, tp->t_dupacks);
db_print_indent(indent);
db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 25426f5..3577bed 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -46,15 +46,6 @@ VNET_DECLARE(int, tcp_do_rfc1323);
#endif /* _KERNEL */
-/* TCP segment queue entry */
-struct tseg_qent {
- LIST_ENTRY(tseg_qent) tqe_q;
- int tqe_len; /* TCP segment data length */
- struct tcphdr *tqe_th; /* a pointer to tcp header */
- struct mbuf *tqe_m; /* mbuf contains packet */
-};
-LIST_HEAD(tsegqe_head, tseg_qent);
-
struct sackblk {
tcp_seq start; /* start seq no. of sack block */
tcp_seq end; /* end seq no. */
@@ -100,7 +91,7 @@ do { \
* Organized for 16 byte cacheline efficiency.
*/
struct tcpcb {
- struct tsegqe_head t_segq; /* segment reassembly queue */
+ struct mbuf *t_segq; /* segment reassembly queue */
void *t_pspare[2]; /* new reassembly queue */
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
@@ -663,11 +654,7 @@ char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
-void tcp_reass_init(void);
void tcp_reass_flush(struct tcpcb *);
-#ifdef VIMAGE
-void tcp_reass_destroy(void);
-#endif
void tcp_input(struct mbuf *, int);
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index 815732e..e056bc7 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -159,6 +159,7 @@ struct pkthdr {
#define tso_segsz PH_per.sixteen[1]
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
+#define pkt_tcphdr PH_loc.ptr
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is