author     ps <ps@FreeBSD.org>  2004-06-23 21:04:37 +0000
committer  ps <ps@FreeBSD.org>  2004-06-23 21:04:37 +0000
commit     f5f3e8600b5cd41c8645b3a5d45e20092a8b9ee1 (patch)
tree       37eceb1297375660ec2d161a79ee2ec7364248b3
parent     933faf5c3e0325440e1ef2edac115dd64ece174c (diff)
Add support for TCP Selective Acknowledgements. The work for this
originated on RELENG_4 and was ported to -CURRENT.

The scoreboarding code was obtained from OpenBSD, and many of the
remaining changes were inspired by OpenBSD, but not taken directly
from there.

You can enable/disable SACK using net.inet.tcp.do_sack. You can also
limit the number of SACK holes that all senders can have in the
scoreboard with net.inet.tcp.sackhole_limit.

Reviewed by:	gnn
Obtained from:	Yahoo! (Mohan Srinivasan, Jayanth Vijayaraghavan)
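Those two sysctls are the only user-visible knobs. As a usage sketch (not part of this commit, minimal error handling), they can be read and set from a userland C program through the standard sysctlbyname(3) interface; the node names come from the SYSCTL_INT declarations added in tcp_subr.c below.

/*
 * Sketch: read and enable net.inet.tcp.do_sack from userland.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int do_sack, on = 1;
	size_t len = sizeof(do_sack);

	if (sysctlbyname("net.inet.tcp.do_sack", &do_sack, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("net.inet.tcp.do_sack is %d\n", do_sack);

	/* Turn SACK on; requires privilege.  sackhole_limit is set the same way. */
	if (sysctlbyname("net.inet.tcp.do_sack", NULL, NULL, &on, sizeof(on)) == -1)
		perror("sysctlbyname(set)");
	return (0);
}

The same values can also be changed from the shell with sysctl(8), e.g. sysctl net.inet.tcp.do_sack=0.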
-rw-r--r--  sys/conf/files              |   1
-rw-r--r--  sys/conf/options            |   1
-rw-r--r--  sys/netinet/tcp.h           |  12
-rw-r--r--  sys/netinet/tcp_input.c     |  93
-rw-r--r--  sys/netinet/tcp_output.c    | 118
-rw-r--r--  sys/netinet/tcp_reass.c     |  93
-rw-r--r--  sys/netinet/tcp_sack.c      | 592
-rw-r--r--  sys/netinet/tcp_seq.h       |   3
-rw-r--r--  sys/netinet/tcp_subr.c      |  16
-rw-r--r--  sys/netinet/tcp_syncache.c  |  15
-rw-r--r--  sys/netinet/tcp_timer.c     |   3
-rw-r--r--  sys/netinet/tcp_timewait.c  |  16
-rw-r--r--  sys/netinet/tcp_var.h       |  49
13 files changed, 975 insertions(+), 37 deletions(-)
diff --git a/sys/conf/files b/sys/conf/files
index 0febe87..8c1a136 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1465,6 +1465,7 @@ netinet/tcp_debug.c optional tcpdebug
netinet/tcp_hostcache.c optional inet
netinet/tcp_input.c optional inet
netinet/tcp_output.c optional inet
+netinet/tcp_sack.c optional inet
netinet/tcp_subr.c optional inet
netinet/tcp_syncache.c optional inet
netinet/tcp_timer.c optional inet
diff --git a/sys/conf/options b/sys/conf/options
index 62913a1..fdb385a 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -354,6 +354,7 @@ RANDOM_IP_ID
SLIP_IFF_OPTS opt_slip.h
TCPDEBUG
TCP_SIGNATURE opt_inet.h
+TCP_SACK_DEBUG opt_tcp_sack.h
TCP_DROP_SYNFIN opt_tcp_input.h
XBONEHACK
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 1eee95c..92460d9 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -85,12 +85,15 @@ struct tcphdr {
#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
#define TCPOLEN_SACK_PERMITTED 2
#define TCPOPT_SACK 5 /* Experimental */
+#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */
#define TCPOPT_TIMESTAMP 8
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
#define TCPOPT_TSTAMP_HDR \
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
+#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
+
#define TCPOPT_CC 11 /* CC options: RFC-1644 */
#define TCPOPT_CCNEW 12
#define TCPOPT_CCECHO 13
@@ -101,6 +104,15 @@ struct tcphdr {
#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */
#define TCPOLEN_SIGNATURE 18
+/* Option definitions */
+#define TCPOPT_SACK_PERMIT_HDR \
+(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+/* Miscellaneous constants */
+#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */
+#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */
+
+
/*
* Default maximum segment size for TCP.
* With an IP MTU of 576, this is 536,
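The new option-header macros above pre-pack a NOP,NOP pad in front of the 2-byte SACK-permitted option, and in front of the SACK option kind byte (whose length is OR'd in later by tcp_output()). A minimal, illustrative check of the resulting 32-bit words, with the option kind/length values restated from tcp.h:

/*
 * Sketch: the packed option words defined above.
 * TCPOPT_SACK_PERMIT_HDR = NOP,NOP,SACK_PERMITTED,len 2 -> 0x01010402
 * TCPOPT_SACK_HDR        = NOP,NOP,SACK,<len OR'd in later> -> 0x01010500
 */
#include <assert.h>
#include <stdint.h>

#define TCPOPT_NOP		1	/* values as in tcp.h */
#define TCPOPT_SACK_PERMITTED	4
#define TCPOLEN_SACK_PERMITTED	2
#define TCPOPT_SACK		5

#define TCPOPT_SACK_PERMIT_HDR \
	(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
#define TCPOPT_SACK_HDR	(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)

int
main(void)
{
	assert((uint32_t)TCPOPT_SACK_PERMIT_HDR == 0x01010402);
	assert((uint32_t)TCPOPT_SACK_HDR == 0x01010500);
	return (0);
}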
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index b1b2284..581fe9a 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -37,6 +37,7 @@
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_input.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -159,7 +160,9 @@ struct inpcbhead tcb;
struct inpcbinfo tcbinfo;
struct mtx *tcbinfo_mtx;
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
+ int, int, struct tcphdr *);
+
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@@ -724,7 +727,7 @@ findpcb:
* present in a SYN segment. See tcp_timewait().
*/
if (thflags & TH_SYN)
- tcp_dooptions(&to, optp, optlen, 1);
+ tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
&to, th, m, tlen))
goto findpcb;
@@ -938,7 +941,7 @@ findpcb:
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- tcp_dooptions(&to, optp, optlen, 1);
+ tcp_dooptions(tp, &to, optp, optlen, 1, th);
if (!syncache_add(&inc, &to, th, &so, m))
goto drop;
if (so == NULL) {
@@ -1054,7 +1057,7 @@ after_listen:
* for incoming connections is handled in tcp_syncache.
* XXX this is traditional behavior, may need to be cleaned up.
*/
- tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
+ tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
if (thflags & TH_SYN) {
if (to.to_flags & TOF_SCALE) {
tp->t_flags |= TF_RCVD_SCALE;
@@ -1069,6 +1072,20 @@ after_listen:
tp->t_flags |= TF_RCVD_CC;
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
+ if (tp->sack_enable) {
+ if (!(to.to_flags & TOF_SACK))
+ tp->sack_enable = 0;
+ else
+ tp->t_flags |= TF_SACK_PERMIT;
+ }
+
+ }
+
+ if (tp->sack_enable) {
+ /* Delete stale (cumulatively acked) SACK holes */
+ tcp_del_sackholes(tp, th);
+ tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
+ tp->rcv_lastend = th->th_seq + tlen;
}
/*
@@ -1120,9 +1137,10 @@ after_listen:
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
- ((!tcp_do_newreno &&
+ ((!tcp_do_newreno && !tp->sack_enable &&
tp->t_dupacks < tcprexmtthresh) ||
- (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
+ ((tcp_do_newreno || tp->sack_enable) &&
+ !IN_FASTRECOVERY(tp)))) {
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
/*
@@ -1218,6 +1236,9 @@ after_listen:
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
+ /* Clean receiver SACK report if present */
+ if (tp->sack_enable && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
++tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
/*
@@ -1898,7 +1919,7 @@ trimthenstep6:
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
- (tcp_do_newreno &&
+ ((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp))) {
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
@@ -1906,7 +1927,8 @@ trimthenstep6:
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_int win;
- if (tcp_do_newreno &&
+ if ((tcp_do_newreno ||
+ tp->sack_enable) &&
SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
@@ -1921,6 +1943,17 @@ trimthenstep6:
tp->snd_recover = tp->snd_max;
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
+ if (tp->sack_enable) {
+ tcpstat.tcps_sack_recovery_episode++;
+ tp->snd_cwnd =
+ tp->t_maxseg *
+ tp->t_dupacks;
+ (void) tcp_output(tp);
+ tp->snd_cwnd =
+ tp->snd_ssthresh;
+ goto drop;
+ }
+
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
@@ -1971,12 +2004,16 @@ trimthenstep6:
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (tcp_do_newreno) {
+ if (tcp_do_newreno || tp->sack_enable) {
if (IN_FASTRECOVERY(tp)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- tcp_newreno_partial_ack(tp, th);
+ if (tp->sack_enable)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
} else {
/*
+ * Out of fast recovery.
* Window inflation should have left us
* with approximately snd_ssthresh
* outstanding data.
@@ -2098,7 +2135,8 @@ process_ACK:
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
*/
- if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
+ if ((!tcp_do_newreno && !tp->sack_enable) ||
+ !IN_FASTRECOVERY(tp)) {
register u_int cw = tp->snd_cwnd;
register u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
@@ -2116,14 +2154,20 @@ process_ACK:
}
sowwakeup(so);
/* detect una wraparound */
- if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
+ if ((tcp_do_newreno || tp->sack_enable) &&
+ !IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
- if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
+ if ((tcp_do_newreno || tp->sack_enable) &&
+ IN_FASTRECOVERY(tp) &&
SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
+ if (tp->sack_enable) {
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ }
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@@ -2327,7 +2371,8 @@ dodata: /* XXX */
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
-
+ if (tp->sack_enable)
+ tcp_update_sack_list(tp);
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@@ -2530,11 +2575,13 @@ drop:
* Parse TCP options and place in tcpopt.
*/
static void
-tcp_dooptions(to, cp, cnt, is_syn)
+tcp_dooptions(tp, to, cp, cnt, is_syn, th)
+ struct tcpcb *tp;
struct tcpopt *to;
- u_char *cp;
+ u_char *cp;
int cnt;
int is_syn;
+ struct tcphdr *th;
{
int opt, optlen;
@@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn)
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
break;
#endif
+ case TCPOPT_SACK_PERMITTED:
+ if (!tcp_do_sack ||
+ optlen != TCPOLEN_SACK_PERMITTED)
+ continue;
+ if (is_syn) {
+ /* MUST only be set on SYN */
+ to->to_flags |= TOF_SACK;
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if (!tp || tcp_sack_option(tp, th, cp, optlen))
+ continue;
+ break;
default:
continue;
}
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index aa7f58f..a662d0f 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -35,6 +35,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -122,6 +123,8 @@ tcp_output(struct tcpcb *tp)
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
int idle, sendalot;
+ int i, sack_rxmit;
+ struct sackhole *p;
#if 0
int maxburst = TCP_MAXBURST;
#endif
@@ -171,6 +174,13 @@ tcp_output(struct tcpcb *tp)
}
}
again:
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+ * snd_nxt. There may be SACK information that allows us to avoid
+ * resending already delivered data. Adjust snd_nxt accordingly.
+ */
+ if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+ tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
@@ -178,6 +188,36 @@ again:
flags = tcp_outflags[tp->t_state];
/*
+ * Send any SACK-generated retransmissions. If we're explicitly trying
+ * to send out new data (when sendalot is 1), bypass this function.
+ * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+ * we're replacing a (future) new transmission with a retransmission
+ * now, and we previously incremented snd_cwnd in tcp_input().
+ */
+ /*
+ * Still in sack recovery , reset rxmit flag to zero.
+ */
+ sack_rxmit = 0;
+ len = 0;
+ p = NULL;
+ if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
+ (p = tcp_sack_output(tp))) {
+ sack_rxmit = 1;
+ sendalot = 1;
+ off = p->rxmit - tp->snd_una;
+ KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd));
+ /* Do not retransmit SACK segments beyond snd_recover */
+ if (SEQ_GT(p->end, tp->snd_recover))
+ len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit);
+ else
+ len = min(tp->snd_cwnd, p->end - p->rxmit);
+ if (len > 0) {
+ tcpstat.tcps_sack_rexmits++;
+ tcpstat.tcps_sack_rexmit_bytes +=
+ min(len, tp->t_maxseg);
+ }
+ }
+ /*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
*/
@@ -230,9 +270,12 @@ again:
* In the normal retransmit-FIN-only case, however, snd_nxt will
* be set to snd_una, the offset will be 0, and the length may
* wind up 0.
+ *
+ * If sack_rxmit is true we are retransmitting from the scoreboard
+ * in which case len is already set.
*/
- len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off;
-
+ if (!sack_rxmit)
+ len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
/*
* Lop off SYN bit if it has already been sent. However, if this
@@ -331,6 +374,8 @@ again:
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
goto send;
+ if (sack_rxmit)
+ goto send;
}
/*
@@ -374,7 +419,18 @@ again:
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
-
+ /*
+ * In SACK, it is possible for tcp_output to fail to send a segment
+ * after the retransmission timer has been turned off. Make sure
+ * that the retransmission timer is set.
+ */
+ if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
+ !callout_active(tp->tt_rexmt) &&
+ !callout_active(tp->tt_persist)) {
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ return (0);
+ }
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to insure receipt of window
@@ -435,6 +491,19 @@ send:
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
+ /*
+ * If this is the first SYN of connection (not a SYN
+ * ACK), include SACK_PERMIT_HDR option. If this is a
+ * SYN ACK, include SACK_PERMIT_HDR option if peer has
+ * already done so. This is only for active connect,
+ * since the syncache takes care of the passive connect.
+ */
+ if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
+ (tp->t_flags & TF_SACK_PERMIT))) {
+ *((u_int32_t *) (opt + optlen)) =
+ htonl(TCPOPT_SACK_PERMIT_HDR);
+ optlen += 4;
+ }
if ((tp->t_flags & TF_REQ_SCALE) &&
((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE))) {
@@ -466,6 +535,32 @@ send:
optlen += TCPOLEN_TSTAMP_APPA;
}
+ /*
+ * Send SACKs if necessary. This should be the last option processed.
+ * Only as many SACKs are sent as are permitted by the maximum options
+ * size. No more than three SACKs are sent.
+ */
+ if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+ tp->rcv_numsacks) {
+ u_int32_t *lp = (u_int32_t *)(opt + optlen);
+ u_int32_t *olp = lp++;
+ int count = 0; /* actual number of SACKs inserted */
+ int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
+
+ tcpstat.tcps_sack_send_blocks++;
+ maxsack = min(maxsack, TCP_MAX_SACK);
+ for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+ struct sackblk sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ *lp++ = htonl(sack.start);
+ *lp++ = htonl(sack.end);
+ count++;
+ }
+ *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+ optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+ }
/*
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
* options are allowed (!TF_NOOPT) and it's not a RST.
@@ -734,6 +829,10 @@ send:
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
+ if (sack_rxmit) {
+ th->th_seq = htonl(p->rxmit);
+ p->rxmit += len;
+ }
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
@@ -831,6 +930,8 @@ send:
tp->t_flags |= TF_SENTFIN;
}
}
+ if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt))
+ goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
@@ -853,6 +954,17 @@ send:
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
+timer:
+ if (tp->sack_enable && sack_rxmit &&
+ !callout_active(tp->tt_rexmt) &&
+ tp->snd_nxt != tp->snd_max) {
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ if (callout_active(tp->tt_persist)) {
+ callout_stop(tp->tt_persist);
+ tp->t_rxtshift = 0;
+ }
+ }
if (!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_una) {
if (callout_active(tp->tt_persist)) {
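A worked example of the option-space arithmetic used when the SACK blocks are appended in tcp_output() above (illustrative only): with MAX_TCPOPTLEN = 40, TCPOLEN_SACK = 8 and a 12-byte timestamp option already present, (40 - (12 + 4)) / 8 = 3 blocks fit behind the 4-byte NOP/NOP/kind/length header, and the count is in any case capped at TCP_MAX_SACK = 3.

/* Sketch: the SACK option budget from tcp_output(), worked out. */
#include <assert.h>

#define MAX_TCPOPTLEN	40	/* absolute maximum TCP options length */
#define TCPOLEN_SACK	8	/* 2 * sizeof(tcp_seq) */
#define TCP_MAX_SACK	3

static int
sack_blocks_that_fit(int optlen)
{
	int maxsack = (MAX_TCPOPTLEN - (optlen + 4)) / TCPOLEN_SACK;

	return (maxsack < TCP_MAX_SACK ? maxsack : TCP_MAX_SACK);
}

int
main(void)
{
	assert(sack_blocks_that_fit(12) == 3);	/* timestamps present */
	assert(sack_blocks_that_fit(0) == 3);	/* capped by TCP_MAX_SACK */
	return (0);
}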
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index b1b2284..581fe9a 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -37,6 +37,7 @@
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_input.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -159,7 +160,9 @@ struct inpcbhead tcb;
struct inpcbinfo tcbinfo;
struct mtx *tcbinfo_mtx;
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
+ int, int, struct tcphdr *);
+
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@@ -724,7 +727,7 @@ findpcb:
* present in a SYN segment. See tcp_timewait().
*/
if (thflags & TH_SYN)
- tcp_dooptions(&to, optp, optlen, 1);
+ tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
&to, th, m, tlen))
goto findpcb;
@@ -938,7 +941,7 @@ findpcb:
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- tcp_dooptions(&to, optp, optlen, 1);
+ tcp_dooptions(tp, &to, optp, optlen, 1, th);
if (!syncache_add(&inc, &to, th, &so, m))
goto drop;
if (so == NULL) {
@@ -1054,7 +1057,7 @@ after_listen:
* for incoming connections is handled in tcp_syncache.
* XXX this is traditional behavior, may need to be cleaned up.
*/
- tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
+ tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
if (thflags & TH_SYN) {
if (to.to_flags & TOF_SCALE) {
tp->t_flags |= TF_RCVD_SCALE;
@@ -1069,6 +1072,20 @@ after_listen:
tp->t_flags |= TF_RCVD_CC;
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
+ if (tp->sack_enable) {
+ if (!(to.to_flags & TOF_SACK))
+ tp->sack_enable = 0;
+ else
+ tp->t_flags |= TF_SACK_PERMIT;
+ }
+
+ }
+
+ if (tp->sack_enable) {
+ /* Delete stale (cumulatively acked) SACK holes */
+ tcp_del_sackholes(tp, th);
+ tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
+ tp->rcv_lastend = th->th_seq + tlen;
}
/*
@@ -1120,9 +1137,10 @@ after_listen:
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
- ((!tcp_do_newreno &&
+ ((!tcp_do_newreno && !tp->sack_enable &&
tp->t_dupacks < tcprexmtthresh) ||
- (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
+ ((tcp_do_newreno || tp->sack_enable) &&
+ !IN_FASTRECOVERY(tp)))) {
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
/*
@@ -1218,6 +1236,9 @@ after_listen:
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
+ /* Clean receiver SACK report if present */
+ if (tp->sack_enable && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
++tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
/*
@@ -1898,7 +1919,7 @@ trimthenstep6:
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
- (tcp_do_newreno &&
+ ((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp))) {
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
@@ -1906,7 +1927,8 @@ trimthenstep6:
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_int win;
- if (tcp_do_newreno &&
+ if ((tcp_do_newreno ||
+ tp->sack_enable) &&
SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
@@ -1921,6 +1943,17 @@ trimthenstep6:
tp->snd_recover = tp->snd_max;
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
+ if (tp->sack_enable) {
+ tcpstat.tcps_sack_recovery_episode++;
+ tp->snd_cwnd =
+ tp->t_maxseg *
+ tp->t_dupacks;
+ (void) tcp_output(tp);
+ tp->snd_cwnd =
+ tp->snd_ssthresh;
+ goto drop;
+ }
+
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
@@ -1971,12 +2004,16 @@ trimthenstep6:
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (tcp_do_newreno) {
+ if (tcp_do_newreno || tp->sack_enable) {
if (IN_FASTRECOVERY(tp)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- tcp_newreno_partial_ack(tp, th);
+ if (tp->sack_enable)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
} else {
/*
+ * Out of fast recovery.
* Window inflation should have left us
* with approximately snd_ssthresh
* outstanding data.
@@ -2098,7 +2135,8 @@ process_ACK:
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
*/
- if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
+ if ((!tcp_do_newreno && !tp->sack_enable) ||
+ !IN_FASTRECOVERY(tp)) {
register u_int cw = tp->snd_cwnd;
register u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
@@ -2116,14 +2154,20 @@ process_ACK:
}
sowwakeup(so);
/* detect una wraparound */
- if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
+ if ((tcp_do_newreno || tp->sack_enable) &&
+ !IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
- if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
+ if ((tcp_do_newreno || tp->sack_enable) &&
+ IN_FASTRECOVERY(tp) &&
SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
+ if (tp->sack_enable) {
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ }
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@@ -2327,7 +2371,8 @@ dodata: /* XXX */
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
-
+ if (tp->sack_enable)
+ tcp_update_sack_list(tp);
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@@ -2530,11 +2575,13 @@ drop:
* Parse TCP options and place in tcpopt.
*/
static void
-tcp_dooptions(to, cp, cnt, is_syn)
+tcp_dooptions(tp, to, cp, cnt, is_syn, th)
+ struct tcpcb *tp;
struct tcpopt *to;
- u_char *cp;
+ u_char *cp;
int cnt;
int is_syn;
+ struct tcphdr *th;
{
int opt, optlen;
@@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn)
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
break;
#endif
+ case TCPOPT_SACK_PERMITTED:
+ if (!tcp_do_sack ||
+ optlen != TCPOLEN_SACK_PERMITTED)
+ continue;
+ if (is_syn) {
+ /* MUST only be set on SYN */
+ to->to_flags |= TOF_SACK;
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if (!tp || tcp_sack_option(tp, th, cp, optlen))
+ continue;
+ break;
default:
continue;
}
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
new file mode 100644
index 0000000..8dfa682
--- /dev/null
+++ b/sys/netinet/tcp_sack.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
+ *
+ * NRL grants permission for redistribution and use in source and binary
+ * forms, with or without modification, of the software and documentation
+ * created at NRL provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgements:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * This product includes software developed at the Information
+ * Technology Division, US Naval Research Laboratory.
+ * 4. Neither the name of the NRL nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
+ * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation
+ * are those of the authors and should not be interpreted as representing
+ * official policies, either expressed or implied, of the US Naval
+ * Research Laboratory (NRL).
+ */
+#include "opt_ipfw.h" /* for ipfw_fwd */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_tcp_input.h"
+#include "opt_tcp_sack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
+#include <netinet/in_var.h>
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/nd6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+
+u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */
+struct tcphdr tcp_savetcp;
+#endif /* TCPDEBUG */
+
+#ifdef FAST_IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#include <netinet6/ipsec6.h>
+#include <netkey/key.h>
+#endif /*IPSEC*/
+#include <machine/in_cksum.h>
+
+extern struct uma_zone *sack_hole_zone;
+
+/*
+ * This function is called upon receipt of new valid data (while not in header
+ * prediction mode), and it updates the ordered list of sacks.
+ */
+void
+tcp_update_sack_list(tp)
+ struct tcpcb *tp;
+{
+ /*
+ * First reported block MUST be the most recent one. Subsequent
+ * blocks SHOULD be in the order in which they arrived at the
+ * receiver. These two conditions make the implementation fully
+ * compliant with RFC 2018.
+ */
+ int i, j = 0, count = 0, lastpos = -1;
+ struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ /* First clean up current list of sacks */
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0) {
+ count++; /* count = number of blocks to be discarded */
+ continue;
+ }
+ if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ count++;
+ } else {
+ temp[j].start = tp->sackblks[i].start;
+ temp[j++].end = tp->sackblks[i].end;
+ }
+ }
+ tp->rcv_numsacks -= count;
+ if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
+ tcp_clean_sackreport(tp);
+ if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
+ /* ==> need first sack block */
+ tp->sackblks[0].start = tp->rcv_laststart;
+ tp->sackblks[0].end = tp->rcv_lastend;
+ tp->rcv_numsacks = 1;
+ }
+ return;
+ }
+ /* Otherwise, sack blocks are already present. */
+ for (i = 0; i < tp->rcv_numsacks; i++)
+ tp->sackblks[i] = temp[i]; /* first copy back sack list */
+ if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
+ return; /* sack list remains unchanged */
+ /*
+ * From here, segment just received should be (part of) the 1st sack.
+ * Go through list, possibly coalescing sack block entries.
+ */
+ firstsack.start = tp->rcv_laststart;
+ firstsack.end = tp->rcv_lastend;
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (SEQ_LT(sack.end, firstsack.start) ||
+ SEQ_GT(sack.start, firstsack.end))
+ continue; /* no overlap */
+ if (sack.start == firstsack.start && sack.end == firstsack.end){
+ /*
+ * identical block; delete it here since we will
+ * move it to the front of the list.
+ */
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ lastpos = i; /* last posn with a zero entry */
+ continue;
+ }
+ if (SEQ_LEQ(sack.start, firstsack.start))
+ firstsack.start = sack.start; /* merge blocks */
+ if (SEQ_GEQ(sack.end, firstsack.end))
+ firstsack.end = sack.end; /* merge blocks */
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ lastpos = i; /* last posn with a zero entry */
+ }
+ if (lastpos != -1) { /* at least one merge */
+ for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ temp[j++] = sack;
+ }
+ tp->rcv_numsacks = j; /* including first blk (added later) */
+ for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
+ tp->sackblks[i] = temp[i];
+ } else { /* no merges -- shift sacks by 1 */
+ if (tp->rcv_numsacks < MAX_SACK_BLKS)
+ tp->rcv_numsacks++;
+ for (i = tp->rcv_numsacks-1; i > 0; i--)
+ tp->sackblks[i] = tp->sackblks[i-1];
+ }
+ tp->sackblks[0] = firstsack;
+ return;
+}
+
+/*
+ * Delete all receiver-side SACK information.
+ */
+void
+tcp_clean_sackreport(tp)
+ struct tcpcb *tp;
+{
+ int i;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ tp->rcv_numsacks = 0;
+ for (i = 0; i < MAX_SACK_BLKS; i++)
+ tp->sackblks[i].start = tp->sackblks[i].end=0;
+}
+
+/*
+ * Process the TCP SACK option. Returns 1 if tcp_dooptions() should continue,
+ * and 0 otherwise, if the option was fine. tp->snd_holes is an ordered list
+ * of holes (oldest to newest, in terms of the sequence space).
+ */
+int
+tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
+{
+ int tmp_olen;
+ u_char *tmp_cp;
+ struct sackhole *cur, *p, *temp;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ if (!tp->sack_enable)
+ return (1);
+
+ /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
+ if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+ return (1);
+ tmp_cp = cp + 2;
+ tmp_olen = optlen - 2;
+ tcpstat.tcps_sack_rcv_blocks++;
+ if (tp->snd_numholes < 0)
+ tp->snd_numholes = 0;
+ if (tp->t_maxseg == 0)
+ panic("tcp_sack_option"); /* Should never happen */
+ while (tmp_olen > 0) {
+ struct sackblk sack;
+
+ bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
+ sack.start = ntohl(sack.start);
+ bcopy(tmp_cp + sizeof(tcp_seq),
+ (char *) &(sack.end), sizeof(tcp_seq));
+ sack.end = ntohl(sack.end);
+ tmp_olen -= TCPOLEN_SACK;
+ tmp_cp += TCPOLEN_SACK;
+ if (SEQ_LEQ(sack.end, sack.start))
+ continue; /* bad SACK fields */
+ if (SEQ_LEQ(sack.end, tp->snd_una))
+ continue; /* old block */
+ if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ if (SEQ_LT(sack.start, th->th_ack))
+ continue;
+ }
+ if (SEQ_GT(sack.end, tp->snd_max))
+ continue;
+ if (tp->snd_holes == NULL) { /* first hole */
+ tp->snd_holes = (struct sackhole *)
+ uma_zalloc(sack_hole_zone,M_NOWAIT);
+ if (tp->snd_holes == NULL) {
+ /* ENOBUFS, so ignore SACKed block for now*/
+ continue;
+ }
+ cur = tp->snd_holes;
+ cur->start = th->th_ack;
+ cur->end = sack.start;
+ cur->rxmit = cur->start;
+ cur->next = NULL;
+ tp->snd_numholes = 1;
+ tp->rcv_lastsack = sack.end;
+ continue; /* with next sack block */
+ }
+ /* Go thru list of holes: p = previous, cur = current */
+ p = cur = tp->snd_holes;
+ while (cur) {
+ if (SEQ_LEQ(sack.end, cur->start))
+ /* SACKs data before the current hole */
+ break; /* no use going through more holes */
+ if (SEQ_GEQ(sack.start, cur->end)) {
+ /* SACKs data beyond the current hole */
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ if (SEQ_LEQ(sack.start, cur->start)) {
+ /* Data acks at least the beginning of hole */
+ if (SEQ_GEQ(sack.end, cur->end)) {
+ /* Acks entire hole, so delete hole */
+ if (p != cur) {
+ p->next = cur->next;
+ uma_zfree(sack_hole_zone, cur);
+ cur = p->next;
+ } else {
+ cur = cur->next;
+ uma_zfree(sack_hole_zone, p);
+ p = cur;
+ tp->snd_holes = p;
+ }
+ tp->snd_numholes--;
+ continue;
+ }
+ /* otherwise, move start of hole forward */
+ cur->start = sack.end;
+ cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ /* move end of hole backward */
+ if (SEQ_GEQ(sack.end, cur->end)) {
+ cur->end = sack.start;
+ cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ if (SEQ_LT(cur->start, sack.start) &&
+ SEQ_GT(cur->end, sack.end)) {
+ /*
+ * ACKs some data in middle of a hole; need to
+ * split current hole
+ */
+ temp = (struct sackhole *)
+ uma_zalloc(sack_hole_zone,M_NOWAIT);
+ if (temp == NULL)
+ continue; /* ENOBUFS */
+ temp->next = cur->next;
+ temp->start = sack.end;
+ temp->end = cur->end;
+ temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
+ cur->end = sack.start;
+ cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
+ cur->next = temp;
+ p = temp;
+ cur = p->next;
+ tp->snd_numholes++;
+ }
+ }
+ /* At this point, p points to the last hole on the list */
+ if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
+ /*
+ * Need to append new hole at end.
+ * Last hole is p (and it's not NULL).
+ */
+ temp = (struct sackhole *)
+ uma_zalloc(sack_hole_zone,M_NOWAIT);
+ if (temp == NULL)
+ continue; /* ENOBUFS */
+ temp->start = tp->rcv_lastsack;
+ temp->end = sack.start;
+ temp->rxmit = temp->start;
+ temp->next = 0;
+ p->next = temp;
+ tp->rcv_lastsack = sack.end;
+ tp->snd_numholes++;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if
+ * it is completely acked; otherwise, tcp_sack_option(), called from
+ * tcp_dooptions(), will fix up the hole.
+ */
+void
+tcp_del_sackholes(tp, th)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+{
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
+ /* max because this could be an older ack just arrived */
+ tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
+ th->th_ack : tp->snd_una;
+ struct sackhole *cur = tp->snd_holes;
+ struct sackhole *prev;
+ while (cur)
+ if (SEQ_LEQ(cur->end, lastack)) {
+ prev = cur;
+ cur = cur->next;
+ uma_zfree(sack_hole_zone, prev);
+ tp->snd_numholes--;
+ } else if (SEQ_LT(cur->start, lastack)) {
+ cur->start = lastack;
+ if (SEQ_LT(cur->rxmit, cur->start))
+ cur->rxmit = cur->start;
+ break;
+ } else
+ break;
+ tp->snd_holes = cur;
+ }
+}
+
+void
+tcp_free_sackholes(struct tcpcb *tp)
+{
+ struct sackhole *p, *q;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ q = tp->snd_holes;
+ while (q != NULL) {
+ p = q;
+ q = q->next;
+ uma_zfree(sack_hole_zone, p);
+ }
+ tp->snd_holes = 0;
+}
+
+/*
+ * Checks for partial ack. If partial ack arrives, turn off retransmission
+ * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
+ * If the ack advances at least to tp->snd_recover, return 0.
+ */
+void
+tcp_sack_partialack(tp, th)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+{
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ u_long ocwnd = tp->snd_cwnd;
+
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
+ /*
+ * Set snd_cwnd to one segment beyond acknowledged offset
+ * (tp->snd_una has not yet been updated when this function is called.)
+ */
+ /*
+ * Should really be
+ * min(tp->snd_cwnd, tp->t_maxseg + (th->th_ack - tp->snd_una))
+ */
+ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ /*
+ * Partial window deflation. Relies on fact that tp->snd_una
+ * not updated yet.
+ */
+ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
+}
+
+#ifdef TCP_SACK_DEBUG
+void
+tcp_print_holes(struct tcpcb *tp)
+{
+ struct sackhole *p = tp->snd_holes;
+ if (p == 0)
+ return;
+ printf("Hole report: start--end dups rxmit\n");
+ while (p) {
+ printf("%x--%x r %x\n", p->start, p->end, p->rxmit);
+ p = p->next;
+ }
+ printf("\n");
+}
+#endif /* TCP_SACK_DEBUG */
+
+/*
+ * Returns pointer to a sackhole if there are any pending retransmissions;
+ * NULL otherwise.
+ */
+struct sackhole *
+tcp_sack_output(struct tcpcb *tp)
+{
+ struct sackhole *p;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ if (!tp->sack_enable)
+ return (NULL);
+ p = tp->snd_holes;
+ while (p) {
+ if (SEQ_LT(p->rxmit, p->end)) {
+ if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
+ p = p->next;
+ continue;
+ }
+#ifdef TCP_SACK_DEBUG
+ if (p)
+ tcp_print_holes(tp);
+#endif
+ return (p);
+ }
+ p = p->next;
+ }
+ return (NULL);
+}
+
+/*
+ * After a timeout, the SACK list may be rebuilt. This SACK information
+ * should be used to avoid retransmitting SACKed data. This function
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(struct tcpcb *tp)
+{
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ struct sackhole *cur = tp->snd_holes;
+ if (cur == NULL)
+ return; /* No holes */
+ if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+ return; /* We're already beyond any SACKed blocks */
+ /*
+ * Two cases for which we want to advance snd_nxt:
+ * i) snd_nxt lies between end of one hole and beginning of another
+ * ii) snd_nxt lies between end of last hole and rcv_lastsack
+ */
+ while (cur->next) {
+ if (SEQ_LT(tp->snd_nxt, cur->end))
+ return;
+ if (SEQ_GEQ(tp->snd_nxt, cur->next->start))
+ cur = cur->next;
+ else {
+ tp->snd_nxt = cur->next->start;
+ return;
+ }
+ }
+ if (SEQ_LT(tp->snd_nxt, cur->end))
+ return;
+ tp->snd_nxt = tp->rcv_lastsack;
+ return;
+}
+
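For reference, a small self-contained sketch (userland, hypothetical buffer) of how the sequence-number pairs are pulled out of a raw SACK option, mirroring the length validation and per-field ntohl conversion in tcp_sack_option() above; the real function additionally checks the blocks against snd_una/snd_max and updates the scoreboard.

/* Sketch: unpack SACK blocks from an option starting at the kind byte. */
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

typedef uint32_t tcp_seq;
#define TCPOLEN_SACK	8	/* 2 * sizeof(tcp_seq) */

struct sackblk { tcp_seq start, end; };

static int
parse_sack_blocks(const unsigned char *cp, int optlen, struct sackblk *blk, int max)
{
	const unsigned char *p = cp + 2;	/* skip kind and length bytes */
	int olen = optlen - 2, n = 0;

	if (optlen <= 2 || olen % TCPOLEN_SACK != 0)
		return (-1);			/* malformed option */
	while (olen > 0 && n < max) {
		memcpy(&blk[n].start, p, sizeof(tcp_seq));
		memcpy(&blk[n].end, p + sizeof(tcp_seq), sizeof(tcp_seq));
		blk[n].start = ntohl(blk[n].start);
		blk[n].end = ntohl(blk[n].end);
		p += TCPOLEN_SACK;
		olen -= TCPOLEN_SACK;
		n++;
	}
	return (n);
}

int
main(void)
{
	/* kind 5, len 10, one block [0x1000, 0x2000) in network byte order */
	unsigned char opt[] = { 5, 10,
	    0x00, 0x00, 0x10, 0x00,  0x00, 0x00, 0x20, 0x00 };
	struct sackblk blk[3];

	assert(parse_sack_blocks(opt, sizeof(opt), blk, 3) == 1);
	assert(blk[0].start == 0x1000 && blk[0].end == 0x2000);
	return (0);
}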
diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h
index d5b1f14..c029b12 100644
--- a/sys/netinet/tcp_seq.h
+++ b/sys/netinet/tcp_seq.h
@@ -42,6 +42,9 @@
#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
+#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
+#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
+
/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
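The new SEQ_MIN and SEQ_MAX macros inherit the modular comparison of SEQ_LT/SEQ_GT, so they select the correct value even when the 32-bit sequence space wraps. A small illustrative check:

/* Sketch: SEQ_MIN/SEQ_MAX behave correctly across 32-bit wraparound. */
#include <assert.h>
#include <stdint.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a,b)	((int)((a)-(b)) < 0)
#define SEQ_GT(a,b)	((int)((a)-(b)) > 0)
#define SEQ_MIN(a, b)	((SEQ_LT(a, b)) ? (a) : (b))
#define SEQ_MAX(a, b)	((SEQ_GT(a, b)) ? (a) : (b))

int
main(void)
{
	tcp_seq before_wrap = 0xfffffff0U;	/* 16 bytes before wrap */
	tcp_seq after_wrap = 0x00000010U;	/* 16 bytes after wrap */

	/* 0xfffffff0 - 0x10 = 0xffffffe0, i.e. -32 as a signed int. */
	assert(SEQ_LT(before_wrap, after_wrap));
	assert(SEQ_MIN(before_wrap, after_wrap) == before_wrap);
	assert(SEQ_MAX(before_wrap, after_wrap) == after_wrap);
	return (0);
}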
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index b5cfd43..c1add3f 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -36,6 +36,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
+
+int tcp_do_sack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
+ &tcp_do_sack, 0, "Enable/Disable TCP SACK support");
+
+int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
+ &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
+
+uma_zone_t sack_hole_zone;
+
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
@@ -292,6 +304,8 @@ tcp_init()
tcp_isn_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
+ sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
void
@@ -606,6 +620,7 @@ tcp_newtcpcb(inp)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (tcp_do_rfc1644)
tp->t_flags |= TF_REQ_CC;
+ tp->sack_enable = tcp_do_sack;
tp->t_inpcb = inp; /* XXX */
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
@@ -739,6 +754,7 @@ tcp_discardcb(tp)
tp->t_segqlen--;
tcp_reass_qsize--;
}
+ tcp_free_sackholes(tp);
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 57d6a93..dbbfbb1 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -39,6 +39,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -704,7 +705,10 @@ syncache_socket(sc, lso, m)
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
-
+ if (sc->sc_flags & SCF_SACK) {
+ tp->sack_enable = 1;
+ tp->t_flags |= TF_SACK_PERMIT;
+ }
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
@@ -991,6 +995,9 @@ syncache_add(inc, to, th, sop, m)
sc->sc_flags = SCF_SIGNATURE;
#endif
+ if (to->to_flags & TOF_SACK)
+ sc->sc_flags |= SCF_SACK;
+
/*
* XXX
* We have the option here of not doing TAO (even if the segment
@@ -1107,6 +1114,7 @@ syncache_respond(sc, m)
optlen += (sc->sc_flags & SCF_SIGNATURE) ?
TCPOLEN_SIGNATURE + 2 : 0;
#endif
+ optlen += ((sc->sc_flags & SCF_SACK) ? 4 : 0);
}
tlen = hlen + sizeof(struct tcphdr) + optlen;
@@ -1244,6 +1252,11 @@ syncache_respond(sc, m)
optp += TCPOLEN_SIGNATURE + 2;
}
#endif /* TCP_SIGNATURE */
+
+ if (sc->sc_flags & SCF_SACK) {
+ *(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR);
+ optp += 4;
+ }
}
#ifdef INET6
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index a23531f..44664ad 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -32,6 +32,7 @@
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -217,6 +218,7 @@ tcp_timer_2msl(xtp)
return;
}
INP_LOCK(inp);
+ tcp_free_sackholes(tp);
if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) {
INP_UNLOCK(tp->t_inpcb);
INP_INFO_WUNLOCK(&tcbinfo);
@@ -497,6 +499,7 @@ tcp_timer_rexmt(xtp)
return;
}
callout_deactivate(tp->tt_rexmt);
+ tcp_free_sackholes(tp);
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index b5cfd43..c1add3f 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -36,6 +36,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
+
+int tcp_do_sack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
+ &tcp_do_sack, 0, "Enable/Disable TCP SACK support");
+
+int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
+ &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
+
+uma_zone_t sack_hole_zone;
+
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
@@ -292,6 +304,8 @@ tcp_init()
tcp_isn_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
+ sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
void
@@ -606,6 +620,7 @@ tcp_newtcpcb(inp)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (tcp_do_rfc1644)
tp->t_flags |= TF_REQ_CC;
+ tp->sack_enable = tcp_do_sack;
tp->t_inpcb = inp; /* XXX */
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
@@ -739,6 +754,7 @@ tcp_discardcb(tp)
tp->t_segqlen--;
tcp_reass_qsize--;
}
+ tcp_free_sackholes(tp);
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 8c42b4d..0090210 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -52,6 +52,17 @@ LIST_HEAD(tsegqe_head, tseg_qent);
extern int tcp_reass_qsize;
extern struct uma_zone *tcp_reass_zone;
+struct sackblk {
+ tcp_seq start; /* start seq no. of sack block */
+ tcp_seq end; /* end seq no. */
+};
+
+struct sackhole {
+ tcp_seq start; /* start seq no. of hole */
+ tcp_seq end; /* end seq no. */
+ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
+ struct sackhole *next; /* next in list */
+};
struct tcptemp {
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tt_t;
@@ -179,6 +190,16 @@ struct tcpcb {
u_long rcv_second; /* start of interval second */
u_long rcv_pps; /* received packets per second */
u_long rcv_byps; /* received bytes per second */
+ /* SACK related state */
+ int sack_enable; /* enable SACK for this connection */
+ int snd_numholes; /* number of holes seen by sender */
+ struct sackhole *snd_holes; /* linked list of holes (sorted) */
+
+ tcp_seq rcv_laststart; /* start of last segment recd. */
+ tcp_seq rcv_lastend; /* end of ... */
+ tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
+ int rcv_numsacks; /* # distinct sack blks present */
+ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
@@ -216,6 +237,7 @@ struct tcpopt {
#define TOF_SCALE 0x0020
#define TOF_SIGNATURE 0x0040 /* signature option present */
#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */
+#define TOF_SACK 0x0100 /* Peer sent SACK option */
u_int32_t to_tsval;
u_int32_t to_tsecr;
tcp_cc to_cc; /* holds CC or CCnew */
@@ -249,6 +271,7 @@ struct syncache {
#define SCF_CC 0x08 /* negotiated CC */
#define SCF_UNREACH 0x10 /* icmp unreachable received */
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
+#define SCF_SACK 0x80 /* send SACK option */
TAILQ_ENTRY(syncache) sc_hash;
TAILQ_ENTRY(syncache) sc_timerq;
};
@@ -434,6 +457,13 @@ struct tcpstat {
u_long tcps_hc_added; /* entry added to hostcache */
u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
+
+ /* SACK related stats */
+ u_long tcps_sack_recovery_episode; /* SACK recovery episodes */
+ u_long tcps_sack_rexmits; /* SACK rexmit segments */
+ u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
+ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
+ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
};
/*
@@ -467,7 +497,8 @@ struct xtcpcb {
#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
-#define TCPCTL_MAXID 14
+#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */
+#define TCPCTL_MAXID 15
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@@ -505,6 +536,8 @@ extern int path_mtu_discovery;
extern int ss_fltsz;
extern int ss_fltsz_local;
+extern int tcp_do_sack; /* SACK enabled/disabled */
+
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
tcp_close(struct tcpcb *);
@@ -578,6 +611,20 @@ extern u_long tcp_sendspace;
extern u_long tcp_recvspace;
tcp_seq tcp_new_isn(struct tcpcb *);
+int tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int);
+void tcp_update_sack_list(struct tcpcb *tp);
+void tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
+void tcp_clean_sackreport(struct tcpcb *tp);
+void tcp_sack_adjust(struct tcpcb *tp);
+struct sackhole *tcp_sack_output(struct tcpcb *tp);
+void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
+void tcp_free_sackholes(struct tcpcb *tp);
+int tcp_newreno(struct tcpcb *, struct tcphdr *);
+u_long tcp_seq_subtract(u_long, u_long );
+#ifdef TCP_SACK_DEBUG
+void tcp_print_holes(struct tcpcb *tp);
+#endif /* TCP_SACK_DEBUG */
+
#endif /* _KERNEL */
#endif /* _NETINET_TCP_VAR_H_ */
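To make the new sender-side state concrete, a simplified, illustrative sketch of how the first SACK block beyond the cumulative ACK produces a single hole in the snd_holes scoreboard; the real code in tcp_sack_option() allocates from sack_hole_zone and also merges, trims and splits holes.

/*
 * Sketch: one SACK block turning into one scoreboard hole, as in the
 * "first hole" branch of tcp_sack_option().  Userland, malloc instead
 * of the UMA zone, no merging or splitting.
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint32_t tcp_seq;

struct sackhole {
	tcp_seq start;		/* start seq no. of hole */
	tcp_seq end;		/* end seq no. */
	tcp_seq rxmit;		/* next seq no. in hole to be retransmitted */
	struct sackhole *next;	/* next in list */
};

/* Peer acked up to th_ack and SACKed [sack_start, ...); the gap is a hole. */
static struct sackhole *
first_hole(tcp_seq th_ack, tcp_seq sack_start)
{
	struct sackhole *cur;

	cur = malloc(sizeof(*cur));
	if (cur == NULL)
		return (NULL);		/* ENOBUFS: ignore the block for now */
	cur->start = th_ack;
	cur->end = sack_start;
	cur->rxmit = cur->start;	/* retransmission resumes at the gap */
	cur->next = NULL;
	return (cur);
}

int
main(void)
{
	/* Cumulative ACK at 1000, peer reports SACK block [2000, 3000). */
	struct sackhole *h = first_hole(1000, 2000);

	assert(h != NULL && h->start == 1000 && h->end == 2000 && h->rxmit == 1000);
	free(h);
	return (0);
}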