| author | ps <ps@FreeBSD.org> | 2004-06-23 21:04:37 +0000 |
|---|---|---|
| committer | ps <ps@FreeBSD.org> | 2004-06-23 21:04:37 +0000 |
| commit | f5f3e8600b5cd41c8645b3a5d45e20092a8b9ee1 (patch) | |
| tree | 37eceb1297375660ec2d161a79ee2ec7364248b3 | |
| parent | 933faf5c3e0325440e1ef2edac115dd64ece174c (diff) | |
Add support for TCP Selective Acknowledgements. The work for this
originated on RELENG_4 and was ported to -CURRENT.
The scoreboard code was obtained from OpenBSD, and many of the
remaining changes were inspired by OpenBSD but not taken directly
from there.
You can enable or disable SACK with the net.inet.tcp.do_sack sysctl,
and you can limit the total number of SACK holes that all senders may
hold in the scoreboard with net.inet.tcp.sackhole_limit. (Illustrative
sketches of the sysctl knobs and of the sender scoreboard follow below.)
Reviewed by: gnn
Obtained from: Yahoo! (Mohan Srinivasan, Jayanth Vijayaraghavan)
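For reference, here is a minimal userland sketch of reading and toggling the two sysctls described above on a kernel that carries this change. It is illustrative only and not part of the commit; the only interface it assumes is the standard sysctlbyname(3) call, and the sysctl names are the ones introduced by the patch.

```c
/*
 * Illustrative only -- not part of this commit. Reads and toggles the
 * SACK sysctls added by the change; assumes a FreeBSD kernel that
 * provides net.inet.tcp.do_sack and net.inet.tcp.sackhole_limit.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int do_sack, hole_limit, on = 1;
	size_t len;

	len = sizeof(do_sack);
	if (sysctlbyname("net.inet.tcp.do_sack", &do_sack, &len, NULL, 0) == -1) {
		perror("sysctlbyname(net.inet.tcp.do_sack)");
		return (1);
	}
	len = sizeof(hole_limit);
	if (sysctlbyname("net.inet.tcp.sackhole_limit", &hole_limit, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname(net.inet.tcp.sackhole_limit)");
		return (1);
	}
	printf("do_sack=%d sackhole_limit=%d\n", do_sack, hole_limit);

	/* Turn SACK on; setting a sysctl needs sufficient privilege. */
	if (sysctlbyname("net.inet.tcp.do_sack", NULL, NULL, &on,
	    sizeof(on)) == -1) {
		perror("set net.inet.tcp.do_sack");
		return (1);
	}
	return (0);
}
```

From the command line, the equivalent is simply `sysctl net.inet.tcp.do_sack=1`.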
| -rw-r--r-- | sys/conf/files | 1 |
| -rw-r--r-- | sys/conf/options | 1 |
| -rw-r--r-- | sys/netinet/tcp.h | 12 |
| -rw-r--r-- | sys/netinet/tcp_input.c | 93 |
| -rw-r--r-- | sys/netinet/tcp_output.c | 118 |
| -rw-r--r-- | sys/netinet/tcp_reass.c | 93 |
| -rw-r--r-- | sys/netinet/tcp_sack.c | 592 |
| -rw-r--r-- | sys/netinet/tcp_seq.h | 3 |
| -rw-r--r-- | sys/netinet/tcp_subr.c | 16 |
| -rw-r--r-- | sys/netinet/tcp_syncache.c | 15 |
| -rw-r--r-- | sys/netinet/tcp_timer.c | 3 |
| -rw-r--r-- | sys/netinet/tcp_timewait.c | 16 |
| -rw-r--r-- | sys/netinet/tcp_var.h | 49 |
13 files changed, 975 insertions, 37 deletions
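Before the diff itself, a reader's note: the "scoreboard" the commit message mentions is, on the sender side, a sorted singly linked list of struct sackhole entries (see the tcp_var.h and tcp_sack.c hunks below). The standalone sketch that follows is illustrative only, not code from the patch; it shows just the splitting step that tcp_sack_option() performs when a received SACK block covers the middle of a hole, with malloc() standing in for the kernel's UMA sack_hole_zone and the rxmit bookkeeping simplified.

```c
/*
 * Illustrative only -- not code from the patch. A sender-side SACK
 * "hole" records a gap of unacknowledged data; when a SACK block acks
 * the middle of a hole, the hole is split in two (cf. tcp_sack_option()
 * in sys/netinet/tcp_sack.c below). malloc() stands in for the kernel's
 * UMA sack_hole_zone, and rxmit handling is simplified.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint32_t tcp_seq;

struct sackhole {
	tcp_seq start;		/* start seq no. of hole */
	tcp_seq end;		/* end seq no. */
	tcp_seq rxmit;		/* next seq no. in hole to be retransmitted */
	struct sackhole *next;	/* next (higher) hole in the list */
};

/* Split 'cur' around a SACK block [sack_start, sack_end) strictly inside it. */
static void
split_hole(struct sackhole *cur, tcp_seq sack_start, tcp_seq sack_end)
{
	struct sackhole *temp;

	if ((temp = malloc(sizeof(*temp))) == NULL)
		return;			/* as in the kernel: skip on ENOBUFS */
	temp->start = sack_end;		/* right half of the old hole */
	temp->end = cur->end;
	temp->rxmit = temp->start;
	temp->next = cur->next;
	cur->end = sack_start;		/* left half keeps the old start */
	cur->next = temp;
}

int
main(void)
{
	struct sackhole hole = { 1000, 5000, 1000, NULL };
	struct sackhole *p;

	split_hole(&hole, 2000, 3000);	/* peer SACKed bytes 2000..2999 */
	for (p = &hole; p != NULL; p = p->next)
		printf("hole %u-%u rxmit %u\n", p->start, p->end, p->rxmit);
	free(hole.next);
	return (0);
}
```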
diff --git a/sys/conf/files b/sys/conf/files index 0febe87..8c1a136 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1465,6 +1465,7 @@ netinet/tcp_debug.c optional tcpdebug netinet/tcp_hostcache.c optional inet netinet/tcp_input.c optional inet netinet/tcp_output.c optional inet +netinet/tcp_sack.c optional inet netinet/tcp_subr.c optional inet netinet/tcp_syncache.c optional inet netinet/tcp_timer.c optional inet diff --git a/sys/conf/options b/sys/conf/options index 62913a1..fdb385a 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -354,6 +354,7 @@ RANDOM_IP_ID SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCP_SIGNATURE opt_inet.h +TCP_SACK_DEBUG opt_tcp_sack.h TCP_DROP_SYNFIN opt_tcp_input.h XBONEHACK diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 1eee95c..92460d9 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -85,12 +85,15 @@ struct tcphdr { #define TCPOPT_SACK_PERMITTED 4 /* Experimental */ #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 /* Experimental */ +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_TSTAMP_HDR \ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) +#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */ + #define TCPOPT_CC 11 /* CC options: RFC-1644 */ #define TCPOPT_CCNEW 12 #define TCPOPT_CCECHO 13 @@ -101,6 +104,15 @@ struct tcphdr { #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 +/* Option definitions */ +#define TCPOPT_SACK_PERMIT_HDR \ +(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED) +#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8) +/* Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */ +#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */ + + /* * Default maximum segment size for TCP. * With an IP MTU of 576, this is 536, diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index b1b2284..581fe9a 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -37,6 +37,7 @@ #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/kernel.h> @@ -159,7 +160,9 @@ struct inpcbhead tcb; struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *, + int, int, struct tcphdr *); + static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -724,7 +727,7 @@ findpcb: * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th); if (tcp_timewait((struct tcptw *)inp->inp_ppcb, &to, th, m, tlen)) goto findpcb; @@ -938,7 +941,7 @@ findpcb: tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions(tp, &to, optp, optlen, 1, th); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { @@ -1054,7 +1057,7 @@ after_listen: * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. 
*/ - tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); + tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; @@ -1069,6 +1072,20 @@ after_listen: tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); + if (tp->sack_enable) { + if (!(to.to_flags & TOF_SACK)) + tp->sack_enable = 0; + else + tp->t_flags |= TF_SACK_PERMIT; + } + + } + + if (tp->sack_enable) { + /* Delete stale (cumulatively acked) SACK holes */ + tcp_del_sackholes(tp, th); + tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/ + tp->rcv_lastend = th->th_seq + tlen; } /* @@ -1120,9 +1137,10 @@ after_listen: if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && + ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || - (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) { + ((tcp_do_newreno || tp->sack_enable) && + !IN_FASTRECOVERY(tp)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* @@ -1218,6 +1236,9 @@ after_listen: * with nothing on the reassembly queue and * we have enough buffer space to take it. */ + /* Clean receiver SACK report if present */ + if (tp->sack_enable && tp->rcv_numsacks) + tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* @@ -1898,7 +1919,7 @@ trimthenstep6: th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - (tcp_do_newreno && + ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); @@ -1906,7 +1927,8 @@ trimthenstep6: } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; - if (tcp_do_newreno && + if ((tcp_do_newreno || + tp->sack_enable) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; @@ -1921,6 +1943,17 @@ trimthenstep6: tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; + if (tp->sack_enable) { + tcpstat.tcps_sack_recovery_episode++; + tp->snd_cwnd = + tp->t_maxseg * + tp->t_dupacks; + (void) tcp_output(tp); + tp->snd_cwnd = + tp->snd_ssthresh; + goto drop; + } + tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1971,12 +2004,16 @@ trimthenstep6: * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno) { + if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { - tcp_newreno_partial_ack(tp, th); + if (tp->sack_enable) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); } else { /* + * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. @@ -2098,7 +2135,8 @@ process_ACK: * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). 
*/ - if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) { + if ((!tcp_do_newreno && !tp->sack_enable) || + !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) @@ -2116,14 +2154,20 @@ process_ACK: } sowwakeup(so); /* detect una wraparound */ - if (tcp_do_newreno && !IN_FASTRECOVERY(tp) && + if ((tcp_do_newreno || tp->sack_enable) && + !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if (tcp_do_newreno && IN_FASTRECOVERY(tp) && + if ((tcp_do_newreno || tp->sack_enable) && + IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; + if (tp->sack_enable) { + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -2327,7 +2371,8 @@ dodata: /* XXX */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } - + if (tp->sack_enable) + tcp_update_sack_list(tp); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -2530,11 +2575,13 @@ drop: * Parse TCP options and place in tcpopt. */ static void -tcp_dooptions(to, cp, cnt, is_syn) +tcp_dooptions(tp, to, cp, cnt, is_syn, th) + struct tcpcb *tp; struct tcpopt *to; - u_char *cp; + u_char *cp; int cnt; int is_syn; + struct tcphdr *th; { int opt, optlen; @@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn) to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif + case TCPOPT_SACK_PERMITTED: + if (!tcp_do_sack || + optlen != TCPOLEN_SACK_PERMITTED) + continue; + if (is_syn) { + /* MUST only be set on SYN */ + to->to_flags |= TOF_SACK; + } + break; + + case TCPOPT_SACK: + if (!tp || tcp_sack_option(tp, th, cp, optlen)) + continue; + break; default: continue; } diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index aa7f58f..a662d0f 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -35,6 +35,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/systm.h> @@ -122,6 +123,8 @@ tcp_output(struct tcpcb *tp) u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; + int i, sack_rxmit; + struct sackhole *p; #if 0 int maxburst = TCP_MAXBURST; #endif @@ -171,6 +174,13 @@ tcp_output(struct tcpcb *tp) } } again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. + */ + if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); @@ -178,6 +188,36 @@ again: flags = tcp_outflags[tp->t_state]; /* + * Send any SACK-generated retransmissions. If we're explicitly trying + * to send out new data (when sendalot is 1), bypass this function. + * If we retransmit in fast recovery mode, decrement snd_cwnd, since + * we're replacing a (future) new transmission with a retransmission + * now, and we previously incremented snd_cwnd in tcp_input(). + */ + /* + * Still in sack recovery , reset rxmit flag to zero. 
+ */ + sack_rxmit = 0; + len = 0; + p = NULL; + if (tp->sack_enable && IN_FASTRECOVERY(tp) && + (p = tcp_sack_output(tp))) { + sack_rxmit = 1; + sendalot = 1; + off = p->rxmit - tp->snd_una; + KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd)); + /* Do not retransmit SACK segments beyond snd_recover */ + if (SEQ_GT(p->end, tp->snd_recover)) + len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit); + else + len = min(tp->snd_cwnd, p->end - p->rxmit); + if (len > 0) { + tcpstat.tcps_sack_rexmits++; + tcpstat.tcps_sack_rexmit_bytes += + min(len, tp->t_maxseg); + } + } + /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ @@ -230,9 +270,12 @@ again: * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. */ - len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off; - + if (!sack_rxmit) + len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); /* * Lop off SYN bit if it has already been sent. However, if this @@ -331,6 +374,8 @@ again: goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; + if (sack_rxmit) + goto send; } /* @@ -374,7 +419,18 @@ again: if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; - + /* + * In SACK, it is possible for tcp_output to fail to send a segment + * after the retransmission timer has been turned off. Make sure + * that the retransmission timer is set. + */ + if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) && + !callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) { + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + return (0); + } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window @@ -435,6 +491,19 @@ send: (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; + /* + * If this is the first SYN of connection (not a SYN + * ACK), include SACK_PERMIT_HDR option. If this is a + * SYN ACK, include SACK_PERMIT_HDR option if peer has + * already done so. This is only for active connect, + * since the syncache takes care of the passive connect. + */ + if (tp->sack_enable && ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_SACK_PERMIT))) { + *((u_int32_t *) (opt + optlen)) = + htonl(TCPOPT_SACK_PERMIT_HDR); + optlen += 4; + } if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { @@ -466,6 +535,32 @@ send: optlen += TCPOLEN_TSTAMP_APPA; } + /* + * Send SACKs if necessary. This should be the last option processed. + * Only as many SACKs are sent as are permitted by the maximum options + * size. No more than three SACKs are sent. 
+ */ + if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && + tp->rcv_numsacks) { + u_int32_t *lp = (u_int32_t *)(opt + optlen); + u_int32_t *olp = lp++; + int count = 0; /* actual number of SACKs inserted */ + int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; + + tcpstat.tcps_sack_send_blocks++; + maxsack = min(maxsack, TCP_MAX_SACK); + for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { + struct sackblk sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + *lp++ = htonl(sack.start); + *lp++ = htonl(sack.end); + count++; + } + *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); + optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ + } /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. @@ -734,6 +829,10 @@ send: th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); + if (sack_rxmit) { + th->th_seq = htonl(p->rxmit); + p->rxmit += len; + } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); @@ -831,6 +930,8 @@ send: tp->t_flags |= TF_SENTFIN; } } + if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt)) + goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; @@ -853,6 +954,17 @@ send: * Initialize shift counter which is used for backoff * of retransmit time. */ +timer: + if (tp->sack_enable && sack_rxmit && + !callout_active(tp->tt_rexmt) && + tp->snd_nxt != tp->snd_max) { + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + if (callout_active(tp->tt_persist)) { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + } if (!callout_active(tp->tt_rexmt) && tp->snd_nxt != tp->snd_una) { if (callout_active(tp->tt_persist)) { diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c index b1b2284..581fe9a 100644 --- a/sys/netinet/tcp_reass.c +++ b/sys/netinet/tcp_reass.c @@ -37,6 +37,7 @@ #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/kernel.h> @@ -159,7 +160,9 @@ struct inpcbhead tcb; struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *, + int, int, struct tcphdr *); + static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -724,7 +727,7 @@ findpcb: * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th); if (tcp_timewait((struct tcptw *)inp->inp_ppcb, &to, th, m, tlen)) goto findpcb; @@ -938,7 +941,7 @@ findpcb: tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions(tp, &to, optp, optlen, 1, th); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { @@ -1054,7 +1057,7 @@ after_listen: * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. 
*/ - tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); + tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; @@ -1069,6 +1072,20 @@ after_listen: tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); + if (tp->sack_enable) { + if (!(to.to_flags & TOF_SACK)) + tp->sack_enable = 0; + else + tp->t_flags |= TF_SACK_PERMIT; + } + + } + + if (tp->sack_enable) { + /* Delete stale (cumulatively acked) SACK holes */ + tcp_del_sackholes(tp, th); + tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/ + tp->rcv_lastend = th->th_seq + tlen; } /* @@ -1120,9 +1137,10 @@ after_listen: if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && + ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || - (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) { + ((tcp_do_newreno || tp->sack_enable) && + !IN_FASTRECOVERY(tp)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* @@ -1218,6 +1236,9 @@ after_listen: * with nothing on the reassembly queue and * we have enough buffer space to take it. */ + /* Clean receiver SACK report if present */ + if (tp->sack_enable && tp->rcv_numsacks) + tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* @@ -1898,7 +1919,7 @@ trimthenstep6: th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - (tcp_do_newreno && + ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); @@ -1906,7 +1927,8 @@ trimthenstep6: } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; - if (tcp_do_newreno && + if ((tcp_do_newreno || + tp->sack_enable) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; @@ -1921,6 +1943,17 @@ trimthenstep6: tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; + if (tp->sack_enable) { + tcpstat.tcps_sack_recovery_episode++; + tp->snd_cwnd = + tp->t_maxseg * + tp->t_dupacks; + (void) tcp_output(tp); + tp->snd_cwnd = + tp->snd_ssthresh; + goto drop; + } + tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1971,12 +2004,16 @@ trimthenstep6: * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno) { + if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { - tcp_newreno_partial_ack(tp, th); + if (tp->sack_enable) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); } else { /* + * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. @@ -2098,7 +2135,8 @@ process_ACK: * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). 
*/ - if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) { + if ((!tcp_do_newreno && !tp->sack_enable) || + !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) @@ -2116,14 +2154,20 @@ process_ACK: } sowwakeup(so); /* detect una wraparound */ - if (tcp_do_newreno && !IN_FASTRECOVERY(tp) && + if ((tcp_do_newreno || tp->sack_enable) && + !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if (tcp_do_newreno && IN_FASTRECOVERY(tp) && + if ((tcp_do_newreno || tp->sack_enable) && + IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; + if (tp->sack_enable) { + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -2327,7 +2371,8 @@ dodata: /* XXX */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } - + if (tp->sack_enable) + tcp_update_sack_list(tp); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -2530,11 +2575,13 @@ drop: * Parse TCP options and place in tcpopt. */ static void -tcp_dooptions(to, cp, cnt, is_syn) +tcp_dooptions(tp, to, cp, cnt, is_syn, th) + struct tcpcb *tp; struct tcpopt *to; - u_char *cp; + u_char *cp; int cnt; int is_syn; + struct tcphdr *th; { int opt, optlen; @@ -2623,6 +2670,20 @@ tcp_dooptions(to, cp, cnt, is_syn) to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif + case TCPOPT_SACK_PERMITTED: + if (!tcp_do_sack || + optlen != TCPOLEN_SACK_PERMITTED) + continue; + if (is_syn) { + /* MUST only be set on SYN */ + to->to_flags |= TOF_SACK; + } + break; + + case TCPOPT_SACK: + if (!tp || tcp_sack_option(tp, th, cp, optlen)) + continue; + break; default: continue; } diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c new file mode 100644 index 0000000..8dfa682 --- /dev/null +++ b/sys/netinet/tcp_sack.c @@ -0,0 +1,592 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 + * + * NRL grants permission for redistribution and use in source and binary + * forms, with or without modification, of the software and documentation + * created at NRL provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * This product includes software developed at the Information + * Technology Division, US Naval Research Laboratory. + * 4. Neither the name of the NRL nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS + * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of the US Naval + * Research Laboratory (NRL). + */ +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" +#include "opt_tcp_sack.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <vm/uma.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/in_var.h> +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/nd6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet6/tcp6_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> + +u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ +struct tcphdr tcp_savetcp; +#endif /* TCPDEBUG */ + +#ifdef FAST_IPSEC +#include <netipsec/ipsec.h> +#include <netipsec/ipsec6.h> +#endif + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netinet6/ipsec6.h> +#include <netkey/key.h> +#endif /*IPSEC*/ +#include <machine/in_cksum.h> + +extern struct uma_zone *sack_hole_zone; + +/* + * This function is called upon receipt of new valid data (while not in header + * prediction mode), and it updates the ordered list of sacks. + */ +void +tcp_update_sack_list(tp) + struct tcpcb *tp; +{ + /* + * First reported block MUST be the most recent one. Subsequent + * blocks SHOULD be in the order in which they arrived at the + * receiver. These two conditions make the implementation fully + * compliant with RFC 2018. 
+ */ + int i, j = 0, count = 0, lastpos = -1; + struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; + + INP_LOCK_ASSERT(tp->t_inpcb); + /* First clean up current list of sacks */ + for (i = 0; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) { + count++; /* count = number of blocks to be discarded */ + continue; + } + if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { + tp->sackblks[i].start = tp->sackblks[i].end = 0; + count++; + } else { + temp[j].start = tp->sackblks[i].start; + temp[j++].end = tp->sackblks[i].end; + } + } + tp->rcv_numsacks -= count; + if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ + tcp_clean_sackreport(tp); + if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) { + /* ==> need first sack block */ + tp->sackblks[0].start = tp->rcv_laststart; + tp->sackblks[0].end = tp->rcv_lastend; + tp->rcv_numsacks = 1; + } + return; + } + /* Otherwise, sack blocks are already present. */ + for (i = 0; i < tp->rcv_numsacks; i++) + tp->sackblks[i] = temp[i]; /* first copy back sack list */ + if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend)) + return; /* sack list remains unchanged */ + /* + * From here, segment just received should be (part of) the 1st sack. + * Go through list, possibly coalescing sack block entries. + */ + firstsack.start = tp->rcv_laststart; + firstsack.end = tp->rcv_lastend; + for (i = 0; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (SEQ_LT(sack.end, firstsack.start) || + SEQ_GT(sack.start, firstsack.end)) + continue; /* no overlap */ + if (sack.start == firstsack.start && sack.end == firstsack.end){ + /* + * identical block; delete it here since we will + * move it to the front of the list. + */ + tp->sackblks[i].start = tp->sackblks[i].end = 0; + lastpos = i; /* last posn with a zero entry */ + continue; + } + if (SEQ_LEQ(sack.start, firstsack.start)) + firstsack.start = sack.start; /* merge blocks */ + if (SEQ_GEQ(sack.end, firstsack.end)) + firstsack.end = sack.end; /* merge blocks */ + tp->sackblks[i].start = tp->sackblks[i].end = 0; + lastpos = i; /* last posn with a zero entry */ + } + if (lastpos != -1) { /* at least one merge */ + for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + temp[j++] = sack; + } + tp->rcv_numsacks = j; /* including first blk (added later) */ + for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ + tp->sackblks[i] = temp[i]; + } else { /* no merges -- shift sacks by 1 */ + if (tp->rcv_numsacks < MAX_SACK_BLKS) + tp->rcv_numsacks++; + for (i = tp->rcv_numsacks-1; i > 0; i--) + tp->sackblks[i] = tp->sackblks[i-1]; + } + tp->sackblks[0] = firstsack; + return; +} + +/* + * Delete all receiver-side SACK information. + */ +void +tcp_clean_sackreport(tp) + struct tcpcb *tp; +{ + int i; + + INP_LOCK_ASSERT(tp->t_inpcb); + tp->rcv_numsacks = 0; + for (i = 0; i < MAX_SACK_BLKS; i++) + tp->sackblks[i].start = tp->sackblks[i].end=0; +} + +/* + * Process the TCP SACK option. Returns 1 if tcp_dooptions() should continue, + * and 0 otherwise, if the option was fine. tp->snd_holes is an ordered list + * of holes (oldest to newest, in terms of the sequence space). 
+ */ +int +tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) +{ + int tmp_olen; + u_char *tmp_cp; + struct sackhole *cur, *p, *temp; + + INP_LOCK_ASSERT(tp->t_inpcb); + if (!tp->sack_enable) + return (1); + + /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ + if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) + return (1); + tmp_cp = cp + 2; + tmp_olen = optlen - 2; + tcpstat.tcps_sack_rcv_blocks++; + if (tp->snd_numholes < 0) + tp->snd_numholes = 0; + if (tp->t_maxseg == 0) + panic("tcp_sack_option"); /* Should never happen */ + while (tmp_olen > 0) { + struct sackblk sack; + + bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); + sack.start = ntohl(sack.start); + bcopy(tmp_cp + sizeof(tcp_seq), + (char *) &(sack.end), sizeof(tcp_seq)); + sack.end = ntohl(sack.end); + tmp_olen -= TCPOLEN_SACK; + tmp_cp += TCPOLEN_SACK; + if (SEQ_LEQ(sack.end, sack.start)) + continue; /* bad SACK fields */ + if (SEQ_LEQ(sack.end, tp->snd_una)) + continue; /* old block */ + if (SEQ_GT(th->th_ack, tp->snd_una)) { + if (SEQ_LT(sack.start, th->th_ack)) + continue; + } + if (SEQ_GT(sack.end, tp->snd_max)) + continue; + if (tp->snd_holes == NULL) { /* first hole */ + tp->snd_holes = (struct sackhole *) + uma_zalloc(sack_hole_zone,M_NOWAIT); + if (tp->snd_holes == NULL) { + /* ENOBUFS, so ignore SACKed block for now*/ + continue; + } + cur = tp->snd_holes; + cur->start = th->th_ack; + cur->end = sack.start; + cur->rxmit = cur->start; + cur->next = NULL; + tp->snd_numholes = 1; + tp->rcv_lastsack = sack.end; + continue; /* with next sack block */ + } + /* Go thru list of holes: p = previous, cur = current */ + p = cur = tp->snd_holes; + while (cur) { + if (SEQ_LEQ(sack.end, cur->start)) + /* SACKs data before the current hole */ + break; /* no use going through more holes */ + if (SEQ_GEQ(sack.start, cur->end)) { + /* SACKs data beyond the current hole */ + p = cur; + cur = cur->next; + continue; + } + if (SEQ_LEQ(sack.start, cur->start)) { + /* Data acks at least the beginning of hole */ + if (SEQ_GEQ(sack.end, cur->end)) { + /* Acks entire hole, so delete hole */ + if (p != cur) { + p->next = cur->next; + uma_zfree(sack_hole_zone, cur); + cur = p->next; + } else { + cur = cur->next; + uma_zfree(sack_hole_zone, p); + p = cur; + tp->snd_holes = p; + } + tp->snd_numholes--; + continue; + } + /* otherwise, move start of hole forward */ + cur->start = sack.end; + cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); + p = cur; + cur = cur->next; + continue; + } + /* move end of hole backward */ + if (SEQ_GEQ(sack.end, cur->end)) { + cur->end = sack.start; + cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); + p = cur; + cur = cur->next; + continue; + } + if (SEQ_LT(cur->start, sack.start) && + SEQ_GT(cur->end, sack.end)) { + /* + * ACKs some data in middle of a hole; need to + * split current hole + */ + temp = (struct sackhole *) + uma_zalloc(sack_hole_zone,M_NOWAIT); + if (temp == NULL) + continue; /* ENOBUFS */ + temp->next = cur->next; + temp->start = sack.end; + temp->end = cur->end; + temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); + cur->end = sack.start; + cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); + cur->next = temp; + p = temp; + cur = p->next; + tp->snd_numholes++; + } + } + /* At this point, p points to the last hole on the list */ + if (SEQ_LT(tp->rcv_lastsack, sack.start)) { + /* + * Need to append new hole at end. + * Last hole is p (and it's not NULL). 
+ */ + temp = (struct sackhole *) + uma_zalloc(sack_hole_zone,M_NOWAIT); + if (temp == NULL) + continue; /* ENOBUFS */ + temp->start = tp->rcv_lastsack; + temp->end = sack.start; + temp->rxmit = temp->start; + temp->next = 0; + p->next = temp; + tp->rcv_lastsack = sack.end; + tp->snd_numholes++; + } + } + return (0); +} + +/* + * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if + * it is completely acked; otherwise, tcp_sack_option(), called from + * tcp_dooptions(), will fix up the hole. + */ +void +tcp_del_sackholes(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + INP_LOCK_ASSERT(tp->t_inpcb); + if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { + /* max because this could be an older ack just arrived */ + tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? + th->th_ack : tp->snd_una; + struct sackhole *cur = tp->snd_holes; + struct sackhole *prev; + while (cur) + if (SEQ_LEQ(cur->end, lastack)) { + prev = cur; + cur = cur->next; + uma_zfree(sack_hole_zone, prev); + tp->snd_numholes--; + } else if (SEQ_LT(cur->start, lastack)) { + cur->start = lastack; + if (SEQ_LT(cur->rxmit, cur->start)) + cur->rxmit = cur->start; + break; + } else + break; + tp->snd_holes = cur; + } +} + +void +tcp_free_sackholes(struct tcpcb *tp) +{ + struct sackhole *p, *q; + + INP_LOCK_ASSERT(tp->t_inpcb); + q = tp->snd_holes; + while (q != NULL) { + p = q; + q = q->next; + uma_zfree(sack_hole_zone, p); + } + tp->snd_holes = 0; +} + +/* + * Checks for partial ack. If partial ack arrives, turn off retransmission + * timer, deflate the window, do not clear tp->t_dupacks, and return 1. + * If the ack advances at least to tp->snd_recover, return 0. + */ +void +tcp_sack_partialack(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + INP_LOCK_ASSERT(tp->t_inpcb); + u_long ocwnd = tp->snd_cwnd; + + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + /* + * Set snd_cwnd to one segment beyond acknowledged offset + * (tp->snd_una has not yet been updated when this function is called.) + */ + /* + * Should really be + * min(tp->snd_cwnd, tp->t_maxseg + (th->th_ack - tp->snd_una)) + */ + tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + /* + * Partial window deflation. Relies on fact that tp->snd_una + * not updated yet. + */ + tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); +} + +#ifdef TCP_SACK_DEBUG +void +tcp_print_holes(struct tcpcb *tp) +{ + struct sackhole *p = tp->snd_holes; + if (p == 0) + return; + printf("Hole report: start--end dups rxmit\n"); + while (p) { + printf("%x--%x r %x\n", p->start, p->end, p->rxmit); + p = p->next; + } + printf("\n"); +} +#endif /* TCP_SACK_DEBUG */ + +/* + * Returns pointer to a sackhole if there are any pending retransmissions; + * NULL otherwise. + */ +struct sackhole * +tcp_sack_output(struct tcpcb *tp) +{ + struct sackhole *p; + + INP_LOCK_ASSERT(tp->t_inpcb); + if (!tp->sack_enable) + return (NULL); + p = tp->snd_holes; + while (p) { + if (SEQ_LT(p->rxmit, p->end)) { + if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ + p = p->next; + continue; + } +#ifdef TCP_SACK_DEBUG + if (p) + tcp_print_holes(tp); +#endif + return (p); + } + p = p->next; + } + return (NULL); +} + +/* + * After a timeout, the SACK list may be rebuilt. This SACK information + * should be used to avoid retransmitting SACKed data. This function + * traverses the SACK list to see if snd_nxt should be moved forward. 
+ */ +void +tcp_sack_adjust(struct tcpcb *tp) +{ + INP_LOCK_ASSERT(tp->t_inpcb); + struct sackhole *cur = tp->snd_holes; + if (cur == NULL) + return; /* No holes */ + if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) + return; /* We're already beyond any SACKed blocks */ + /* + * Two cases for which we want to advance snd_nxt: + * i) snd_nxt lies between end of one hole and beginning of another + * ii) snd_nxt lies between end of last hole and rcv_lastsack + */ + while (cur->next) { + if (SEQ_LT(tp->snd_nxt, cur->end)) + return; + if (SEQ_GEQ(tp->snd_nxt, cur->next->start)) + cur = cur->next; + else { + tp->snd_nxt = cur->next->start; + return; + } + } + if (SEQ_LT(tp->snd_nxt, cur->end)) + return; + tp->snd_nxt = tp->rcv_lastsack; + return; +} + diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h index d5b1f14..c029b12 100644 --- a/sys/netinet/tcp_seq.h +++ b/sys/netinet/tcp_seq.h @@ -42,6 +42,9 @@ #define SEQ_GT(a,b) ((int)((a)-(b)) > 0) #define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) + /* for modulo comparisons of timestamps */ #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b5cfd43..c1add3f 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -36,6 +36,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/systm.h> @@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); + +int tcp_do_sack = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW, + &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); + +int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW, + &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements"); + +uma_zone_t sack_hole_zone; + static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_discardcb(struct tcpcb *); static void tcp_isn_tick(void *); @@ -292,6 +304,8 @@ tcp_init() tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); + sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } void @@ -606,6 +620,7 @@ tcp_newtcpcb(inp) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; + tp->sack_enable = tcp_do_sack; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no @@ -739,6 +754,7 @@ tcp_discardcb(tp) tp->t_segqlen--; tcp_reass_qsize--; } + tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 57d6a93..dbbfbb1 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -39,6 +39,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/systm.h> @@ -704,7 +705,10 @@ syncache_socket(sc, lso, m) if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif - + if (sc->sc_flags & SCF_SACK) { + tp->sack_enable = 1; + tp->t_flags |= TF_SACK_PERMIT; + } /* * Set up MSS and get cached values from tcp_hostcache. 
* This might overwrite some of the defaults we just set. @@ -991,6 +995,9 @@ syncache_add(inc, to, th, sop, m) sc->sc_flags = SCF_SIGNATURE; #endif + if (to->to_flags & TOF_SACK) + sc->sc_flags |= SCF_SACK; + /* * XXX * We have the option here of not doing TAO (even if the segment @@ -1107,6 +1114,7 @@ syncache_respond(sc, m) optlen += (sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGNATURE + 2 : 0; #endif + optlen += ((sc->sc_flags & SCF_SACK) ? 4 : 0); } tlen = hlen + sizeof(struct tcphdr) + optlen; @@ -1244,6 +1252,11 @@ syncache_respond(sc, m) optp += TCPOLEN_SIGNATURE + 2; } #endif /* TCP_SIGNATURE */ + + if (sc->sc_flags & SCF_SACK) { + *(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR); + optp += 4; + } } #ifdef INET6 diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index a23531f..44664ad 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -32,6 +32,7 @@ #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/kernel.h> @@ -217,6 +218,7 @@ tcp_timer_2msl(xtp) return; } INP_LOCK(inp); + tcp_free_sackholes(tp); if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { INP_UNLOCK(tp->t_inpcb); INP_INFO_WUNLOCK(&tcbinfo); @@ -497,6 +499,7 @@ tcp_timer_rexmt(xtp) return; } callout_deactivate(tp->tt_rexmt); + tcp_free_sackholes(tp); /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index b5cfd43..c1add3f 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -36,6 +36,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include <sys/param.h> #include <sys/systm.h> @@ -201,6 +202,17 @@ static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); + +int tcp_do_sack = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW, + &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); + +int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW, + &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements"); + +uma_zone_t sack_hole_zone; + static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_discardcb(struct tcpcb *); static void tcp_isn_tick(void *); @@ -292,6 +304,8 @@ tcp_init() tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); + sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } void @@ -606,6 +620,7 @@ tcp_newtcpcb(inp) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; + tp->sack_enable = tcp_do_sack; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no @@ -739,6 +754,7 @@ tcp_discardcb(tp) tp->t_segqlen--; tcp_reass_qsize--; } + tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 8c42b4d..0090210 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -52,6 +52,17 @@ LIST_HEAD(tsegqe_head, tseg_qent); extern int tcp_reass_qsize; extern struct uma_zone *tcp_reass_zone; +struct sackblk { + tcp_seq start; /* start seq no. of sack block */ + tcp_seq end; /* end seq no. 
*/ +}; + +struct sackhole { + tcp_seq start; /* start seq no. of hole */ + tcp_seq end; /* end seq no. */ + tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ + struct sackhole *next; /* next in list */ +}; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; @@ -179,6 +190,16 @@ struct tcpcb { u_long rcv_second; /* start of interval second */ u_long rcv_pps; /* received packets per second */ u_long rcv_byps; /* received bytes per second */ + /* SACK related state */ + int sack_enable; /* enable SACK for this connection */ + int snd_numholes; /* number of holes seen by sender */ + struct sackhole *snd_holes; /* linked list of holes (sorted) */ + + tcp_seq rcv_laststart; /* start of last segment recd. */ + tcp_seq rcv_lastend; /* end of ... */ + tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/ + int rcv_numsacks; /* # distinct sack blks present */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) @@ -216,6 +237,7 @@ struct tcpopt { #define TOF_SCALE 0x0020 #define TOF_SIGNATURE 0x0040 /* signature option present */ #define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ +#define TOF_SACK 0x0100 /* Peer sent SACK option */ u_int32_t to_tsval; u_int32_t to_tsecr; tcp_cc to_cc; /* holds CC or CCnew */ @@ -249,6 +271,7 @@ struct syncache { #define SCF_CC 0x08 /* negotiated CC */ #define SCF_UNREACH 0x10 /* icmp unreachable received */ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ +#define SCF_SACK 0x80 /* send SACK option */ TAILQ_ENTRY(syncache) sc_hash; TAILQ_ENTRY(syncache) sc_timerq; }; @@ -434,6 +457,13 @@ struct tcpstat { u_long tcps_hc_added; /* entry added to hostcache */ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ + + /* SACK related stats */ + u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ + u_long tcps_sack_rexmits; /* SACK rexmit segments */ + u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ + u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ + u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ }; /* @@ -467,7 +497,8 @@ struct xtcpcb { #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ -#define TCPCTL_MAXID 14 +#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ +#define TCPCTL_MAXID 15 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -505,6 +536,8 @@ extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local; +extern int tcp_do_sack; /* SACK enabled/disabled */ + void tcp_canceltimers(struct tcpcb *); struct tcpcb * tcp_close(struct tcpcb *); @@ -578,6 +611,20 @@ extern u_long tcp_sendspace; extern u_long tcp_recvspace; tcp_seq tcp_new_isn(struct tcpcb *); +int tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int); +void tcp_update_sack_list(struct tcpcb *tp); +void tcp_del_sackholes(struct tcpcb *, struct tcphdr *); +void tcp_clean_sackreport(struct tcpcb *tp); +void tcp_sack_adjust(struct tcpcb *tp); +struct sackhole *tcp_sack_output(struct tcpcb *tp); +void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); +void tcp_free_sackholes(struct tcpcb *tp); +int tcp_newreno(struct tcpcb *, struct tcphdr *); +u_long tcp_seq_subtract(u_long, u_long ); +#ifdef TCP_SACK_DEBUG +void tcp_print_holes(struct tcpcb *tp); +#endif /* TCP_SACK_DEBUG */ + #endif /* _KERNEL */ #endif /* 
_NETINET_TCP_VAR_H_ */
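Finally, a note on the sequence-number arithmetic the SACK code relies on: the comparisons in tcp_seq.h, including the SEQ_MIN/SEQ_MAX pair this commit adds, are modular, so they stay correct across 32-bit wraparound. The sketch below copies those macros from the patched header into a tiny userland program purely for illustration.

```c
/*
 * Illustration only: the modular sequence-number comparisons from
 * sys/netinet/tcp_seq.h, including the SEQ_MIN/SEQ_MAX macros added by
 * this commit, exercised across a 32-bit wraparound.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a,b)	((int)((a)-(b)) < 0)
#define SEQ_LEQ(a,b)	((int)((a)-(b)) <= 0)
#define SEQ_GT(a,b)	((int)((a)-(b)) > 0)
#define SEQ_GEQ(a,b)	((int)((a)-(b)) >= 0)
#define SEQ_MIN(a, b)	((SEQ_LT(a, b)) ? (a) : (b))
#define SEQ_MAX(a, b)	((SEQ_GT(a, b)) ? (a) : (b))

int
main(void)
{
	tcp_seq older = 0xfffffff0;	/* just before the 2^32 wrap */
	tcp_seq newer = 0x00000010;	/* just after the wrap */

	/* A plain '<' would get this wrong; the modular compare does not. */
	printf("SEQ_LT(older, newer) = %d\n", SEQ_LT(older, newer)); /* 1 */
	printf("SEQ_MIN = %#x (older)\n", (unsigned)SEQ_MIN(older, newer));
	printf("SEQ_MAX = %#x (newer)\n", (unsigned)SEQ_MAX(older, newer));
	return (0);
}
```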