diff options
-rw-r--r-- | sys/netinet/siftr.c | 4 | ||||
-rw-r--r-- | sys/netinet/tcp.h | 2 | ||||
-rw-r--r-- | sys/netinet/tcp_input.c | 4 | ||||
-rw-r--r-- | sys/netinet/tcp_output.c | 1 | ||||
-rw-r--r-- | sys/netinet/tcp_subr.c | 206 | ||||
-rw-r--r-- | sys/netinet/tcp_timer.h | 3 | ||||
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 22 | ||||
-rw-r--r-- | sys/netinet/tcp_var.h | 9 |
8 files changed, 16 insertions, 235 deletions
diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c index 34f3636..6097ad4 100644 --- a/sys/netinet/siftr.c +++ b/sys/netinet/siftr.c @@ -193,7 +193,7 @@ struct pkt_node { u_long snd_wnd; /* Receive Window (bytes). */ u_long rcv_wnd; - /* Bandwidth Controlled Window (bytes). */ + /* Unused (was: Bandwidth Controlled Window (bytes)). */ u_long snd_bwnd; /* Slow Start Threshold (bytes). */ u_long snd_ssthresh; @@ -775,7 +775,7 @@ siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp, pn->snd_cwnd = tp->snd_cwnd; pn->snd_wnd = tp->snd_wnd; pn->rcv_wnd = tp->rcv_wnd; - pn->snd_bwnd = tp->snd_bwnd; + pn->snd_bwnd = 0; /* Unused, kept for compat. */ pn->snd_ssthresh = tp->snd_ssthresh; pn->snd_scale = tp->snd_scale; pn->rcv_scale = tp->rcv_scale; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 27d45aa..62a89f7 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -221,7 +221,7 @@ struct tcp_info { /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ - u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ + u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index e4bddb9..22a2ea4 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1321,7 +1321,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); @@ -2278,7 +2277,6 @@ process_ACK: tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit @@ -3328,8 +3326,6 @@ tcp_mss(struct tcpcb *tp, int offer) tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } - if (metrics.rmx_bandwidth) - tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 50d0ee6..94b48fc 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -225,7 +225,6 @@ again: tso = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); - sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b537fb9..d19a91a 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -160,14 +160,6 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, "Default TCP Maximum Segment Size for IPv6"); #endif -static int -vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) -{ - - VNET_SYSCTL_ARG(req, arg1); - return (sysctl_msec_to_ticks(oidp, arg1, arg2, req)); -} - /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds @@ -213,50 +205,6 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. - */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static VNET_DEFINE(int, tcp_inflight_enable) = 0; -#define V_tcp_inflight_enable VNET(tcp_inflight_enable) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_enable), 0, - "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, - "Debug TCP inflight calculations"); - -static VNET_DEFINE(int, tcp_inflight_rttthresh); -#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh) -SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0, - vnet_sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static VNET_DEFINE(int, tcp_inflight_min) = 6144; -#define V_tcp_inflight_min VNET(tcp_inflight_min) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_min), 0, - "Lower-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT; -#define V_tcp_inflight_max VNET(tcp_inflight_max) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_max), 0, - "Upper-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_stab) = 20; -#define V_tcp_inflight_stab VNET(tcp_inflight_stab) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_stab), 0, - "Inflight Algorithm Stabilization 20 = 2 packets"); - #ifdef TCP_SORECEIVE_STREAM static int tcp_soreceive_stream = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, @@ -338,8 +286,6 @@ tcp_init(void) in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE); - V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; - /* * These have to be type stable for the benefit of the timers. */ @@ -728,10 +674,8 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; - tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -849,8 +793,6 @@ tcp_discardcb(struct tcpcb *tp) metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; - /* XXX: This wraps if the pipe is more than 4 Gbit per second */ - metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; @@ -1773,154 +1715,6 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) } #endif /* IPSEC */ -/* - * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING - * - * This code attempts to calculate the bandwidth-delay product as a - * means of determining the optimal window size to maximize bandwidth, - * minimize RTT, and avoid the over-allocation of buffers on interfaces and - * routers. This code also does a fairly good job keeping RTTs in check - * across slow links like modems. We implement an algorithm which is very - * similar (but not meant to be) TCP/Vegas. The code operates on the - * transmitter side of a TCP connection and so only effects the transmit - * side of the connection. - * - * BACKGROUND: TCP makes no provision for the management of buffer space - * at the end points or at the intermediate routers and switches. A TCP - * stream, whether using NewReno or not, will eventually buffer as - * many packets as it is able and the only reason this typically works is - * due to the fairly small default buffers made available for a connection - * (typicaly 16K or 32K). As machines use larger windows and/or window - * scaling it is now fairly easy for even a single TCP connection to blow-out - * all available buffer space not only on the local interface, but on - * intermediate routers and switches as well. NewReno makes a misguided - * attempt to 'solve' this problem by waiting for an actual failure to occur, - * then backing off, then steadily increasing the window again until another - * failure occurs, ad-infinitum. This results in terrible oscillation that - * is only made worse as network loads increase and the idea of intentionally - * blowing out network buffers is, frankly, a terrible way to manage network - * resources. - * - * It is far better to limit the transmit window prior to the failure - * condition being achieved. There are two general ways to do this: First - * you can 'scan' through different transmit window sizes and locate the - * point where the RTT stops increasing, indicating that you have filled the - * pipe, then scan backwards until you note that RTT stops decreasing, then - * repeat ad-infinitum. This method works in principle but has severe - * implementation issues due to RTT variances, timer granularity, and - * instability in the algorithm which can lead to many false positives and - * create oscillations as well as interact badly with other TCP streams - * implementing the same algorithm. - * - * The second method is to limit the window to the bandwidth delay product - * of the link. This is the method we implement. RTT variances and our - * own manipulation of the congestion window, bwnd, can potentially - * destabilize the algorithm. For this reason we have to stabilize the - * elements used to calculate the window. We do this by using the minimum - * observed RTT, the long term average of the observed bandwidth, and - * by adding two segments worth of slop. It isn't perfect but it is able - * to react to changing conditions and gives us a very stable basis on - * which to extend the algorithm. - */ -void -tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) -{ - u_long bw; - u_long bwnd; - int save_ticks; - - INP_WLOCK_ASSERT(tp->t_inpcb); - - /* - * If inflight_enable is disabled in the middle of a tcp connection, - * make sure snd_bwnd is effectively disabled. - */ - if (V_tcp_inflight_enable == 0 || - tp->t_rttlow < V_tcp_inflight_rttthresh) { - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bandwidth = 0; - return; - } - - /* - * Figure out the bandwidth. Due to the tick granularity this - * is a very rough number and it MUST be averaged over a fairly - * long period of time. XXX we need to take into account a link - * that is not using all available bandwidth, but for now our - * slop will ramp us up if this case occurs and the bandwidth later - * increases. - * - * Note: if ticks rollover 'bw' may wind up negative. We must - * effectively reset t_bw_rtttime for this case. - */ - save_ticks = ticks; - if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) - return; - - bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / - (save_ticks - tp->t_bw_rtttime); - tp->t_bw_rtttime = save_ticks; - tp->t_bw_rtseq = ack_seq; - if (tp->t_bw_rtttime == 0 || (int)bw < 0) - return; - bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; - - tp->snd_bandwidth = bw; - - /* - * Calculate the semi-static bandwidth delay product, plus two maximal - * segments. The additional slop puts us squarely in the sweet - * spot and also handles the bandwidth run-up case and stabilization. - * Without the slop we could be locking ourselves into a lower - * bandwidth. - * - * Situations Handled: - * (1) Prevents over-queueing of packets on LANs, especially on - * high speed LANs, allowing larger TCP buffers to be - * specified, and also does a good job preventing - * over-queueing of packets over choke points like modems - * (at least for the transmit side). - * - * (2) Is able to handle changing network loads (bandwidth - * drops so bwnd drops, bandwidth increases so bwnd - * increases). - * - * (3) Theoretically should stabilize in the face of multiple - * connections implementing the same algorithm (this may need - * a little work). - * - * (4) Stability value (defaults to 20 = 2 maximal packets) can - * be adjusted with a sysctl but typically only needs to be - * on very slow connections. A value no smaller then 5 - * should be used, but only reduce this default if you have - * no other choice. - */ -#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) - bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; -#undef USERTT - - if (tcp_inflight_debug > 0) { - static int ltime; - if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { - ltime = ticks; - printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", - tp, - bw, - tp->t_rttbest, - tp->t_srtt, - bwnd - ); - } - } - if ((long)bwnd < V_tcp_inflight_min) - bwnd = V_tcp_inflight_min; - if (bwnd > V_tcp_inflight_max) - bwnd = V_tcp_inflight_max; - if ((long)bwnd < tp->t_maxseg * 2) - bwnd = tp->t_maxseg * 2; - tp->snd_bwnd = bwnd; -} - #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h index 1ab0b7b..4bfcdf6 100644 --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -86,9 +86,6 @@ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ -#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight - disengages, in msec */ - #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 2e61c31..f35890b 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1105,7 +1105,6 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; @@ -1168,7 +1167,6 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; @@ -1214,7 +1212,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; - ti->tcpi_snd_bwnd = tp->snd_bwnd; + ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; @@ -1795,26 +1793,24 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); - db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", - tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); + db_printf("snd_wnd: %lu snd_cwnd: %lu\n", + tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); - db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " - "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, - tp->snd_recover); + db_printf("snd_ssthresh: %lu snd_recover: " + "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); - db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n", - tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime); + db_printf("t_rttime: %u t_rtsq: 0x%08x\n", + tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); - db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u " - "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg, - tp->t_srtt); + db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", + tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index b753e10..2b7abca 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -135,12 +135,12 @@ struct tcpcb { u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ - u_long snd_bwnd; /* bandwidth-controlled window */ + u_long snd_spare1; /* unused */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ - u_long snd_bandwidth; /* calculated bandwidth or 0 */ + u_long snd_spare2; /* unused */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ @@ -150,8 +150,8 @@ struct tcpcb { u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ - u_int t_bw_rtttime; /* used for bandwidth calculation */ - tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ + u_int t_bw_spare1; /* unused */ + tcp_seq t_bw_spare2; /* unused */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ @@ -654,7 +654,6 @@ void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, int, u_int); int tcp_timer_active(struct tcpcb *, int); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); -void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ |