path: root/sys/netinet/tcp_subr.c
author      dillon <dillon@FreeBSD.org>  2002-08-17 18:26:02 +0000
committer   dillon <dillon@FreeBSD.org>  2002-08-17 18:26:02 +0000
commit      bb806c49bfead601a2b3c7298f33acef7d26a4ef (patch)
tree        81ede6b84903af3b1f395ea052e7ad5eeb1fce0c /sys/netinet/tcp_subr.c
parent      f31168aa75e2bc569ba987f85d994af3c2662ccb (diff)
download    FreeBSD-src-bb806c49bfead601a2b3c7298f33acef7d26a4ef.zip
            FreeBSD-src-bb806c49bfead601a2b3c7298f33acef7d26a4ef.tar.gz
Implement TCP bandwidth delay product window limiting, similar to (but
not meant to duplicate) TCP/Vegas.  Add four sysctls and default the
implementation to 'off'.

    net.inet.tcp.inflight_enable    enable algorithm (defaults to 0=off)
    net.inet.tcp.inflight_debug     debugging (defaults to 1=on)
    net.inet.tcp.inflight_min       minimum window limit
    net.inet.tcp.inflight_max       maximum window limit

MFC after:  1 week
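The sysctls named above are plain integer OIDs, so they can be flipped at runtime with sysctl(8) (for example 'sysctl net.inet.tcp.inflight_enable=1') or programmatically through sysctlbyname(3). The short userland sketch below illustrates only that second route; it is not part of this commit, and the value written and the error handling are arbitrary choices for the example.

/*
 * Userland illustration only -- not part of this commit.  Enables the
 * inflight limiter and reads back the configured window bounds through
 * sysctlbyname(3).  Setting the value requires appropriate privilege.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	int enable = 1;			/* example value: turn the algorithm on */
	int minwin, maxwin;
	size_t len;

	if (sysctlbyname("net.inet.tcp.inflight_enable", NULL, NULL,
	    &enable, sizeof(enable)) == -1)
		err(1, "set inflight_enable");

	len = sizeof(minwin);
	if (sysctlbyname("net.inet.tcp.inflight_min", &minwin, &len,
	    NULL, 0) == -1)
		err(1, "get inflight_min");

	len = sizeof(maxwin);
	if (sysctlbyname("net.inet.tcp.inflight_max", &maxwin, &len,
	    NULL, 0) == -1)
		err(1, "get inflight_max");

	printf("inflight window clamped to [%d, %d] bytes\n", minwin, maxwin);
	return (0);
}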
Diffstat (limited to 'sys/netinet/tcp_subr.c')
-rw-r--r--   sys/netinet/tcp_subr.c   158
1 file changed, 158 insertions(+), 0 deletions(-)
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index fb6bd8be..45e5026 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -146,6 +146,27 @@ static int tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
&tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+/*
+ * TCP bandwidth limiting sysctls. Note that the default lower bound of
+ * 1024 exists only for debugging. A good production default would be
+ * something like 6100.
+ */
+static int tcp_inflight_enable = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+ &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int tcp_inflight_debug = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+ &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int tcp_inflight_min = 1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+ &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+ &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -566,8 +587,10 @@ tcp_newtcpcb(inp)
tp->t_rttmin = tcp_rexmit_min;
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
+ tp->t_bw_rtttime = ticks;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -1531,3 +1554,138 @@ static void
tcp_cleartaocache()
{
}
+
+/*
+ * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
+ *
+ * This code attempts to calculate the bandwidth-delay product as a
+ * means of determining the optimal window size to maximize bandwidth,
+ * minimize RTT, and avoid the over-allocation of buffers on interfaces and
+ * routers. This code also does a fairly good job keeping RTTs in check
+ * across slow links like modems. We implement an algorithm which is very
+ * similar to (but not meant to duplicate) TCP/Vegas. The code operates on
+ * the transmitter side of a TCP connection and so only affects the transmit
+ * side of the connection.
+ *
+ * BACKGROUND: TCP makes no provision for the management of buffer space
+ * at the end points or at the intermediate routers and switches. A TCP
+ * stream, whether using NewReno or not, will eventually buffer as
+ * many packets as it is able and the only reason this typically works is
+ * due to the fairly small default buffers made available for a connection
+ * (typically 16K or 32K). As machines use larger windows and/or window
+ * scaling it is now fairly easy for even a single TCP connection to blow out
+ * all available buffer space not only on the local interface, but on
+ * intermediate routers and switches as well. NewReno makes a misguided
+ * attempt to 'solve' this problem by waiting for an actual failure to occur,
+ * then backing off, then steadily increasing the window again until another
+ * failure occurs, ad infinitum. This results in terrible oscillation that
+ * is only made worse as network loads increase and the idea of intentionally
+ * blowing out network buffers is, frankly, a terrible way to manage network
+ * resources.
+ *
+ * It is far better to limit the transmit window prior to the failure
+ * condition being achieved. There are two general ways to do this: First
+ * you can 'scan' through different transmit window sizes and locate the
+ * point where the RTT stops increasing, indicating that you have filled the
+ * pipe, then scan backwards until you note that RTT stops decreasing, then
+ * repeat ad infinitum. This method works in principle but has severe
+ * implementation issues due to RTT variances, timer granularity, and
+ * instability in the algorithm which can lead to many false positives and
+ * create oscillations as well as interact badly with other TCP streams
+ * implementing the same algorithm.
+ *
+ * The second method is to limit the window to the bandwidth delay product
+ * of the link. This is the method we implement. RTT variances and our
+ * own manipulation of the bandwidth window, bwnd, can potentially
+ * destabilize the algorithm. For this reason we have to stabilize the
+ * elements used to calculate the window. We do this by using the minimum
+ * observed RTT, the long term average of the observed bandwidth, and
+ * by adding two segments' worth of slop. It isn't perfect but it is able
+ * to react to changing conditions and gives us a very stable basis on
+ * which to extend the algorithm.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+ u_long bw;
+ u_long bwnd;
+ int save_ticks;
+
+ /*
+ * If inflight_enable is disabled in the middle of a tcp connection,
+ * make sure snd_bwnd is effectively disabled.
+ */
+ if (tcp_inflight_enable == 0) {
+ tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_bandwidth = 0;
+ return;
+ }
+
+ /*
+ * Figure out the bandwidth. Due to the tick granularity this
+ * is a very rough number and it MUST be averaged over a fairly
+ * long period of time. XXX we need to take into account a link
+ * that is not using all available bandwidth, but for now our
+ * slop will ramp us up if this case occurs and the bandwidth later
+ * increases.
+ */
+ save_ticks = ticks;
+ if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+ return;
+
+ bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
+ (save_ticks - tp->t_bw_rtttime);
+ tp->t_bw_rtttime = save_ticks;
+ tp->t_bw_rtseq = ack_seq;
+ if (tp->t_bw_rtttime == 0)
+ return;
+ bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
+
+ tp->snd_bandwidth = bw;
+
+ /*
+ * Calculate the semi-static bandwidth delay product, plus two maximal
+ * segments. The additional slop puts us squarely in the sweet
+ * spot and also handles the bandwidth run-up case. Without the
+ * slop we could be locking ourselves into a lower bandwidth.
+ *
+ * Situations Handled:
+ * (1) Prevents over-queueing of packets on LANs, especially on
+ * high speed LANs, allowing larger TCP buffers to be
+ * specified, and also does a good job preventing
+ * over-queueing of packets over choke points like modems
+ * (at least for the transmit side).
+ *
+ * (2) Is able to handle changing network loads (bandwidth
+ * drops so bwnd drops, bandwidth increases so bwnd
+ * increases).
+ *
+ * (3) Theoretically should stabilize in the face of multiple
+ * connections implementing the same algorithm (this may need
+ * a little work).
+ */
+#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
+ bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+
+ if (tcp_inflight_debug > 0) {
+ static int ltime;
+ if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+ ltime = ticks;
+ printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+ tp,
+ bw,
+ tp->t_rttbest,
+ tp->t_srtt,
+ bwnd
+ );
+ }
+ }
+ if ((long)bwnd < tcp_inflight_min)
+ bwnd = tcp_inflight_min;
+ if (bwnd > tcp_inflight_max)
+ bwnd = tcp_inflight_max;
+ if ((long)bwnd < tp->t_maxseg * 2)
+ bwnd = tp->t_maxseg * 2;
+ tp->snd_bwnd = bwnd;
+}
+
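To make the arithmetic in tcp_xmit_bandwidth_limit() above easier to follow, here is a small standalone sketch (not kernel code) of its two key formulas: the 15/16 exponential moving average that smooths the measured bandwidth, and the bandwidth-delay product window padded with two maximal segments of slop. For simplicity it feeds in a plain smoothed RTT where the kernel averages t_srtt with t_rttbest, and the tick rate, shift constant, and sample numbers are stand-ins chosen for the example.

/*
 * Standalone illustration of the formulas used in tcp_xmit_bandwidth_limit()
 * above.  HZ, TCP_RTT_SHIFT and the sample values are example stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

#define HZ		100	/* example tick rate */
#define TCP_RTT_SHIFT	5	/* srtt is kept as ticks << TCP_RTT_SHIFT */

/* 15/16 moving average, as in: bw = (snd_bandwidth * 15 + bw) >> 4 */
static uint64_t
ewma_bw(uint64_t avg_bw, uint64_t sample_bw)
{
	return ((avg_bw * 15 + sample_bw) >> 4);
}

/*
 * bwnd = bw * rtt / (hz << TCP_RTT_SHIFT) + 2 * maxseg
 * (the kernel uses (t_srtt + t_rttbest) / 2 as the RTT term).
 */
static uint64_t
bdp_window(uint64_t bw, uint64_t srtt_shifted, uint32_t maxseg)
{
	return (bw * srtt_shifted / (HZ << TCP_RTT_SHIFT) + 2 * maxseg);
}

int
main(void)
{
	/* Example: ~1 Mbyte/s link, 50 ms smoothed RTT, 1460-byte segments. */
	uint64_t avg = 1000000;		/* long-term average, bytes/sec */
	uint64_t sample = 1200000;	/* latest bytes-acked / elapsed-time sample */
	uint64_t srtt = (50 * HZ / 1000) << TCP_RTT_SHIFT;

	avg = ewma_bw(avg, sample);
	printf("averaged bw  : %llu bytes/s\n", (unsigned long long)avg);
	printf("bwnd (limit) : %llu bytes\n",
	    (unsigned long long)bdp_window(avg, srtt, 1460));
	return (0);
}

With these example numbers the averaged bandwidth comes out to 1012500 bytes/s and the window limit to roughly 53.5 Kbytes, i.e. about one bandwidth-delay product plus two segments, which is the behaviour the comment block in the diff describes.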