author     andre <andre@FreeBSD.org>   2007-02-01 18:32:13 +0000
committer  andre <andre@FreeBSD.org>   2007-02-01 18:32:13 +0000
commit     25c4be862e487b1d4f9c6562a52fb07cf145908a
tree       7c81206310a4555c1c0c8da554782bf1b9af4a3e /sys
parent     ad9bb7722c1df7084e512c80e86b0b022110ed5b
Auto sizing TCP socket buffers.
Normally the socket buffers are static (either derived from global
defaults or set with setsockopt) and do not adapt to real network
conditions. Two things can go wrong: a) your socket buffers are too
small and you can't reach the full potential of the network path
between the two hosts; or b) your socket buffers are too big and you
waste a lot of kernel memory on data that is just sitting around.
With automatic sizing of the TCP send and receive socket buffers we can
start with a small buffer and quickly grow it in parallel with the TCP
congestion window to match real network conditions.
FreeBSD has a default 32K send socket buffer. This supports a maximum
transfer rate of only slightly more than 2Mbit/s on a trans-continental
link with 100ms RTT, and just above 1Mbit/s at 200ms. With TCP send
buffer auto-scaling and the default values below it supports 20Mbit/s
at 100ms and 10Mbit/s at 200ms. That is a tenfold improvement. The
receive side looks slightly better with its default buffer size of 64K.
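To make these numbers concrete: the throughput ceiling of a single TCP
connection is simply the window (bounded by the socket buffer) divided
by the RTT. A small illustrative userland program, written for this
write-up and not part of the commit, reproduces the figures above:

#include <stdio.h>

/* Throughput ceiling of one TCP connection: buffer (window) / RTT. */
int
main(void)
{
    const double rtt[] = { 0.100, 0.200 };                      /* seconds */
    const double buf[] = { 32 * 1024, 64 * 1024, 256 * 1024 };  /* bytes */

    for (unsigned i = 0; i < sizeof(buf) / sizeof(buf[0]); i++)
        for (unsigned j = 0; j < sizeof(rtt) / sizeof(rtt[0]); j++)
            printf("%3.0fK buffer, %3.0fms RTT: at most %5.2f Mbit/s\n",
                buf[i] / 1024, rtt[j] * 1000,
                buf[i] * 8 / rtt[j] / 1e6);
    return (0);
}

For 32K at 100ms this prints roughly 2.62 Mbit/s, and for 256K at 100ms
roughly 20.97 Mbit/s, matching the 2Mbit/s and 20Mbit/s figures quoted
above.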
New sysctls are:
net.inet.tcp.sendbuf_auto=1 (enabled)
net.inet.tcp.sendbuf_inc=8192 (8K, step size)
net.inet.tcp.sendbuf_max=262144 (256K, growth limit)
net.inet.tcp.recvbuf_auto=1 (enabled)
net.inet.tcp.recvbuf_inc=16384 (16K, step size)
net.inet.tcp.recvbuf_max=262144 (256K, growth limit)
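The knobs above are plain integer sysctls and can be inspected (or, with
the usual privileges, changed) through the standard sysctl(8) and
sysctl(3) interfaces. A minimal sketch reading two of them with
sysctlbyname(3), purely for illustration:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
    int auto_on, max;
    size_t len;

    len = sizeof(auto_on);
    if (sysctlbyname("net.inet.tcp.sendbuf_auto", &auto_on, &len,
        NULL, 0) == -1)
        err(1, "sysctlbyname(net.inet.tcp.sendbuf_auto)");

    len = sizeof(max);
    if (sysctlbyname("net.inet.tcp.sendbuf_max", &max, &len,
        NULL, 0) == -1)
        err(1, "sysctlbyname(net.inet.tcp.sendbuf_max)");

    printf("send buffer auto-sizing: %s, growth limit: %d bytes\n",
        auto_on ? "enabled" : "disabled", max);
    return (0);
}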
Tested by: many (on HEAD and RELENG_6)
Approved by: re
MFC after: 1 month
Diffstat (limited to 'sys')
 sys/netinet/tcp_input.c  | 84
 sys/netinet/tcp_output.c | 74
 sys/netinet/tcp_reass.c  | 84
 sys/netinet/tcp_usrreq.c |  2
 sys/netinet/tcp_var.h    |  2
 5 files changed, 236 insertions, 10 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 79d5e86..70462f1 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+    &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define tcb6 tcb  /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
         } else if (th->th_ack == tp->snd_una &&
             LIST_EMPTY(&tp->t_segq) &&
             tlen <= sbspace(&so->so_rcv)) {
+            int newsize = 0;    /* automatic sockbuf scaling */
+
             KASSERT(headlocked, ("headlocked"));
             INP_INFO_WUNLOCK(&tcbinfo);
             headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
             tcpstat.tcps_rcvpack++;
             tcpstat.tcps_rcvbyte += tlen;
             ND6_HINT(tp);    /* some progress has been done */
-            /*
 #ifdef TCPDEBUG
             if (so->so_options & SO_DEBUG)
                 tcp_trace(TA_INPUT, ostate, tp,
                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-             * Add data to socket buffer.
-             */
+            /*
+             * Automatic sizing of receive socket buffer.  Often the send
+             * buffer size is not optimally adjusted to the actual network
+             * conditions at hand (delay bandwidth product).  Setting the
+             * buffer size too small limits throughput on links with high
+             * bandwidth and high delay (eg. trans-continental/oceanic links).
+             *
+             * On the receive side the socket buffer memory is only rarely
+             * used to any significant extent.  This allows us to be much
+             * more aggressive in scaling the receive socket buffer.  For
+             * the case that the buffer space is actually used to a large
+             * extent and we run out of kernel memory we can simply drop
+             * the new segments; TCP on the sender will just retransmit it
+             * later.  Setting the buffer size too big may only consume too
+             * much kernel memory if the application doesn't read() from
+             * the socket or packet loss or reordering makes use of the
+             * reassembly queue.
+             *
+             * The criteria to step up the receive buffer one notch are:
+             *  1. the number of bytes received during the time it takes
+             *     one timestamp to be reflected back to us (the RTT);
+             *  2. received bytes per RTT is within seven eighth of the
+             *     current socket buffer size;
+             *  3. receive buffer size has not hit maximal automatic size;
+             *
+             * This algorithm does one step per RTT at most and only if
+             * we receive a bulk stream w/o packet losses or reorderings.
+             * Shrinking the buffer during idle times is not necessary as
+             * it doesn't consume any memory when idle.
+             *
+             * TODO: Only step up if the application is actually serving
+             * the buffer to better manage the socket buffer resources.
+             */
+            if (tcp_do_autorcvbuf &&
+                to.to_tsecr &&
+                (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+                if (to.to_tsecr > tp->rfbuf_ts &&
+                    to.to_tsecr - tp->rfbuf_ts < hz) {
+                    if (tp->rfbuf_cnt >
+                        (so->so_rcv.sb_hiwat / 8 * 7) &&
+                        so->so_rcv.sb_hiwat <
+                        tcp_autorcvbuf_max) {
+                        newsize =
+                            min(so->so_rcv.sb_hiwat +
+                            tcp_autorcvbuf_inc,
+                            tcp_autorcvbuf_max);
+                    }
+                    /* Start over with next RTT. */
+                    tp->rfbuf_ts = 0;
+                    tp->rfbuf_cnt = 0;
+                } else
+                    tp->rfbuf_cnt += tlen;    /* add up */
+            }
+
+            /* Add data to socket buffer. */
             SOCKBUF_LOCK(&so->so_rcv);
             if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
                 m_freem(m);
             } else {
+                /*
+                 * Set new socket buffer size.
+                 * Give up when limit is reached.
+                 */
+                if (newsize)
+                    if (!sbreserve_locked(&so->so_rcv,
+                        newsize, so, curthread))
+                        so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
                 m_adj(m, drop_hdrlen);    /* delayed header drop */
                 sbappendstream_locked(&so->so_rcv, m);
             }
@@ -1361,6 +1435,10 @@ after_listen:
         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
     }
 
+    /* Reset receive buffer auto scaling when not in bulk receive mode. */
+    tp->rfbuf_ts = 0;
+    tp->rfbuf_cnt = 0;
+
     switch (tp->t_state) {
 
     /*
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 25402cd..fdfe735 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -110,6 +110,19 @@ int tcp_do_tso = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
     &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
 
+int tcp_do_autosndbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+    &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+
+int tcp_autosndbuf_inc = 8*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+    &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+
+int tcp_autosndbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+    &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -380,11 +393,60 @@ after_sack_rexmit:
         }
     }
 
+    /* len will be >= 0 after this point. */
+    KASSERT(len >= 0, ("%s: len < 0", __func__));
+
     /*
-     * len will be >= 0 after this point.  Truncate to the maximum
-     * segment length or enable TCP Segmentation Offloading (if supported
-     * by hardware) and ensure that FIN is removed if the length no longer
-     * contains the last data byte.
+     * Automatic sizing of send socket buffer.  Often the send buffer
+     * size is not optimally adjusted to the actual network conditions
+     * at hand (delay bandwidth product).  Setting the buffer size too
+     * small limits throughput on links with high bandwidth and high
+     * delay (eg. trans-continental/oceanic links).  Setting the
+     * buffer size too big consumes too much real kernel memory,
+     * especially with many connections on busy servers.
+     *
+     * The criteria to step up the send buffer one notch are:
+     *  1. receive window of remote host is larger than send buffer
+     *     (with a fudge factor of 5/4th);
+     *  2. send buffer is filled to 7/8th with data (so we actually
+     *     have data to make use of it);
+     *  3. send buffer fill has not hit maximal automatic size;
+     *  4. our send window (slow start and cogestion controlled) is
+     *     larger than sent but unacknowledged data in send buffer.
+     *
+     * The remote host receive window scaling factor may limit the
+     * growing of the send buffer before it reaches its allowed
+     * maximum.
+     *
+     * It scales directly with slow start or congestion window
+     * and does at most one step per received ACK.  This fast
+     * scaling has the drawback of growing the send buffer beyond
+     * what is strictly necessary to make full use of a given
+     * delay*bandwith product.  However testing has shown this not
+     * to be much of an problem.  At worst we are trading wasting
+     * of available bandwith (the non-use of it) for wasting some
+     * socket buffer memory.
+     *
+     * TODO: Shrink send buffer during idle periods together
+     * with congestion window.  Requires another timer.  Has to
+     * wait for upcoming tcp timer rewrite.
+     */
+    if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+        if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+            so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
+            so->so_snd.sb_cc < tcp_autosndbuf_max &&
+            sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+            if (!sbreserve_locked(&so->so_snd,
+                min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
+                tcp_autosndbuf_max), so, curthread))
+                so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+        }
+    }
+
+    /*
+     * Truncate to the maximum segment length or enable TCP Segmentation
+     * Offloading (if supported by hardware) and ensure that FIN is removed
+     * if the length no longer contains the last data byte.
      *
      * TSO may only be used if we are in a pure bulk sending state.  The
      * presence of TCP-MD5, SACK retransmits, SACK advertizements and
@@ -606,6 +668,10 @@ send:
             optlen += TCPOLEN_TSTAMP_APPA;
         }
 
+        /* Set receive buffer autosizing timestamp. */
+        if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
+            tp->rfbuf_ts = ticks;
+
 #ifdef TCP_SIGNATURE
 #ifdef INET6
         if (!isipv6)
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index 79d5e86..70462f1 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+    &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define tcb6 tcb  /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
         } else if (th->th_ack == tp->snd_una &&
             LIST_EMPTY(&tp->t_segq) &&
             tlen <= sbspace(&so->so_rcv)) {
+            int newsize = 0;    /* automatic sockbuf scaling */
+
             KASSERT(headlocked, ("headlocked"));
             INP_INFO_WUNLOCK(&tcbinfo);
             headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
             tcpstat.tcps_rcvpack++;
             tcpstat.tcps_rcvbyte += tlen;
             ND6_HINT(tp);    /* some progress has been done */
-            /*
 #ifdef TCPDEBUG
             if (so->so_options & SO_DEBUG)
                 tcp_trace(TA_INPUT, ostate, tp,
                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-             * Add data to socket buffer.
-             */
+            /*
+             * Automatic sizing of receive socket buffer.  Often the send
+             * buffer size is not optimally adjusted to the actual network
+             * conditions at hand (delay bandwidth product).  Setting the
+             * buffer size too small limits throughput on links with high
+             * bandwidth and high delay (eg. trans-continental/oceanic links).
+             *
+             * On the receive side the socket buffer memory is only rarely
+             * used to any significant extent.  This allows us to be much
+             * more aggressive in scaling the receive socket buffer.  For
+             * the case that the buffer space is actually used to a large
+             * extent and we run out of kernel memory we can simply drop
+             * the new segments; TCP on the sender will just retransmit it
+             * later.  Setting the buffer size too big may only consume too
+             * much kernel memory if the application doesn't read() from
+             * the socket or packet loss or reordering makes use of the
+             * reassembly queue.
+             *
+             * The criteria to step up the receive buffer one notch are:
+             *  1. the number of bytes received during the time it takes
+             *     one timestamp to be reflected back to us (the RTT);
+             *  2. received bytes per RTT is within seven eighth of the
+             *     current socket buffer size;
+             *  3. receive buffer size has not hit maximal automatic size;
+             *
+             * This algorithm does one step per RTT at most and only if
+             * we receive a bulk stream w/o packet losses or reorderings.
+             * Shrinking the buffer during idle times is not necessary as
+             * it doesn't consume any memory when idle.
+             *
+             * TODO: Only step up if the application is actually serving
+             * the buffer to better manage the socket buffer resources.
+             */
+            if (tcp_do_autorcvbuf &&
+                to.to_tsecr &&
+                (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+                if (to.to_tsecr > tp->rfbuf_ts &&
+                    to.to_tsecr - tp->rfbuf_ts < hz) {
+                    if (tp->rfbuf_cnt >
+                        (so->so_rcv.sb_hiwat / 8 * 7) &&
+                        so->so_rcv.sb_hiwat <
+                        tcp_autorcvbuf_max) {
+                        newsize =
+                            min(so->so_rcv.sb_hiwat +
+                            tcp_autorcvbuf_inc,
+                            tcp_autorcvbuf_max);
+                    }
+                    /* Start over with next RTT. */
+                    tp->rfbuf_ts = 0;
+                    tp->rfbuf_cnt = 0;
+                } else
+                    tp->rfbuf_cnt += tlen;    /* add up */
+            }
+
+            /* Add data to socket buffer. */
             SOCKBUF_LOCK(&so->so_rcv);
             if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
                 m_freem(m);
             } else {
+                /*
+                 * Set new socket buffer size.
+                 * Give up when limit is reached.
+                 */
+                if (newsize)
+                    if (!sbreserve_locked(&so->so_rcv,
+                        newsize, so, curthread))
+                        so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
                 m_adj(m, drop_hdrlen);    /* delayed header drop */
                 sbappendstream_locked(&so->so_rcv, m);
             }
@@ -1361,6 +1435,10 @@ after_listen:
         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
     }
 
+    /* Reset receive buffer auto scaling when not in bulk receive mode. */
+    tp->rfbuf_ts = 0;
+    tp->rfbuf_cnt = 0;
+
     switch (tp->t_state) {
 
     /*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 3553661..1618143 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1446,6 +1446,8 @@ tcp_attach(so)
         if (error)
             return (error);
     }
+    so->so_rcv.sb_flags |= SB_AUTOSIZE;
+    so->so_snd.sb_flags |= SB_AUTOSIZE;
     INP_INFO_WLOCK(&tcbinfo);
     error = in_pcballoc(so, &tcbinfo);
     if (error) {
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 35c2533..4c50a82 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -202,6 +202,8 @@ struct tcpcb {
                        episode starts at this seq number */
     struct sackhint sackhint;    /* SACK scoreboard hint */
     int    t_rttlow;        /* smallest observerved RTT */
+    u_int32_t    rfbuf_ts;    /* recv buffer autoscaling timestamp */
+    int    rfbuf_cnt;        /* recv buffer autoscaling byte count */
 };
 
 #define IN_FASTRECOVERY(tp)    (tp->t_flags & TF_FASTRECOVERY)
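As a rough illustration of the receive-side policy implemented in the
tcp_input.c hunk above: assuming a bulk sender that always fills the
offered window and a lossless path, the buffer steps up by recvbuf_inc
at most once per RTT until it reaches recvbuf_max. The following
userland simulation is hypothetical (identifiers such as buf_hiwat are
invented for this sketch and are not kernel names):

#include <stdio.h>

#define RECVBUF_INC (16 * 1024)    /* net.inet.tcp.recvbuf_inc */
#define RECVBUF_MAX (256 * 1024)   /* net.inet.tcp.recvbuf_max */

int
main(void)
{
    unsigned buf_hiwat = 64 * 1024;    /* initial receive buffer (default) */
    unsigned rtt_no = 0;

    while (buf_hiwat < RECVBUF_MAX) {
        /* Bulk sender saturates whatever window we advertise. */
        unsigned rcvd_this_rtt = buf_hiwat;

        /* Step up only if more than 7/8 of the buffer was used this RTT. */
        if (rcvd_this_rtt > buf_hiwat / 8 * 7) {
            buf_hiwat += RECVBUF_INC;
            if (buf_hiwat > RECVBUF_MAX)
                buf_hiwat = RECVBUF_MAX;
        }
        printf("after RTT %2u: receive buffer %3u KB\n",
            ++rtt_no, buf_hiwat / 1024);
    }
    return (0);
}

Under these assumptions the buffer grows from 64K to the 256K default
ceiling in a dozen RTTs; any loss or reordering pauses the growth, as
described in the comment block in the diff.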