-rw-r--r--  sys/netinet/tcp_input.c    84
-rw-r--r--  sys/netinet/tcp_output.c   74
-rw-r--r--  sys/netinet/tcp_reass.c    84
-rw-r--r--  sys/netinet/tcp_usrreq.c    2
-rw-r--r--  sys/netinet/tcp_var.h       2
5 files changed, 236 insertions, 10 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 79d5e86..70462f1 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
&tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+ &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+ &tcp_autorcvbuf_inc, 0, "Increment step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+ &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
} else if (th->th_ack == tp->snd_una &&
LIST_EMPTY(&tp->t_segq) &&
tlen <= sbspace(&so->so_rcv)) {
+ int newsize = 0; /* automatic sockbuf scaling */
+
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
tcpstat.tcps_rcvpack++;
tcpstat.tcps_rcvbyte += tlen;
ND6_HINT(tp); /* some progress has been done */
- /*
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- * Add data to socket buffer.
- */
+ /*
+ * Automatic sizing of receive socket buffer.  Often the receive
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product).  Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (e.g. transcontinental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer.  If
+ * the buffer space actually is used to a large extent and we
+ * run out of kernel memory, we can simply drop the new
+ * segments; TCP on the sender will just retransmit them later.
+ * Setting the buffer size too big consumes too much kernel
+ * memory only if the application doesn't read() from the
+ * socket, or if packet loss or reordering forces use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. we measure the number of bytes received during the time
+ * it takes one timestamp to be reflected back to us (the RTT);
+ * 2. the bytes received per RTT exceed seven eighths of the
+ * current socket buffer size;
+ * 3. the receive buffer size has not hit the maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually draining
+ * the buffer, to better manage the socket buffer resources.
+ */
+ if (tcp_do_autorcvbuf &&
+ to.to_tsecr &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ if (to.to_tsecr > tp->rfbuf_ts &&
+ to.to_tsecr - tp->rfbuf_ts < hz) {
+ if (tp->rfbuf_cnt >
+ (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat <
+ tcp_autorcvbuf_max) {
+ newsize =
+ min(so->so_rcv.sb_hiwat +
+ tcp_autorcvbuf_inc,
+ tcp_autorcvbuf_max);
+ }
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ /* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
} else {
+ /*
+ * Set new socket buffer size.
+ * Give up when limit is reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, curthread))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m);
}
@@ -1361,6 +1435,10 @@ after_listen:
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+
switch (tp->t_state) {
/*
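The fast path above is the heart of the receive-side change: tcp_output() stamps tp->rfbuf_ts when it emits a timestamp option, and once that stamp is echoed back by the peer (i.e. one RTT later) the bytes accumulated in tp->rfbuf_cnt decide whether the buffer steps up. A minimal standalone C sketch of that decision follows; the function name, the pointer in/out parameters, and the hz placeholder are illustrative stand-ins rather than kernel API, and the caller is assumed to have already checked tcp_do_autorcvbuf, a nonzero tsecr, and SB_AUTOSIZE, as the hunk does.

#include <stdint.h>

static int tcp_autorcvbuf_inc = 16 * 1024;	/* sysctl defaults from the diff */
static int tcp_autorcvbuf_max = 256 * 1024;
static int hz = 1000;				/* ticks per second; assumed here */

/*
 * One in-order segment of tlen bytes arrived; tsecr is the peer's echo
 * of our timestamp clock.  Returns the new buffer size, or 0 to keep
 * the current one.  rfbuf_ts/rfbuf_cnt live in the connection state.
 */
static int
autorcvbuf_newsize(uint32_t tsecr, int tlen, int sb_hiwat,
    uint32_t *rfbuf_ts, int *rfbuf_cnt)
{
	int newsize = 0;

	if (tsecr > *rfbuf_ts && tsecr - *rfbuf_ts < (uint32_t)hz) {
		/* Our stamp came back: one RTT has elapsed. */
		if (*rfbuf_cnt > sb_hiwat / 8 * 7 &&
		    sb_hiwat < tcp_autorcvbuf_max) {
			newsize = sb_hiwat + tcp_autorcvbuf_inc;
			if (newsize > tcp_autorcvbuf_max)
				newsize = tcp_autorcvbuf_max;
		}
		/* Start over with the next RTT. */
		*rfbuf_ts = 0;
		*rfbuf_cnt = 0;
	} else
		*rfbuf_cnt += tlen;	/* still accumulating this RTT */
	return (newsize);
}

Echoes that arrive hz or more ticks after the stamp are not trusted for a measurement, and when the hunk's sbreserve_locked() call cannot actually reserve the returned size it clears SB_AUTOSIZE so the socket stops trying.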
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 25402cd..fdfe735 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -110,6 +110,19 @@ int tcp_do_tso = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
+int tcp_do_autosndbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+ &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+
+int tcp_autosndbuf_inc = 8*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+ &tcp_autosndbuf_inc, 0, "Increment step size of automatic send buffer");
+
+int tcp_autosndbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+ &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+
+
/*
* Tcp output routine: figure out what should be sent and send it.
*/
@@ -380,11 +393,60 @@ after_sack_rexmit:
}
}
+ /* len will be >= 0 after this point. */
+ KASSERT(len >= 0, ("%s: len < 0", __func__));
+
/*
- * len will be >= 0 after this point. Truncate to the maximum
- * segment length or enable TCP Segmentation Offloading (if supported
- * by hardware) and ensure that FIN is removed if the length no longer
- * contains the last data byte.
+ * Automatic sizing of send socket buffer. Often the send buffer
+ * size is not optimally adjusted to the actual network conditions
+ * at hand (delay bandwidth product). Setting the buffer size too
+ * small limits throughput on links with high bandwidth and high
+ * delay (e.g. transcontinental/oceanic links). Setting the
+ * buffer size too big consumes too much real kernel memory,
+ * especially with many connections on busy servers.
+ *
+ * The criteria to step up the send buffer one notch are:
+ * 1. the receive window of the remote host is larger than the
+ * send buffer (with a fudge factor of 5/4);
+ * 2. the send buffer is filled to 7/8 with data (so we actually
+ * have data to make use of it);
+ * 3. the send buffer fill has not hit the maximal automatic size;
+ * 4. our send window (slow start and congestion controlled) is
+ * larger than the sent but unacknowledged data in the send buffer.
+ *
+ * The remote host receive window scaling factor may limit the
+ * growth of the send buffer before it reaches its allowed
+ * maximum.
+ *
+ * It scales directly with the slow start or congestion window
+ * and does at most one step per received ACK.  This fast
+ * scaling has the drawback of growing the send buffer beyond
+ * what is strictly necessary to make full use of a given
+ * delay*bandwidth product.  However testing has shown this not
+ * to be much of a problem.  At worst we trade some wasted
+ * socket buffer memory for not leaving available bandwidth
+ * unused.
+ *
+ * TODO: Shrink send buffer during idle periods together
+ * with congestion window. Requires another timer. Has to
+ * wait for upcoming tcp timer rewrite.
+ */
+ if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+ if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+ so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
+ so->so_snd.sb_cc < tcp_autosndbuf_max &&
+ sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+ if (!sbreserve_locked(&so->so_snd,
+ min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
+ tcp_autosndbuf_max), so, curthread))
+ so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+ }
+ }
+
+ /*
+ * Truncate to the maximum segment length or enable TCP Segmentation
+ * Offloading (if supported by hardware) and ensure that FIN is removed
+ * if the length no longer contains the last data byte.
*
* TSO may only be used if we are in a pure bulk sending state. The
* presence of TCP-MD5, SACK retransmits, SACK advertisements and
@@ -606,6 +668,10 @@ send:
optlen += TCPOLEN_TSTAMP_APPA;
}
+ /* Set receive buffer autosizing timestamp. */
+ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
+ tp->rfbuf_ts = ticks;
+
#ifdef TCP_SIGNATURE
#ifdef INET6
if (!isipv6)
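On the send side there is no explicit RTT measurement: growth rides on incoming ACKs, at most one step per ACK, gated by the four numbered criteria in the comment above. Those criteria collapse into a single predicate; the sketch below restates them with parameter names mirroring the tcpcb/sockbuf fields used in the hunk — the function itself and the hard-coded maximum (the sysctl default rather than the live tunable) are stand-ins.

static const long autosndbuf_max = 256 * 1024;	/* tcp_autosndbuf_max default */

/*
 * Would tcp_output() grow the send buffer now?  Parameters mirror the
 * tcpcb/sockbuf fields used in the hunk above.
 */
static int
autosndbuf_should_grow(long snd_wnd, long sb_hiwat, long sb_cc,
    long sendwin, long snd_nxt, long snd_una)
{
	return (snd_wnd / 4 * 5 >= sb_hiwat &&		/* 1. peer window * 5/4 covers buffer */
	    sb_cc >= sb_hiwat / 8 * 7 &&		/* 2. buffer at least 7/8 full */
	    sb_cc < autosndbuf_max &&			/* 3. fill below the automatic max */
	    sendwin >= sb_cc - (snd_nxt - snd_una));	/* 4. send window covers the backlog */
}

When the predicate holds, the hunk grows the buffer to min(sb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max) via sbreserve_locked() and, as on the receive side, drops SB_AUTOSIZE if the reservation fails.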
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index 79d5e86..70462f1 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
&tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+ &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+ &tcp_autorcvbuf_inc, 0, "Increment step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+ &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
} else if (th->th_ack == tp->snd_una &&
LIST_EMPTY(&tp->t_segq) &&
tlen <= sbspace(&so->so_rcv)) {
+ int newsize = 0; /* automatic sockbuf scaling */
+
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
tcpstat.tcps_rcvpack++;
tcpstat.tcps_rcvbyte += tlen;
ND6_HINT(tp); /* some progress has been done */
- /*
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- * Add data to socket buffer.
- */
+ /*
+ * Automatic sizing of receive socket buffer.  Often the receive
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product).  Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (e.g. transcontinental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer.  If
+ * the buffer space actually is used to a large extent and we
+ * run out of kernel memory, we can simply drop the new
+ * segments; TCP on the sender will just retransmit them later.
+ * Setting the buffer size too big consumes too much kernel
+ * memory only if the application doesn't read() from the
+ * socket, or if packet loss or reordering forces use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. we measure the number of bytes received during the time
+ * it takes one timestamp to be reflected back to us (the RTT);
+ * 2. the bytes received per RTT exceed seven eighths of the
+ * current socket buffer size;
+ * 3. the receive buffer size has not hit the maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually draining
+ * the buffer, to better manage the socket buffer resources.
+ */
+ if (tcp_do_autorcvbuf &&
+ to.to_tsecr &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ if (to.to_tsecr > tp->rfbuf_ts &&
+ to.to_tsecr - tp->rfbuf_ts < hz) {
+ if (tp->rfbuf_cnt >
+ (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat <
+ tcp_autorcvbuf_max) {
+ newsize =
+ min(so->so_rcv.sb_hiwat +
+ tcp_autorcvbuf_inc,
+ tcp_autorcvbuf_max);
+ }
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ /* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
} else {
+ /*
+ * Set new socket buffer size.
+ * Give up when limit is reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, curthread))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m);
}
@@ -1361,6 +1435,10 @@ after_listen:
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+
switch (tp->t_state) {
/*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 3553661..1618143 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1446,6 +1446,8 @@ tcp_attach(so)
if (error)
return (error);
}
+ so->so_rcv.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= SB_AUTOSIZE;
INP_INFO_WLOCK(&tcbinfo);
error = in_pcballoc(so, &tcbinfo);
if (error) {
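The tcp_usrreq.c change is what switches the machinery on by default: every socket that passes through tcp_attach() starts with SB_AUTOSIZE set on both buffers, so from userland doing nothing gets autosizing. The opt-out sketch below is hypothetical; it assumes, per companion sockbuf handling not shown in this diff, that an explicit SO_RCVBUF/SO_SNDBUF request pins the size and disables autosizing.

#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Create a TCP socket.  With fixed_rcvbuf == 0 the kernel autosizes
 * the receive buffer (SB_AUTOSIZE set in tcp_attach() above); a
 * nonzero value requests a fixed size instead.  Sketch only.
 */
int
make_tcp_socket(int fixed_rcvbuf)
{
	int s;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s >= 0 && fixed_rcvbuf > 0)
		(void)setsockopt(s, SOL_SOCKET, SO_RCVBUF,
		    &fixed_rcvbuf, sizeof(fixed_rcvbuf));
	return (s);
}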
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 35c2533..4c50a82 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -202,6 +202,8 @@ struct tcpcb {
episode starts at this seq number */
struct sackhint sackhint; /* SACK scoreboard hint */
int t_rttlow; /* smallest observed RTT */
+ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
+ int rfbuf_cnt; /* recv buffer autoscaling byte count */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
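The two new tcpcb fields close the loop: tcp_output() stamps rfbuf_ts, and tcp_input() counts received bytes into rfbuf_cnt, resetting both once a stamp is echoed back or the connection leaves the bulk-receive fast path. All six tunables land under net.inet.tcp; as a usage sketch, a small FreeBSD userland program reading them via sysctl(3), with the knob names exactly as added in this diff:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	const char *knobs[] = {
		"net.inet.tcp.recvbuf_auto", "net.inet.tcp.recvbuf_inc",
		"net.inet.tcp.recvbuf_max",  "net.inet.tcp.sendbuf_auto",
		"net.inet.tcp.sendbuf_inc",  "net.inet.tcp.sendbuf_max",
	};
	int i, val;
	size_t len;

	/* Print each knob's current value. */
	for (i = 0; i < 6; i++) {
		len = sizeof(val);
		if (sysctlbyname(knobs[i], &val, &len, NULL, 0) == 0)
			printf("%s = %d\n", knobs[i], val);
	}
	return (0);
}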