-rw-r--r--   sys/netinet/tcp_input.c   |  84
-rw-r--r--   sys/netinet/tcp_output.c  |  74
-rw-r--r--   sys/netinet/tcp_reass.c   |  84
-rw-r--r--   sys/netinet/tcp_usrreq.c  |   2
-rw-r--r--   sys/netinet/tcp_var.h     |   2
5 files changed, 236 insertions, 10 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 79d5e86..70462f1 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+    &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define tcb6 tcb /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
     } else if (th->th_ack == tp->snd_una &&
         LIST_EMPTY(&tp->t_segq) &&
         tlen <= sbspace(&so->so_rcv)) {
+        int newsize = 0;    /* automatic sockbuf scaling */
+
         KASSERT(headlocked, ("headlocked"));
         INP_INFO_WUNLOCK(&tcbinfo);
         headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
         tcpstat.tcps_rcvpack++;
         tcpstat.tcps_rcvbyte += tlen;
         ND6_HINT(tp);    /* some progress has been done */
-        /*
 #ifdef TCPDEBUG
         if (so->so_options & SO_DEBUG)
             tcp_trace(TA_INPUT, ostate, tp,
                 (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-         * Add data to socket buffer.
-         */
+        /*
+         * Automatic sizing of receive socket buffer. Often the send
+         * buffer size is not optimally adjusted to the actual network
+         * conditions at hand (delay bandwidth product). Setting the
+         * buffer size too small limits throughput on links with high
+         * bandwidth and high delay (eg. trans-continental/oceanic links).
+         *
+         * On the receive side the socket buffer memory is only rarely
+         * used to any significant extent. This allows us to be much
+         * more aggressive in scaling the receive socket buffer. For
+         * the case that the buffer space is actually used to a large
+         * extent and we run out of kernel memory we can simply drop
+         * the new segments; TCP on the sender will just retransmit it
+         * later. Setting the buffer size too big may only consume too
+         * much kernel memory if the application doesn't read() from
+         * the socket or packet loss or reordering makes use of the
+         * reassembly queue.
+         *
+         * The criteria to step up the receive buffer one notch are:
+         *  1. the number of bytes received during the time it takes
+         *     one timestamp to be reflected back to us (the RTT);
+         *  2. received bytes per RTT is within seven eighth of the
+         *     current socket buffer size;
+         *  3. receive buffer size has not hit maximal automatic size;
+         *
+         * This algorithm does one step per RTT at most and only if
+         * we receive a bulk stream w/o packet losses or reorderings.
+         * Shrinking the buffer during idle times is not necessary as
+         * it doesn't consume any memory when idle.
+         *
+         * TODO: Only step up if the application is actually serving
+         * the buffer to better manage the socket buffer resources.
+         */
+        if (tcp_do_autorcvbuf &&
+            to.to_tsecr &&
+            (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+            if (to.to_tsecr > tp->rfbuf_ts &&
+                to.to_tsecr - tp->rfbuf_ts < hz) {
+                if (tp->rfbuf_cnt >
+                    (so->so_rcv.sb_hiwat / 8 * 7) &&
+                    so->so_rcv.sb_hiwat <
+                    tcp_autorcvbuf_max) {
+                    newsize =
+                        min(so->so_rcv.sb_hiwat +
+                        tcp_autorcvbuf_inc,
+                        tcp_autorcvbuf_max);
+                }
+                /* Start over with next RTT. */
+                tp->rfbuf_ts = 0;
+                tp->rfbuf_cnt = 0;
+            } else
+                tp->rfbuf_cnt += tlen;    /* add up */
+        }
+
+        /* Add data to socket buffer. */
         SOCKBUF_LOCK(&so->so_rcv);
         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
             m_freem(m);
         } else {
+            /*
+             * Set new socket buffer size.
+             * Give up when limit is reached.
+             */
+            if (newsize)
+                if (!sbreserve_locked(&so->so_rcv,
+                    newsize, so, curthread))
+                    so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
             m_adj(m, drop_hdrlen);    /* delayed header drop */
             sbappendstream_locked(&so->so_rcv, m);
         }
@@ -1361,6 +1435,10 @@ after_listen:
     tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
     }
 
+    /* Reset receive buffer auto scaling when not in bulk receive mode. */
+    tp->rfbuf_ts = 0;
+    tp->rfbuf_cnt = 0;
+
     switch (tp->t_state) {
 
     /*
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 25402cd..fdfe735 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -110,6 +110,19 @@ int tcp_do_tso = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
     &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
 
+int tcp_do_autosndbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+    &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+
+int tcp_autosndbuf_inc = 8*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+    &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+
+int tcp_autosndbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+    &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -380,11 +393,60 @@ after_sack_rexmit:
         }
     }
 
+    /* len will be >= 0 after this point. */
+    KASSERT(len >= 0, ("%s: len < 0", __func__));
+
     /*
-     * len will be >= 0 after this point. Truncate to the maximum
-     * segment length or enable TCP Segmentation Offloading (if supported
-     * by hardware) and ensure that FIN is removed if the length no longer
-     * contains the last data byte.
+     * Automatic sizing of send socket buffer. Often the send buffer
+     * size is not optimally adjusted to the actual network conditions
+     * at hand (delay bandwidth product). Setting the buffer size too
+     * small limits throughput on links with high bandwidth and high
+     * delay (eg. trans-continental/oceanic links). Setting the
+     * buffer size too big consumes too much real kernel memory,
+     * especially with many connections on busy servers.
+     *
+     * The criteria to step up the send buffer one notch are:
+     *  1. receive window of remote host is larger than send buffer
+     *     (with a fudge factor of 5/4th);
+     *  2. send buffer is filled to 7/8th with data (so we actually
+     *     have data to make use of it);
+     *  3. send buffer fill has not hit maximal automatic size;
+     *  4. our send window (slow start and cogestion controlled) is
+     *     larger than sent but unacknowledged data in send buffer.
+     *
+     * The remote host receive window scaling factor may limit the
+     * growing of the send buffer before it reaches its allowed
+     * maximum.
+     *
+     * It scales directly with slow start or congestion window
+     * and does at most one step per received ACK. This fast
+     * scaling has the drawback of growing the send buffer beyond
+     * what is strictly necessary to make full use of a given
+     * delay*bandwith product. However testing has shown this not
+     * to be much of an problem. At worst we are trading wasting
+     * of available bandwith (the non-use of it) for wasting some
+     * socket buffer memory.
+     *
+     * TODO: Shrink send buffer during idle periods together
+     * with congestion window. Requires another timer. Has to
+     * wait for upcoming tcp timer rewrite.
+     */
+    if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+        if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+            so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
+            so->so_snd.sb_cc < tcp_autosndbuf_max &&
+            sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+            if (!sbreserve_locked(&so->so_snd,
+                min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
+                tcp_autosndbuf_max), so, curthread))
+                so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+        }
+    }
+
+    /*
+     * Truncate to the maximum segment length or enable TCP Segmentation
+     * Offloading (if supported by hardware) and ensure that FIN is removed
+     * if the length no longer contains the last data byte.
      *
      * TSO may only be used if we are in a pure bulk sending state. The
      * presence of TCP-MD5, SACK retransmits, SACK advertizements and
@@ -606,6 +668,10 @@ send:
         optlen += TCPOLEN_TSTAMP_APPA;
     }
 
+    /* Set receive buffer autosizing timestamp. */
+    if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
+        tp->rfbuf_ts = ticks;
+
 #ifdef TCP_SIGNATURE
 #ifdef INET6
     if (!isipv6)
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index 79d5e86..70462f1 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -161,6 +161,18 @@ SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
+int tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+    &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+    &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+    &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define tcb6 tcb /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@ after_listen:
     } else if (th->th_ack == tp->snd_una &&
         LIST_EMPTY(&tp->t_segq) &&
         tlen <= sbspace(&so->so_rcv)) {
+        int newsize = 0;    /* automatic sockbuf scaling */
+
         KASSERT(headlocked, ("headlocked"));
         INP_INFO_WUNLOCK(&tcbinfo);
         headlocked = 0;
@@ -1321,18 +1335,78 @@ after_listen:
         tcpstat.tcps_rcvpack++;
         tcpstat.tcps_rcvbyte += tlen;
         ND6_HINT(tp);    /* some progress has been done */
-        /*
 #ifdef TCPDEBUG
         if (so->so_options & SO_DEBUG)
             tcp_trace(TA_INPUT, ostate, tp,
                 (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-         * Add data to socket buffer.
-         */
+        /*
+         * Automatic sizing of receive socket buffer. Often the send
+         * buffer size is not optimally adjusted to the actual network
+         * conditions at hand (delay bandwidth product). Setting the
+         * buffer size too small limits throughput on links with high
+         * bandwidth and high delay (eg. trans-continental/oceanic links).
+         *
+         * On the receive side the socket buffer memory is only rarely
+         * used to any significant extent. This allows us to be much
+         * more aggressive in scaling the receive socket buffer. For
+         * the case that the buffer space is actually used to a large
+         * extent and we run out of kernel memory we can simply drop
+         * the new segments; TCP on the sender will just retransmit it
+         * later. Setting the buffer size too big may only consume too
+         * much kernel memory if the application doesn't read() from
+         * the socket or packet loss or reordering makes use of the
+         * reassembly queue.
+         *
+         * The criteria to step up the receive buffer one notch are:
+         *  1. the number of bytes received during the time it takes
+         *     one timestamp to be reflected back to us (the RTT);
+         *  2. received bytes per RTT is within seven eighth of the
+         *     current socket buffer size;
+         *  3. receive buffer size has not hit maximal automatic size;
+         *
+         * This algorithm does one step per RTT at most and only if
+         * we receive a bulk stream w/o packet losses or reorderings.
+         * Shrinking the buffer during idle times is not necessary as
+         * it doesn't consume any memory when idle.
+         *
+         * TODO: Only step up if the application is actually serving
+         * the buffer to better manage the socket buffer resources.
+         */
+        if (tcp_do_autorcvbuf &&
+            to.to_tsecr &&
+            (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+            if (to.to_tsecr > tp->rfbuf_ts &&
+                to.to_tsecr - tp->rfbuf_ts < hz) {
+                if (tp->rfbuf_cnt >
+                    (so->so_rcv.sb_hiwat / 8 * 7) &&
+                    so->so_rcv.sb_hiwat <
+                    tcp_autorcvbuf_max) {
+                    newsize =
+                        min(so->so_rcv.sb_hiwat +
+                        tcp_autorcvbuf_inc,
+                        tcp_autorcvbuf_max);
+                }
+                /* Start over with next RTT. */
+                tp->rfbuf_ts = 0;
+                tp->rfbuf_cnt = 0;
+            } else
+                tp->rfbuf_cnt += tlen;    /* add up */
+        }
+
+        /* Add data to socket buffer. */
         SOCKBUF_LOCK(&so->so_rcv);
         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
             m_freem(m);
         } else {
+            /*
+             * Set new socket buffer size.
+             * Give up when limit is reached.
+             */
+            if (newsize)
+                if (!sbreserve_locked(&so->so_rcv,
+                    newsize, so, curthread))
+                    so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
             m_adj(m, drop_hdrlen);    /* delayed header drop */
             sbappendstream_locked(&so->so_rcv, m);
         }
@@ -1361,6 +1435,10 @@ after_listen:
     tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
     }
 
+    /* Reset receive buffer auto scaling when not in bulk receive mode. */
+    tp->rfbuf_ts = 0;
+    tp->rfbuf_cnt = 0;
+
     switch (tp->t_state) {
 
     /*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 3553661..1618143 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1446,6 +1446,8 @@ tcp_attach(so)
         if (error)
             return (error);
     }
+    so->so_rcv.sb_flags |= SB_AUTOSIZE;
+    so->so_snd.sb_flags |= SB_AUTOSIZE;
     INP_INFO_WLOCK(&tcbinfo);
     error = in_pcballoc(so, &tcbinfo);
     if (error) {
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 35c2533..4c50a82 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -202,6 +202,8 @@ struct tcpcb {
                        episode starts at this seq number */
     struct sackhint sackhint;  /* SACK scoreboard hint */
     int t_rttlow;              /* smallest observerved RTT */
+    u_int32_t rfbuf_ts;        /* recv buffer autoscaling timestamp */
+    int rfbuf_cnt;             /* recv buffer autoscaling byte count */
 };
 
 #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
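
For readers following the heuristics described in the comments above, here is a minimal userland sketch of the two step-up decisions the patch introduces. It is not kernel code: the function names (rcvbuf_step, sndbuf_step) and the plain-integer parameters are illustrative stand-ins for the sockbuf and tcpcb fields used in the diff (sb_hiwat, sb_cc, rfbuf_cnt, snd_wnd), and the constants simply mirror the defaults of the new sysctls (net.inet.tcp.recvbuf_inc/recvbuf_max and sendbuf_inc/sendbuf_max).

/*
 * Illustrative sketch of the autosizing step-up tests from the patch
 * above. Plain integers stand in for the kernel sockbuf/tcpcb state;
 * this is not the kernel API.
 */
#include <stdio.h>

#define AUTORCVBUF_INC	(16 * 1024)	/* net.inet.tcp.recvbuf_inc default */
#define AUTORCVBUF_MAX	(256 * 1024)	/* net.inet.tcp.recvbuf_max default */
#define AUTOSNDBUF_INC	(8 * 1024)	/* net.inet.tcp.sendbuf_inc default */
#define AUTOSNDBUF_MAX	(256 * 1024)	/* net.inet.tcp.sendbuf_max default */

static int
imin(int a, int b)
{
	return (a < b ? a : b);
}

/*
 * Receive side: grow one notch when the bytes received within one
 * timestamp round trip exceed 7/8 of the current buffer and the buffer
 * has not yet reached the automatic maximum. Returns the new size,
 * or 0 for "leave the buffer alone".
 */
static int
rcvbuf_step(int rfbuf_cnt, int rcv_hiwat)
{
	if (rfbuf_cnt > rcv_hiwat / 8 * 7 && rcv_hiwat < AUTORCVBUF_MAX)
		return (imin(rcv_hiwat + AUTORCVBUF_INC, AUTORCVBUF_MAX));
	return (0);
}

/*
 * Send side: grow one notch when the peer's receive window exceeds
 * 5/4 of the buffer, the buffer is at least 7/8 full, the fill is
 * below the automatic maximum, and the send window still covers the
 * data that is queued but not yet acknowledged.
 */
static int
sndbuf_step(int snd_wnd, int snd_hiwat, int snd_cc, int sendwin, int unacked)
{
	if (snd_wnd / 4 * 5 >= snd_hiwat &&
	    snd_cc >= snd_hiwat / 8 * 7 &&
	    snd_cc < AUTOSNDBUF_MAX &&
	    sendwin >= snd_cc - unacked)
		return (imin(snd_hiwat + AUTOSNDBUF_INC, AUTOSNDBUF_MAX));
	return (0);
}

int
main(void)
{
	/* 80 KB arrived within one RTT against a 64 KB buffer: step up. */
	printf("rcvbuf -> %d\n", rcvbuf_step(80 * 1024, 64 * 1024));
	/* Peer window 128 KB, 32 KB buffer nearly full, window open: step up. */
	printf("sndbuf -> %d\n",
	    sndbuf_step(128 * 1024, 32 * 1024, 30 * 1024, 64 * 1024, 4 * 1024));
	return (0);
}

In the kernel the grow step additionally requires sbreserve_locked() to accept the new size; if it fails, the patch clears SB_AUTOSIZE on that socket buffer and gives up on automatic scaling for the connection.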