summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsmh <smh@FreeBSD.org>2017-04-24 16:31:28 +0000
committersmh <smh@FreeBSD.org>2017-04-24 16:31:28 +0000
commit5f7e779f9ffada568a00fd3d20896316ecf69996 (patch)
treefe6aba59a930b30016ee7d2ec96ff36628b01719
parente7481cb7514ab48287a87757be4290e9ce05b949 (diff)
downloadFreeBSD-src-5f7e779f9ffada568a00fd3d20896316ecf69996.zip
FreeBSD-src-5f7e779f9ffada568a00fd3d20896316ecf69996.tar.gz
Partial MFC r316676 and the required r313045
MFC r316676: Use estimated RTT for receive buffer auto resizing instead of timestamps. This is a partial MFC as stable/10 doesn't include the TCP stack modularisation. MFC r313045: Add an mbuf to ipinfo_t translator to finish cleanup of mbuf passing to TCP probes. This is a partial MFC (missing debug__output & debug__drop changes) due to the massive amount of additional dtrace changes that would be required for a full MFC. Relnotes: Yes Sponsored by: Multiplay
-rw-r--r--cddl/lib/libdtrace/ip.d18
-rw-r--r--sys/netinet/in_kdtrace.c18
-rw-r--r--sys/netinet/in_kdtrace.h1
-rw-r--r--sys/netinet/tcp_input.c121
-rw-r--r--sys/netinet/tcp_output.c10
-rw-r--r--sys/netinet/tcp_var.h2
6 files changed, 103 insertions, 67 deletions
diff --git a/cddl/lib/libdtrace/ip.d b/cddl/lib/libdtrace/ip.d
index f8fa9b1..15dfc34 100644
--- a/cddl/lib/libdtrace/ip.d
+++ b/cddl/lib/libdtrace/ip.d
@@ -240,6 +240,24 @@ translator ipinfo_t < uint8_t *p > {
#pragma D binding "1.0" IFF_LOOPBACK
inline int IFF_LOOPBACK = 0x8;
+#pragma D binding "1.13" translator
+translator ipinfo_t < struct mbuf *m > {
+ ip_ver = m == NULL ? 0 : ((struct ip *)m->m_data)->ip_v;
+ ip_plength = m == NULL ? 0 :
+ ((struct ip *)m->m_data)->ip_v == 4 ?
+ ntohs(((struct ip *)m->m_data)->ip_len) -
+ (((struct ip *)m->m_data)->ip_hl << 2):
+ ntohs(((struct ip6_hdr *)m->m_data)->ip6_ctlun.ip6_un1.ip6_un1_plen);
+ ip_saddr = m == NULL ? 0 :
+ ((struct ip *)m->m_data)->ip_v == 4 ?
+ inet_ntoa(&((struct ip *)m->m_data)->ip_src.s_addr) :
+ inet_ntoa6(&((struct ip6_hdr *)m->m_data)->ip6_src);
+ ip_daddr = m == NULL ? 0 :
+ ((struct ip *)m->m_data)->ip_v == 4 ?
+ inet_ntoa(&((struct ip *)m->m_data)->ip_dst.s_addr) :
+ inet_ntoa6(&((struct ip6_hdr *)m->m_data)->ip6_dst);
+};
+
#pragma D binding "1.0" translator
translator ifinfo_t < struct ifnet *p > {
if_name = p->if_xname;
diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c
index 785ddcb..d34abc5 100644
--- a/sys/netinet/in_kdtrace.c
+++ b/sys/netinet/in_kdtrace.c
@@ -58,28 +58,28 @@ SDT_PROBE_DEFINE6_XLATE(ip, , , send,
SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__established,
"void *", "pktinfo_t *",
"struct tcpcb *", "csinfo_t *",
- "uint8_t *", "ipinfo_t *",
+ "struct mbuf *", "ipinfo_t *",
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfoh_t *");
SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__refused,
"void *", "pktinfo_t *",
"struct tcpcb *", "csinfo_t *",
- "uint8_t *", "ipinfo_t *",
+ "struct mbuf *", "ipinfo_t *",
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfo_t *");
SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__established,
"void *", "pktinfo_t *",
"struct tcpcb *", "csinfo_t *",
- "uint8_t *", "ipinfo_t *",
+ "struct mbuf *", "ipinfo_t *",
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfoh_t *");
SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__refused,
"void *", "pktinfo_t *",
"struct tcpcb *", "csinfo_t *",
- "uint8_t *", "ipinfo_t *",
+ "struct mbuf *", "ipinfo_t *",
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfoh_t *");
@@ -93,7 +93,7 @@ SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__request,
SDT_PROBE_DEFINE5_XLATE(tcp, , , receive,
"void *", "pktinfo_t *",
"struct tcpcb *", "csinfo_t *",
- "uint8_t *", "ipinfo_t *",
+ "struct mbuf *", "ipinfo_t *",
"struct tcpcb *", "tcpsinfo_t *" ,
"struct tcphdr *", "tcpinfoh_t *");
@@ -112,6 +112,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change,
"void *", "void *",
"int", "tcplsinfo_t *");
+SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
+ "void *", "void *",
+ "struct tcpcb *", "csinfo_t *",
+ "struct mbuf *", "ipinfo_t *",
+ "struct tcpcb *", "tcpsinfo_t *" ,
+ "struct tcphdr *", "tcpinfoh_t *",
+ "int", "int");
+
SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
"void *", "pktinfo_t *",
"struct inpcb *", "csinfo_t *",
diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h
index 07b3e3a..770142c 100644
--- a/sys/netinet/in_kdtrace.h
+++ b/sys/netinet/in_kdtrace.h
@@ -52,6 +52,7 @@ SDT_PROBE_DECLARE(tcp, , , connect__request);
SDT_PROBE_DECLARE(tcp, , , receive);
SDT_PROBE_DECLARE(tcp, , , send);
SDT_PROBE_DECLARE(tcp, , , state__change);
+SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
SDT_PROBE_DECLARE(udp, , , receive);
SDT_PROBE_DECLARE(udp, , , send);
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 254454f..95c4d53 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1469,6 +1469,68 @@ drop:
m_freem(m);
}
+/*
+ * Automatic sizing of receive socket buffer. Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product). Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer. For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+ * the new segments; TCP on the sender will just retransmit it
+ * later. Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. Application has not set receive buffer size with
+ * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ * 2. the number of bytes received during the time it takes
+ * one timestamp to be reflected back to us (the RTT);
+ * 3. received bytes per RTT is within seven eighth of the
+ * current socket buffer size;
+ * 4. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+int
+tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int tlen)
+{
+ int newsize = 0;
+
+ if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
+ tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
+ TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
+ (tp->t_srtt >> TCP_RTT_SHIFT)) {
+ if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
+ newsize = min(so->so_rcv.sb_hiwat +
+ V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
+ }
+ TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
+
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else {
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ return (newsize);
+}
+
static void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
@@ -1811,60 +1873,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- /*
- * Automatic sizing of receive socket buffer. Often the send
- * buffer size is not optimally adjusted to the actual network
- * conditions at hand (delay bandwidth product). Setting the
- * buffer size too small limits throughput on links with high
- * bandwidth and high delay (eg. trans-continental/oceanic links).
- *
- * On the receive side the socket buffer memory is only rarely
- * used to any significant extent. This allows us to be much
- * more aggressive in scaling the receive socket buffer. For
- * the case that the buffer space is actually used to a large
- * extent and we run out of kernel memory we can simply drop
- * the new segments; TCP on the sender will just retransmit it
- * later. Setting the buffer size too big may only consume too
- * much kernel memory if the application doesn't read() from
- * the socket or packet loss or reordering makes use of the
- * reassembly queue.
- *
- * The criteria to step up the receive buffer one notch are:
- * 1. the number of bytes received during the time it takes
- * one timestamp to be reflected back to us (the RTT);
- * 2. received bytes per RTT is within seven eighth of the
- * current socket buffer size;
- * 3. receive buffer size has not hit maximal automatic size;
- *
- * This algorithm does one step per RTT at most and only if
- * we receive a bulk stream w/o packet losses or reorderings.
- * Shrinking the buffer during idle times is not necessary as
- * it doesn't consume any memory when idle.
- *
- * TODO: Only step up if the application is actually serving
- * the buffer to better manage the socket buffer resources.
- */
- if (V_tcp_do_autorcvbuf &&
- (to.to_flags & TOF_TS) &&
- to.to_tsecr &&
- (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
- if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
- to.to_tsecr - tp->rfbuf_ts < hz) {
- if (tp->rfbuf_cnt >
- (so->so_rcv.sb_hiwat / 8 * 7) &&
- so->so_rcv.sb_hiwat <
- V_tcp_autorcvbuf_max) {
- newsize =
- min(so->so_rcv.sb_hiwat +
- V_tcp_autorcvbuf_inc,
- V_tcp_autorcvbuf_max);
- }
- /* Start over with next RTT. */
- tp->rfbuf_ts = 0;
- tp->rfbuf_cnt = 0;
- } else
- tp->rfbuf_cnt += tlen; /* add up */
- }
+ newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
/* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
@@ -1905,10 +1914,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
win = 0;
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
- /* Reset receive buffer auto scaling when not in bulk receive mode. */
- tp->rfbuf_ts = 0;
- tp->rfbuf_cnt = 0;
-
switch (tp->t_state) {
/*
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 6dbfeaf..cc45d9d 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -790,11 +790,13 @@ send:
to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
- /* Set receive buffer autosizing timestamp. */
- if (tp->rfbuf_ts == 0 &&
- (so->so_rcv.sb_flags & SB_AUTOSIZE))
- tp->rfbuf_ts = tcp_ts_getticks();
}
+
+ /* Set receive buffer autosizing timestamp. */
+ if (tp->rfbuf_ts == 0 &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE))
+ tp->rfbuf_ts = tcp_ts_getticks();
+
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index b9a72ef..c8462d3 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -704,6 +704,8 @@ int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
void tcp_input(struct mbuf *, int);
+int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
+ struct tcpcb *, int);
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
OpenPOWER on IntegriCloud