From 5f852eb536ad651b8734559dcf4353514cb0bea3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Feb 2015 14:10:18 -0800
Subject: tcp: tso: remove tp->tso_deferred

TSO relies on ability to defer sending a small amount of packets.
Heuristic is to wait for future ACKS in hope to send more packets at once.
Current algorithm uses a per socket tso_deferred field as a pseudo timer.

This pseudo timer relies on future ACK, but there is no guarantee
we receive them in time.

Fix would be to use a real timer, but cost of such timer is probably too
expensive for typical cases.

This patch changes the logic to test the time of last transmit,
because we should not add bursts of more than 1ms for any given flow.

We've used this patch for about two years at Google, before FQ/pacing
as it would reduce a fair amount of bursts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net/ipv4/tcp_output.c')
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c..cb95c7a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1763,9 +1763,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	if (icsk->icsk_ca_state != TCP_CA_Open)
 		goto send_now;
 
-	/* Defer for less than two clock ticks. */
-	if (tp->tso_deferred &&
-	    (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+	/* Avoid bursty behavior by allowing defer
+	 * only if the last write was recent.
+	 */
+	if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
 		goto send_now;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1807,11 +1808,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 			goto send_now;
 	}
 
-	/* Ok, it looks like it is advisable to defer.
-	 * Do not rearm the timer if already set to not break TCP ACK clocking.
-	 */
-	if (!tp->tso_deferred)
-		tp->tso_deferred = 1 | (jiffies << 1);
+	/* Ok, it looks like it is advisable to defer. */
 
 	if (cong_win < send_win && cong_win < skb->len)
 		*is_cwnd_limited = true;
@@ -1819,7 +1816,6 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	return true;
 
 send_now:
-	tp->tso_deferred = 0;
 	return false;
 }
 
-- 
cgit v1.1


From 50c8339e9299aaf09e48c8f94de39baec638c874 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Feb 2015 14:10:19 -0800
Subject: tcp: tso: restore IW10 after TSO autosizing

With sysctl_tcp_min_tso_segs being 4, it is very possible
that tcp_tso_should_defer() decides not sending last 2 MSS
of initial window of 10 packets. This also applies if
autosizing decides to send X MSS per GSO packet, and cwnd
is not a multiple of X.

This patch implements an heuristic based on age of first
skb in write queue : If it was sent very recently (less than half srtt),
we can predict that no ACK packet will come in less than half rtt,
so deferring might cause an under utilization of our window.

This is visible on initial send (IW10) on web servers,
but more generally on some RPC, as the last part of the message
might need an extra RTT to get delivered.

Tested:

Ran following packetdrill test
// A simple server-side test that sends exactly an initial window (IW10)
// worth of packets.

`sysctl -e -q net.ipv4.tcp_min_tso_segs=4`

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0    setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0    bind(3, ..., ...) = 0
+0    listen(3, 1) = 0

+.1   < S 0:0(0) win 32792 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+0    > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 6>
+.1   < . 1:1(0) ack 1 win 257
+0    accept(3, ..., ...) = 4

+0    write(4, ..., 14600) = 14600
+0    > . 1:5841(5840) ack 1 win 457
+0    > . 5841:11681(5840) ack 1 win 457
// Following packet should be sent right now.
+0    > P. 11681:14601(2920) ack 1 win 457

+.1   < . 1:1(0) ack 14601 win 257

+0    close(4) = 0
+0    > F. 14601:14601(0) ack 1
+.1   < F. 1:1(0) ack 14602 win 257
+0    > . 14602:14602(0) ack 2

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cb95c7a..5f4fb4d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1752,9 +1752,11 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 				 bool *is_cwnd_limited, u32 max_segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 send_win, cong_win, limit, in_flight;
+	u32 age, send_win, cong_win, limit, in_flight;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct skb_mstamp now;
+	struct sk_buff *head;
 	int win_divisor;
 
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -1808,6 +1810,13 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 			goto send_now;
 	}
 
+	head = tcp_write_queue_head(sk);
+	skb_mstamp_get(&now);
+	age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+	/* If next ACK is likely to come too late (half srtt), do not defer */
+	if (age < (tp->srtt_us >> 4))
+		goto send_now;
+
 	/* Ok, it looks like it is advisable to defer. */
 
 	if (cong_win < send_win && cong_win < skb->len)
-- 
cgit v1.1


From a0ea700e4095e26a8e291374757dd29519900f18 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Feb 2015 14:10:20 -0800
Subject: tcp: tso: allow CA_CWR state in tcp_tso_should_defer()

Another TCP issue is triggered by ECN.

Under pressure, receiver gets ECN marks, and send back ACK packets
with ECE TCP flag. Senders enter CA_CWR state.

In this state, tcp_tso_should_defer() is short cut :

if (icsk->icsk_ca_state != TCP_CA_Open)
    goto send_now;

This means that about all ACK packets we receive are triggering
a partial send, and because cwnd is kept small, we can only send
a small amount of data for each incoming ACK,
which in return generate more ACK packets.

Allowing CA_Open and CA_CWR states to enable TSO defer in
tcp_tso_should_defer() brings performance back :
TSO autodefer has more chance to defer under pressure.

This patch increases TSO and LRO/GRO efficiency back to normal levels,
and does not impact overall ECN behavior.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5f4fb4d..8bbd86c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1762,7 +1762,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 		goto send_now;
 
-	if (icsk->icsk_ca_state != TCP_CA_Open)
+	if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
 		goto send_now;
 
 	/* Avoid bursty behavior by allowing defer
-- 
cgit v1.1