From e1cfcbe82b4534bd0f99fef92a6d33843fd85e0e Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: ipv4: Namespaceify tcp_fastopen knob Different namespace application might require enable TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs be per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5..88409b1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2472,6 +2472,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + return 0; fail: tcp_sk_exit(net); -- cgit v1.1 From 437138485656c41e32b8c63c0987cfa0348be0e6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: ipv4: Namespaceify tcp_fastopen_key knob Different namespace application might require different tcp_fastopen_key independently of the host. David Miller pointed out there is a leak without releasing the context of tcp_fastopen_key during netns teardown. So add the release action in exit_batch path. Tested: 1. Container namespace: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 88409b1..49c74c0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2473,6 +2473,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); return 0; fail: @@ -2483,7 +2484,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { -- cgit v1.1 From 3733be14a32bae288b61ed28341e593baba983af Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Different namespace application might require different time period in second to disable Fastopen on active TCP sockets. Tested: Simulate following similar situation that the server's data gets dropped after 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] And then print netstat of TCPFastOpenBlackhole, the counter increased as expected when the firewall blackhole issue is detected and active TFO is disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49c74c0..ad3b5bb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2474,6 +2474,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return 0; fail: -- cgit v1.1 From 75c119afe14f74b4dd967d75ed9f57ab6c0ef045 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:27 -0700 Subject: tcp: implement rb-tree based retransmit queue Using a linear list to store all skbs in write queue has been okay for quite a while : O(N) is not too bad when N < 500. Things get messy when N is the order of 100,000 : Modern TCP stacks want 10Gbit+ of throughput even with 200 ms RTT flows. 40 ns per cache line miss means a full scan can use 4 ms, blowing away CPU caches. SACK processing often can use various hints to avoid parsing whole retransmit queue. But with high packet losses and/or high reordering, hints no longer work. Sender has to process thousands of unfriendly SACK, accumulating a huge socket backlog, burning a cpu and massively dropping packets. Using an rb-tree for retransmit queue has been avoided for years because it added complexity and overhead, but now is the time to be more resistant and say no to quadratic behavior. 1) RTX queue is no longer part of the write queue : already sent skbs are stored in one rb-tree. 2) Since reaching the head of write queue no longer needs sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue Tested: On receiver : netem on ingress : delay 150ms 200us loss 1 GRO disabled to force stress and SACK storms. for f in `seq 1 10` do ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1 done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}' Before patch : 323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88 2840 After patch: 1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94 18053 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7460fd..5418ecf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); -- cgit v1.1 From fcfd6dfab97006d44c7db5d6c908eac383af6649 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 16 Oct 2017 15:48:55 -0500 Subject: ipv4: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in some cases I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Addresses-Coverity-ID: 115108 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5418ecf..ecee4dd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1779,8 +1779,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; -- cgit v1.1 From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Oct 2017 11:22:51 -0700 Subject: tcp: socket option to set TCP fast open key New socket option TCP_FASTOPEN_KEY to allow different keys per listener. The listener by default uses the global key until the socket option is set. The key is a 16 bytes long binary data. This option has no effect on regular non-listener TCP sockets. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ecee4dd..28ca4e1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1893,6 +1893,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); -- cgit v1.1 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e22439f..eb3f3b8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include #include +#include + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); -- cgit v1.1 From e1a4aa50f47303ebb3ca0cfd01687884551ce03d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:26 -0700 Subject: tcp: add tracepoint trace_tcp_destroy_sock This patch adds trace event trace_tcp_destroy_sock. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index eb3f3b8..23a8100 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1869,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + trace_tcp_destroy_sock(sk); + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); -- cgit v1.1 From 2ae21cf527da0e5cf9d7ee14bd5b0909bb9d1a75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:56 -0700 Subject: tcp: Namespace-ify sysctl_tcp_early_retrans Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 23a8100..7ab313f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2484,6 +2484,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From e20223f1962831d1b1c416d59d259879d0639d68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:57 -0700 Subject: tcp: Namespace-ify sysctl_tcp_recovery Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7ab313f..517ff19 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2485,6 +2485,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; + net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From b510f0d23a47c3d1f074fe583e7867dc4918fe02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:59 -0700 Subject: tcp: Namespace-ify sysctl_tcp_slow_start_after_idle Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 517ff19..cea63a4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2486,6 +2486,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; + net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From e0a1e5b519236dc1662ff25e42560dd1be9e3776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:00 -0700 Subject: tcp: Namespace-ify sysctl_tcp_retrans_collapse Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cea63a4..2bc6ba2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2487,7 +2487,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ - + net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.1 From c6e218035913e14952b04ceecf1a543205106fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:06 -0700 Subject: tcp: Namespace-ify sysctl_tcp_max_reordering Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2bc6ba2..c379a24 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2488,6 +2488,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; + net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.1 From 6496f6bde0c323fba5e8c5b5cbf3a7bf28dad7ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:07 -0700 Subject: tcp: Namespace-ify sysctl_tcp_dsack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c379a24..d9d4d19 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2489,6 +2489,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 0c12654ac6d9004b9538b2a969b2b59e9a5ed831 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:08 -0700 Subject: tcp: Namespace-ify sysctl_tcp_app_win Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9d4d19..189664e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2490,6 +2490,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; + net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 94f0893e0c27219f4a726932618505aab6795973 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:09 -0700 Subject: tcp: Namespace-ify sysctl_tcp_adv_win_scale Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 189664e..1fe30fb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2491,6 +2491,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; + net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From af9b69a7a6ca6b817e8d6f416e7aa5b2a5bf1d91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:10 -0700 Subject: tcp: Namespace-ify sysctl_tcp_frto Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1fe30fb..49757c7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2492,6 +2492,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_dsack = 1; net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; + net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 4540c0cf98b8892a642d2453eec20ae3eb5696fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:22 -0700 Subject: tcp: Namespace-ify sysctl_tcp_moderate_rcvbuf Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49757c7..27f376b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2493,6 +2493,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_app_win = 31; net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; + net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From d06a99045837d3f4d5431793c4c390b0daf2a08d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:23 -0700 Subject: tcp: Namespace-ify sysctl_tcp_tso_win_divisor Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 27f376b..284ff16 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2494,6 +2494,11 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + /* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ + net->ipv4.sysctl_tcp_tso_win_divisor = 3; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 9184d8bb448a3d2c2d9f90f1e2f5de625292e769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:25 -0700 Subject: tcp: Namespace-ify sysctl_tcp_limit_output_bytes Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 284ff16..713b802 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2499,6 +2499,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; + /* Default TSQ limit of four TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 262144; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From b530b68148301d73775cd27cc136ce4dd5738ae8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:26 -0700 Subject: tcp: Namespace-ify sysctl_tcp_challenge_ack_limit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 713b802..50ab3a3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2501,6 +2501,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of four TSO segments */ net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* rfc5961 challenge ack rate limiting */ + net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 26e9596e5b8f11025b57b12e7265df649129ab00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:27 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_tso_segs Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 50ab3a3..6192f26 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2503,6 +2503,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_limit_output_bytes = 262144; /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From bd239704295c66196e6b77c5717ec4aec076ddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:28 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_rtt_wlen Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6192f26..ced35af 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2504,6 +2504,7 @@ static int __net_init tcp_sk_init(struct net *net) /* rfc5961 challenge ack rate limiting */ net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 790f00e19f65673c3c169dfc137c09a9236847d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:29 -0700 Subject: tcp: Namespace-ify sysctl_tcp_autocorking Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ced35af..351e349 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2505,6 +2505,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_min_rtt_wlen = 300; + net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 4170ba6b589ced82da56c7e4f71cc84b2be036d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:30 -0700 Subject: tcp: Namespace-ify sysctl_tcp_invalid_ratelimit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 351e349..6617aae 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2506,6 +2506,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; + net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 23a7102a2d1068508fa2a0ce593a0df7f8fdc0ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:31 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ss_ratio Also remove an obsolete comment about TCP pacing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6617aae..1d8fc66 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2507,6 +2507,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_rtt_wlen = 300; net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; + net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From c26e91f8b9b8e1fd252e07c1f60e50220cd7ebab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:32 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ca_ratio Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1d8fc66..7c1dae6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2508,6 +2508,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_autocorking = 1; net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; + net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); -- cgit v1.1 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from initial netns. This change is needed so that we can refine TCP rcvbuf autotuning, to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0162c57..1eac84b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2409,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; - + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; -- cgit v1.1 From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make default TCP default congestion control to a per namespace value. This changes default congestion control to a pointer to congestion ops (rather than implicit as first element of available lsit). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eac84b..c6bc0c4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); -- cgit v1.1