From 62ad6fcd743792bf294f2a7ba26ab8f462065150 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 24 Apr 2012 18:15:41 +0000 Subject: udp_diag: implement idiag_get_info for udp/udplite to get queue information When we use netlink to monitor queue information for udp socket, idiag_rqueue and idiag_wqueue of inet_diag_msg are returned with 0. Keep consistent with netstat, just return back allocated rmem/wmem size. Signed-off-by: Shan Wei Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 2 +- net/ipv4/udp_diag.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8d25a1c..8f8db72 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -141,7 +141,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto rtattr_failure; if (icsk == NULL) { - r->idiag_rqueue = r->idiag_wqueue = 0; + handler->idiag_get_info(sk, r, NULL); goto out; } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8a949f1..a7f86a3 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -146,9 +146,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, return udp_dump_one(&udp_table, in_skb, nlh, req); } +static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, + .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, }; @@ -167,6 +175,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr * static const struct inet_diag_handler udplite_diag_handler = { .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, + .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, }; -- cgit v1.1 From 651913ce9de2bbcedef608c5d6cf39c244248509 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 27 Apr 2012 11:29:37 -0400 Subject: tcp: clean up use of jiffies in tcp_rcv_rtt_measure() Clean up a reference to jiffies in tcp_rcv_rtt_measure() that should instead reference tcp_time_stamp. Since the result of the subtraction is passed into a function taking u32, this should not change any behavior (and indeed the generated assembly does not change on x86_64). However, it seems worth cleaning this up for consistency and clarity (and perhaps to avoid bugs if this is copied and pasted somewhere else). Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3ff36406..2a702e3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -495,7 +495,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); + tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; -- cgit v1.1 From 1cebce36d660c83bd1353e41f3e66abd4686f215 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 30 Apr 2012 06:00:18 +0000 Subject: tcp: fix infinite cwnd in tcp_complete_cwr() When the cwnd reduction is done, ssthresh may be infinite if TCP enters CWR via ECN or F-RTO. If cwnd is not undone, i.e., undo_marker is set, tcp_complete_cwr() falsely set cwnd to the infinite ssthresh value. The correct operation is to keep cwnd intact because it has been updated in ECN or F-RTO. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2a702e3..d99efd7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2868,11 +2868,14 @@ static inline void tcp_complete_cwr(struct sock *sk) /* Do not moderate cwnd if it's already undone in cwr or recovery. */ if (tp->undo_marker) { - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - else /* PRR */ + tp->snd_cwnd_stamp = tcp_time_stamp; + } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { + /* PRR algorithm. */ tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_time_stamp; + } } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -- cgit v1.1 From b49960a05e32121d29316cfdf653894b88ac9190 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2012 02:28:41 +0000 Subject: tcp: change tcp_adv_win_scale and tcp_rmem[2] tcp_adv_win_scale default value is 2, meaning we expect a good citizen skb to have skb->len / skb->truesize ratio of 75% (3/4) In 2.6 kernels we (mis)accounted for typical MSS=1460 frame : 1536 + 64 + 256 = 1856 'estimated truesize', and 1856 * 3/4 = 1392. So these skbs were considered as not bloated. With recent truesize fixes, a typical MSS=1460 frame truesize is now the more precise : 2048 + 256 = 2304. But 2304 * 3/4 = 1728. So these skb are not good citizen anymore, because 1460 < 1728 (GRO can escape this problem because it build skbs with a too low truesize.) This also means tcp advertises a too optimistic window for a given allocated rcvspace : When receiving frames, sk_rmem_alloc can hit sk_rcvbuf limit and we call tcp_prune_queue()/tcp_collapse() too often, especially when application is slow to drain its receive queue or in case of losses (netperf is fast, scp is slow). This is a major latency source. We should adjust the len/truesize ratio to 50% instead of 75% This patch : 1) changes tcp_adv_win_scale default to 1 instead of 2 2) increase tcp_rmem[2] limit from 4MB to 6MB to take into account better truesize tracking and to allow autotuning tcp receive window to reach same value than before. Note that same amount of kernel memory is consumed compared to 2.6 kernels. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Tom Herbert Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 +++++---- net/ipv4/tcp_input.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8bb6ade..1272a88 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3243,7 +3243,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int max_share, cnt; + int max_rshare, max_wshare, cnt; unsigned int i; unsigned long jiffy = jiffies; @@ -3303,15 +3303,16 @@ void __init tcp_init(void) tcp_init_mem(&init_net); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_share = min(4UL*1024*1024, limit); + max_wshare = min(4UL*1024*1024, limit); + max_rshare = min(6UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_share); + sysctl_tcp_wmem[2] = max(64*1024, max_wshare); sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_share); + sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d99efd7..257b617 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_ecn __read_mostly = 2; EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 2; +int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); int sysctl_tcp_stdurg __read_mostly; -- cgit v1.1