From b1afde60f2b9ee8444fba4e012dc99a3b28d224d Mon Sep 17 00:00:00 2001
From: Nandita Dukkipati <nanditad@google.com>
Date: Fri, 3 Dec 2010 13:33:44 +0000
Subject: tcp: Bug fix in initialization of receive window.

The bug has to do with boundary checks on the initial receive window.
If the initial receive window falls between init_cwnd and the
receive window specified by the user, the initial window is incorrectly
brought down to init_cwnd. The correct behavior is to allow it to
remain unchanged.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 05b1ecf..3c59ab4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -231,11 +231,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		/* when initializing use the value from init_rcv_wnd
 		 * rather than the default from above
 		 */
-		if (init_rcv_wnd &&
-		    (*rcv_wnd > init_rcv_wnd * mss))
-			*rcv_wnd = init_rcv_wnd * mss;
-		else if (*rcv_wnd > init_cwnd * mss)
-			*rcv_wnd = init_cwnd * mss;
+		if (init_rcv_wnd)
+			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+		else
+			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
 	}
 
 	/* Set the clamp no higher than max representable value */
-- 
cgit v1.1


From 67631510a318d5a930055fe927607f483716e100 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Wed, 8 Dec 2010 12:16:33 -0800
Subject: tcp: Replace time wait bucket msg by counter

Rather than printing the message to the log, use a mib counter to keep
track of the count of occurences of time wait bucket overflow.  Reduces
spam in logs.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/proc.c          | 1 +
 net/ipv4/tcp_minisocks.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 1b48eb1..b14ec7d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
 	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 43cf901..a66735f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -347,7 +347,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		 * socket up.  We've got bigger problems than
 		 * non-graceful socket closings.
 		 */
-		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
 	}
 
 	tcp_update_metrics(sk);
-- 
cgit v1.1


From ad9f4f50fe9288bbe65b7dfd76d8820afac6a24c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 7 Dec 2010 12:03:55 +0000
Subject: tcp: avoid a possible divide by zero

sysctl_tcp_tso_win_divisor might be set to zero while one cpu runs in
tcp_tso_should_defer(). Make sure we dont allow a divide by zero by
reading sysctl_tcp_tso_win_divisor exactly once.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3c59ab4..0d4a3ce 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1512,6 +1512,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
+	int win_divisor;
 
 	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
@@ -1543,13 +1544,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
 		goto send_now;
 
-	if (sysctl_tcp_tso_win_divisor) {
+	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+	if (win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
 		/* If at least some fraction of a window is available,
 		 * just use it.
 		 */
-		chunk /= sysctl_tcp_tso_win_divisor;
+		chunk /= win_divisor;
 		if (limit >= chunk)
 			goto send_now;
 	} else {
-- 
cgit v1.1


From f19872575ff7819a3723154657a497d9bca66b33 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 7 Dec 2010 12:20:47 +0000
Subject: tcp: protect sysctl_tcp_cookie_size reads

Make sure sysctl_tcp_cookie_size is read once in
tcp_cookie_size_check(), or we might return an illegal value to caller
if sysctl_tcp_cookie_size is changed by another cpu.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: William Allen Simpson <william.allen.simpson@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0d4a3ce..61c2463 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,27 +385,30 @@ struct tcp_out_options {
  */
 static u8 tcp_cookie_size_check(u8 desired)
 {
-	if (desired > 0) {
+	int cookie_size;
+
+	if (desired > 0)
 		/* previously specified */
 		return desired;
-	}
-	if (sysctl_tcp_cookie_size <= 0) {
+
+	cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
+	if (cookie_size <= 0)
 		/* no default specified */
 		return 0;
-	}
-	if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
+
+	if (cookie_size <= TCP_COOKIE_MIN)
 		/* value too small, specify minimum */
 		return TCP_COOKIE_MIN;
-	}
-	if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
+
+	if (cookie_size >= TCP_COOKIE_MAX)
 		/* value too large, specify maximum */
 		return TCP_COOKIE_MAX;
-	}
-	if (0x1 & sysctl_tcp_cookie_size) {
+
+	if (cookie_size & 1)
 		/* 8-bit multiple, illegal, fix it */
-		return (u8)(sysctl_tcp_cookie_size + 0x1);
-	}
-	return (u8)sysctl_tcp_cookie_size;
+		cookie_size++;
+
+	return (u8)cookie_size;
 }
 
 /* Write previously computed TCP options to the packet.
-- 
cgit v1.1


From fcbdf09d9652c8919dcf47072e3ae7dcb4eb98ac Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Thu, 16 Dec 2010 14:26:56 -0800
Subject: net: fix nulls list corruptions in sk_prot_alloc

Special care is taken inside sk_port_alloc to avoid overwriting
skc_node/skc_nulls_node. We should also avoid overwriting
skc_bind_node/skc_portaddr_node.

The patch fixes the following crash:

 BUG: unable to handle kernel paging request at fffffffffffffff0
 IP: [<ffffffff812ec6dd>] udp4_lib_lookup2+0xad/0x370
 [<ffffffff812ecc22>] __udp4_lib_lookup+0x282/0x360
 [<ffffffff812ed63e>] __udp4_lib_rcv+0x31e/0x700
 [<ffffffff812bba45>] ? ip_local_deliver_finish+0x65/0x190
 [<ffffffff812bbbf8>] ? ip_local_deliver+0x88/0xa0
 [<ffffffff812eda35>] udp_rcv+0x15/0x20
 [<ffffffff812bba45>] ip_local_deliver_finish+0x65/0x190
 [<ffffffff812bbbf8>] ip_local_deliver+0x88/0xa0
 [<ffffffff812bb2cd>] ip_rcv_finish+0x32d/0x6f0
 [<ffffffff8128c14c>] ? netif_receive_skb+0x99c/0x11c0
 [<ffffffff812bb94b>] ip_rcv+0x2bb/0x350
 [<ffffffff8128c14c>] netif_receive_skb+0x99c/0x11c0

Signed-off-by: Leonard Crestez <lcrestez@ixiacom.com>
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c     | 1 +
 net/ipv4/udplite.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5e0a3a5..2d3ded4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1899,6 +1899,7 @@ struct proto udp_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 EXPORT_SYMBOL(udp_prot);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa9..aee9963 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -57,6 +57,7 @@ struct proto 	udplite_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 EXPORT_SYMBOL(udplite_prot);
 
-- 
cgit v1.1


From 1bde5ac49398a064c753bb490535cfad89e99a5f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 23 Dec 2010 09:32:46 -0800
Subject: tcp: fix listening_get_next()

Alexey Vlasov found /proc/net/tcp could sometime loop and display
millions of sockets in LISTEN state.

In 2.6.29, when we converted TCP hash tables to RCU, we left two
sk_next() calls in listening_get_next().

We must instead use sk_nulls_next() to properly detect an end of chain.

Reported-by: Alexey Vlasov <renton@renton.name>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e13da6d..d978bb2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2030,7 +2030,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 get_req:
 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
 		}
-		sk	  = sk_next(st->syn_wait_sk);
+		sk	  = sk_nulls_next(st->syn_wait_sk);
 		st->state = TCP_SEQ_STATE_LISTENING;
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	} else {
@@ -2039,7 +2039,7 @@ get_req:
 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
 			goto start_req;
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 	}
 get_sk:
 	sk_nulls_for_each_from(sk, node) {
-- 
cgit v1.1


From e058464990c2ef1f3ecd6b83a154913c3c06f02a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 23 Dec 2010 12:03:57 -0800
Subject: Revert "ipv4: Allow configuring subnets as local addresses"

This reverts commit 4465b469008bc03b98a1b8df4e9ae501b6c69d4b.

Conflicts:

	net/ipv4/fib_frontend.c

As reported by Ben Greear, this causes regressions:

> Change 4465b469008bc03b98a1b8df4e9ae501b6c69d4b caused rules
> to stop matching the input device properly because the
> FLOWI_FLAG_MATCH_ANY_IIF is always defined in ip_dev_find().
>
> This breaks rules such as:
>
> ip rule add pref 512 lookup local
> ip rule del pref 0 lookup local
> ip link set eth2 up
> ip -4 addr add 172.16.0.102/24 broadcast 172.16.0.255 dev eth2
> ip rule add to 172.16.0.102 iif eth2 lookup local pref 10
> ip rule add iif eth2 lookup 10001 pref 20
> ip route add 172.16.0.0/24 dev eth2 table 10001
> ip route add unreachable 0/0 table 10001
>
> If you had a second interface 'eth0' that was on a different
> subnet, pinging a system on that interface would fail:
>
>   [root@ct503-60 ~]# ping 192.168.100.1
>   connect: Invalid argument

Reported-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eb6f69a..c19c1f7 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -163,13 +163,19 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 				.daddr = addr
 			}
 		},
-		.flags = FLOWI_FLAG_MATCH_ANY_IIF
 	};
 	struct fib_result res = { 0 };
 	struct net_device *dev = NULL;
+	struct fib_table *local_table;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
 
 	rcu_read_lock();
-	if (fib_lookup(net, &fl, &res)) {
+	local_table = fib_get_table(net, RT_TABLE_LOCAL);
+	if (!local_table ||
+	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
 		rcu_read_unlock();
 		return NULL;
 	}
-- 
cgit v1.1


From fc75fc8339e7727167443469027540b283daac71 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 22 Dec 2010 04:39:39 +0000
Subject: ipv4: dont create routes on down devices

In ip_route_output_slow(), instead of allowing a route to be created on
a not UPed device, report -ENETUNREACH immediately.

# ip tunnel add mode ipip remote 10.16.0.164 local
10.16.0.72 dev eth0
# (Note : tunl1 is down)
# ping -I tunl1 10.1.2.3
PING 10.1.2.3 (10.1.2.3) from 192.168.18.5 tunl1: 56(84) bytes of data.
(nothing)
# ./a.out tunl1
# ip tunnel del tunl1
Message from syslogd@shelby at Dec 22 10:12:08 ...
  kernel: unregister_netdevice: waiting for tunl1 to become free.
Usage count = 3

After patch:
# ping -I tunl1 10.1.2.3
connect: Network is unreachable

Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Reviewed-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 987bf9ad..df948b0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2585,9 +2585,10 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			goto out;
 
 		/* RACE: Check return value of inet_select_addr instead. */
-		if (rcu_dereference(dev_out->ip_ptr) == NULL)
-			goto out;	/* Wrong error code */
-
+		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+			err = -ENETUNREACH;
+			goto out;
+		}
 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
 		    ipv4_is_lbcast(oldflp->fl4_dst)) {
 			if (!fl.fl4_src)
-- 
cgit v1.1