From c936e8bd1de2fa50c49e3df6fa5036bf07870b67 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Mon, 31 May 2010 16:41:09 +0200
Subject: netfilter: don't xt_jumpstack_alloc twice in xt_register_table

In xt_register_table, xt_jumpstack_alloc is called first, later
xt_replace_table is used. But in xt_replace_table, xt_jumpstack_alloc
will be used again. Then the memory allocated by previous xt_jumpstack_alloc
will be leaked. We can simply remove the previous xt_jumpstack_alloc because
there aren't any users of newinfo between xt_jumpstack_alloc and
xt_replace_table.

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Acked-By: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/x_tables.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 445de70..47b1e79 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -844,10 +844,6 @@ struct xt_table *xt_register_table(struct net *net,
 	struct xt_table_info *private;
 	struct xt_table *t, *table;
 
-	ret = xt_jumpstack_alloc(newinfo);
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	/* Don't add one object to multiple lists. */
 	table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
 	if (!table) {
-- 
cgit v1.1


From 7489aec8eed4f2f1eb3b4d35763bd3ea30b32ef5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 31 May 2010 16:41:35 +0200
Subject: netfilter: xtables: stackptr should be percpu

commit f3c5c1bfd4 (netfilter: xtables: make ip_tables reentrant)
introduced a performance regression, because stackptr array is shared by
all cpus, adding cache line ping pongs. (16 cpus share a 64 bytes cache
line)

Fix this using alloc_percpu()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-By: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ip_tables.c  |  2 +-
 net/ipv6/netfilter/ip6_tables.c |  2 +-
 net/netfilter/x_tables.c        | 13 +++----------
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 63958f3..4b6c5ca 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -336,7 +336,7 @@ ipt_do_table(struct sk_buff *skb,
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
-	stackptr   = &private->stackptr[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6f517bd..9d2d68f 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -363,7 +363,7 @@ ip6t_do_table(struct sk_buff *skb,
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
-	stackptr   = &private->stackptr[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 47b1e79..e34622f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -699,10 +699,8 @@ void xt_free_table_info(struct xt_table_info *info)
 		vfree(info->jumpstack);
 	else
 		kfree(info->jumpstack);
-	if (sizeof(unsigned int) * nr_cpu_ids > PAGE_SIZE)
-		vfree(info->stackptr);
-	else
-		kfree(info->stackptr);
+
+	free_percpu(info->stackptr);
 
 	kfree(info);
 }
@@ -753,14 +751,9 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	unsigned int size;
 	int cpu;
 
-	size = sizeof(unsigned int) * nr_cpu_ids;
-	if (size > PAGE_SIZE)
-		i->stackptr = vmalloc(size);
-	else
-		i->stackptr = kmalloc(size, GFP_KERNEL);
+	i->stackptr = alloc_percpu(unsigned int);
 	if (i->stackptr == NULL)
 		return -ENOMEM;
-	memset(i->stackptr, 0, size);
 
 	size = sizeof(void **) * nr_cpu_ids;
 	if (size > PAGE_SIZE)
-- 
cgit v1.1


From b1faf5666438090a4dc4fceac8502edc7788b7e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 31 May 2010 23:44:05 -0700
Subject: net: sock_queue_err_skb() dont mess with sk_forward_alloc

Correct sk_forward_alloc handling for error_queue would need to use a
backlog of frames that softirq handler could not deliver because socket
is owned by user thread. Or extend backlog processing to be able to
process normal and error packets.

Another possibility is to not use mem charge for error queue, this is
what I implemented in this patch.

Note: this reverts commit 29030374
(net: fix sk_forward_alloc corruptions), since we dont need to lock
socket anymore.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 30 ++++++++++++++++++++++++++++--
 net/ipv4/udp.c    |  6 ++----
 net/ipv6/udp.c    |  6 ++----
 3 files changed, 32 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4e7ac09..9f07e74 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2965,6 +2965,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 }
 EXPORT_SYMBOL_GPL(skb_cow_data);
 
+static void sock_rmem_free(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+/*
+ * Note: We dont mem charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = sock_rmem_free;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+	skb_queue_tail(&sk->sk_error_queue, skb);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb->len);
+	return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
 void skb_tstamp_tx(struct sk_buff *orig_skb,
 		struct skb_shared_hwtstamps *hwtstamps)
 {
@@ -2997,9 +3025,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 	serr->ee.ee_errno = ENOMSG;
 	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 
-	bh_lock_sock(sk);
 	err = sock_queue_err_skb(sk, skb);
-	bh_unlock_sock(sk);
 
 	if (err)
 		kfree_skb(skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 50678f9..eec4ff4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -633,11 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	if (!inet->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
-	} else {
-		bh_lock_sock(sk);
+	} else
 		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
-		bh_unlock_sock(sk);
-	}
+
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3048f90..87be586 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -466,11 +466,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
 		goto out;
 
-	if (np->recverr) {
-		bh_lock_sock(sk);
+	if (np->recverr)
 		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
-		bh_unlock_sock(sk);
-	}
+
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-- 
cgit v1.1


From 288fcee8b7aa98796d96cd5b1b2e8005639328bf Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 31 May 2010 23:48:19 -0700
Subject: net/ipv4/tcp_input.c: fix compilation breakage when FASTRETRANS_DEBUG
 > 1

Commit: c720c7e8383aff1cb219bddf474ed89d850336e3 missed these.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e6dafc..548d575 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2639,7 +2639,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 	if (sk->sk_family == AF_INET) {
 		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       &inet->daddr, ntohs(inet->dport),
+		       &inet->inet_daddr, ntohs(inet->inet_dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
@@ -2649,7 +2649,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       &np->daddr, ntohs(inet->dport),
+		       &np->daddr, ntohs(inet->inet_dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
-- 
cgit v1.1


From 8ae5977ff95c03fe6c36a5721c57dcb4bfe4f290 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Sun, 30 May 2010 14:52:58 +0200
Subject: mac80211: fix blockack-req processing

Daniel reported that the paged RX changes had
broken blockack request frame processing due
to using data that wasn't really part of the
skb data.

Fix this using skb_copy_bits() for the needed
data. As a side effect, this adds a check on
processing too short frames, which previously
this code could do.

Reported-by: Daniel Halperin <dhalperi@cs.washington.edu>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Daniel Halperin <dhalperi@cs.washington.edu>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/rx.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6e2a7bc..5e0b654 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1818,17 +1818,26 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
 		return RX_CONTINUE;
 
 	if (ieee80211_is_back_req(bar->frame_control)) {
+		struct {
+			__le16 control, start_seq_num;
+		} __packed bar_data;
+
 		if (!rx->sta)
 			return RX_DROP_MONITOR;
+
+		if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control),
+				  &bar_data, sizeof(bar_data)))
+			return RX_DROP_MONITOR;
+
 		spin_lock(&rx->sta->lock);
-		tid = le16_to_cpu(bar->control) >> 12;
+		tid = le16_to_cpu(bar_data.control) >> 12;
 		if (!rx->sta->ampdu_mlme.tid_active_rx[tid]) {
 			spin_unlock(&rx->sta->lock);
 			return RX_DROP_MONITOR;
 		}
 		tid_agg_rx = rx->sta->ampdu_mlme.tid_rx[tid];
 
-		start_seq_num = le16_to_cpu(bar->start_seq_num) >> 4;
+		start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
 
 		/* reset session timer */
 		if (tid_agg_rx->timeout)
-- 
cgit v1.1


From 51a0d38de26226f2779912d92f155b93d539da9a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 31 May 2010 12:00:12 +0200
Subject: mac80211: fix dialog token allocator

The dialog token allocator has apparently been broken
since b83f4e15 ("mac80211: fix deadlock in sta->lock")
because it got moved out under the spinlock. Fix it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/agg-tx.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index c163d0a..98258b7 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -332,14 +332,16 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
 		IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
 
 	spin_unlock(&local->ampdu_lock);
-	spin_unlock_bh(&sta->lock);
 
-	/* send an addBA request */
+	/* prepare tid data */
 	sta->ampdu_mlme.dialog_token_allocator++;
 	sta->ampdu_mlme.tid_tx[tid]->dialog_token =
 			sta->ampdu_mlme.dialog_token_allocator;
 	sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
 
+	spin_unlock_bh(&sta->lock);
+
+	/* send AddBA request */
 	ieee80211_send_addba_request(sdata, pubsta->addr, tid,
 			 sta->ampdu_mlme.tid_tx[tid]->dialog_token,
 			 sta->ampdu_mlme.tid_tx[tid]->ssn,
-- 
cgit v1.1


From fafeeb6c80e3842c6dc19d05de09a23f23eef0d8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 1 Jun 2010 10:04:49 +0000
Subject: xfrm: force a dst reference in __xfrm_route_forward()

Packets going through __xfrm_route_forward() have a not refcounted dst
entry, since we enabled a noref forwarding path.

xfrm_lookup() might incorrectly release this dst entry.

It's a bit late to make invasive changes in xfrm_lookup(), so lets force
a refcount in this path.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_policy.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d965a2b..4bf27d9 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2153,6 +2153,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 		return 0;
 	}
 
+	skb_dst_force(skb);
 	dst = skb_dst(skb);
 
 	res = xfrm_lookup(net, &dst, &fl, NULL, 0) == 0;
-- 
cgit v1.1


From 194dbcc8a1a97cbac9a619a563e5f6b7f7d5a485 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 12 May 2010 21:31:06 +0000
Subject: net: init_vlan should not copy slave or master flags

The vlan device should not copy the slave or master flags from
the real device. It is not in the bond until added nor is it
a master.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 55be908..5298426 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -708,7 +708,8 @@ static int vlan_dev_init(struct net_device *dev)
 	netif_carrier_off(dev);
 
 	/* IFF_BROADCAST|IFF_MULTICAST; ??? */
-	dev->flags  = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI);
+	dev->flags  = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
+					  IFF_MASTER | IFF_SLAVE);
 	dev->iflink = real_dev->ifindex;
 	dev->state  = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) |
 					  (1<<__LINK_STATE_DORMANT))) |
-- 
cgit v1.1


From 2df4a0fa1540c460ec69788ab2a901cc72a75644 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 12 May 2010 21:31:11 +0000
Subject: net: fix conflict between null_or_orig and null_or_bond

If a skb is received on an inactive bond that does not meet
the special cases checked for by skb_bond_should_drop it should
only be delivered to exact matches as the comment in
netif_receive_skb() says.

However because null_or_bond could also be null this is not
always true.  This patch renames null_or_bond to orig_or_bond
and initializes it to orig_dev.  This keeps the intent of
null_or_bond to pass frames received on VLAN interfaces stacked
on bonding interfaces without invalidating the statement for
null_or_orig.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1845b08..d03470f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2795,7 +2795,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	struct net_device *orig_dev;
 	struct net_device *master;
 	struct net_device *null_or_orig;
-	struct net_device *null_or_bond;
+	struct net_device *orig_or_bond;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -2868,10 +2868,10 @@ ncls:
 	 * device that may have registered for a specific ptype.  The
 	 * handler may have to adjust skb->dev and orig_dev.
 	 */
-	null_or_bond = NULL;
+	orig_or_bond = orig_dev;
 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
-		null_or_bond = vlan_dev_real_dev(skb->dev);
+		orig_or_bond = vlan_dev_real_dev(skb->dev);
 	}
 
 	type = skb->protocol;
@@ -2879,7 +2879,7 @@ ncls:
 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
 		if (ptype->type == type && (ptype->dev == null_or_orig ||
 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
-		     ptype->dev == null_or_bond)) {
+		     ptype->dev == orig_or_bond)) {
 			if (pt_prev)
 				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
-- 
cgit v1.1


From 33c29dde7d04dc0ec0edb649d20ccf1351c13a06 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Sat, 29 May 2010 14:26:59 +0000
Subject: act_nat: fix the wrong checksum when addr isn't in old_addr/mask

fix the wrong checksum when addr isn't in old_addr/mask

For TCP and UDP packets, when addr isn't in old_addr/mask we don't do SNAT or
DNAT, and we should not update layer 4 checksum.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 net/sched/act_nat.c |    4 ++++
 1 file changed, 4 insertions(+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_nat.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index d885ba3..5709494 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -159,6 +159,9 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 			iph->daddr = new_addr;
 
 		csum_replace4(&iph->check, addr, new_addr);
+	} else if ((iph->frag_off & htons(IP_OFFSET)) ||
+		   iph->protocol != IPPROTO_ICMP) {
+		goto out;
 	}
 
 	ihl = iph->ihl * 4;
@@ -247,6 +250,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 		break;
 	}
 
+out:
 	return action;
 
 drop:
-- 
cgit v1.1


From edafe502404f3669d364b6e96d79b54067b634b4 Mon Sep 17 00:00:00 2001
From: Daniele Lacamera <root@danielinux.net>
Date: Wed, 2 Jun 2010 02:02:04 +0000
Subject: TCP: tcp_hybla: Fix integer overflow in slow start increment

For large values of rtt, 2^rho operation may overflow u32. Clamp down the increment to 2^16.

Signed-off-by: Daniele Lacamera <root@danielinux.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_hybla.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index c209e05..377bc93 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 		 * calculate 2^fract in a <<7 value.
 		 */
 		is_slowstart = 1;
-		increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
-			- 128;
+		increment = ((1 << min(ca->rho, 16U)) *
+			hybla_fraction(rho_fractions)) - 128;
 	} else {
 		/*
 		 * congestion avoidance
-- 
cgit v1.1


From fbc2e7d9cf49e0bf89b9e91fd60a06851a855c5d Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 2 Jun 2010 07:32:42 -0700
Subject: cls_u32: use skb_header_pointer() to dereference data safely

use skb_header_pointer() to dereference data safely

the original skb->data dereference isn't safe, as there isn't any skb->len or
skb_is_nonlinear() check. skb_header_pointer() is used instead in this patch.
And when the skb isn't long enough, we terminate the function u32_classify()
immediately with -1.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 49 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 9627542..4f52214 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -98,11 +98,11 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re
 {
 	struct {
 		struct tc_u_knode *knode;
-		u8		  *ptr;
+		unsigned int	  off;
 	} stack[TC_U32_MAXDEPTH];
 
 	struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
-	u8 *ptr = skb_network_header(skb);
+	unsigned int off = skb_network_offset(skb);
 	struct tc_u_knode *n;
 	int sdepth = 0;
 	int off2 = 0;
@@ -134,8 +134,14 @@ next_knode:
 #endif
 
 		for (i = n->sel.nkeys; i>0; i--, key++) {
-
-			if ((*(__be32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
+			unsigned int toff;
+			__be32 *data, _data;
+
+			toff = off + key->off + (off2 & key->offmask);
+			data = skb_header_pointer(skb, toff, 4, &_data);
+			if (!data)
+				goto out;
+			if ((*data ^ key->val) & key->mask) {
 				n = n->next;
 				goto next_knode;
 			}
@@ -174,29 +180,45 @@ check_terminal:
 		if (sdepth >= TC_U32_MAXDEPTH)
 			goto deadloop;
 		stack[sdepth].knode = n;
-		stack[sdepth].ptr = ptr;
+		stack[sdepth].off = off;
 		sdepth++;
 
 		ht = n->ht_down;
 		sel = 0;
-		if (ht->divisor)
-			sel = ht->divisor&u32_hash_fold(*(__be32*)(ptr+n->sel.hoff), &n->sel,n->fshift);
-
+		if (ht->divisor) {
+			__be32 *data, _data;
+
+			data = skb_header_pointer(skb, off + n->sel.hoff, 4,
+						  &_data);
+			if (!data)
+				goto out;
+			sel = ht->divisor & u32_hash_fold(*data, &n->sel,
+							  n->fshift);
+		}
 		if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
 			goto next_ht;
 
 		if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
 			off2 = n->sel.off + 3;
-			if (n->sel.flags&TC_U32_VAROFFSET)
-				off2 += ntohs(n->sel.offmask & *(__be16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
+			if (n->sel.flags & TC_U32_VAROFFSET) {
+				__be16 *data, _data;
+
+				data = skb_header_pointer(skb,
+							  off + n->sel.offoff,
+							  2, &_data);
+				if (!data)
+					goto out;
+				off2 += ntohs(n->sel.offmask & *data) >>
+					n->sel.offshift;
+			}
 			off2 &= ~3;
 		}
 		if (n->sel.flags&TC_U32_EAT) {
-			ptr += off2;
+			off += off2;
 			off2 = 0;
 		}
 
-		if (ptr < skb_tail_pointer(skb))
+		if (off < skb->len)
 			goto next_ht;
 	}
 
@@ -204,9 +226,10 @@ check_terminal:
 	if (sdepth--) {
 		n = stack[sdepth].knode;
 		ht = n->ht_up;
-		ptr = stack[sdepth].ptr;
+		off = stack[sdepth].off;
 		goto check_terminal;
 	}
+out:
 	return -1;
 
 deadloop:
-- 
cgit v1.1


From db2c24175d149b55784f7cb2c303622ce962c1ae Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 2 Jun 2010 04:55:02 +0000
Subject: act_pedit: access skb->data safely

access skb->data safely

we should use skb_header_pointer() and skb_store_bits() to access skb->data to
handle small or non-linear skbs.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 net/sched/act_pedit.c |   24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_pedit.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fdbd0b7..50e3d94 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -125,7 +125,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 {
 	struct tcf_pedit *p = a->priv;
 	int i, munged = 0;
-	u8 *pptr;
+	unsigned int off;
 
 	if (!(skb->tc_verd & TC_OK2MUNGE)) {
 		/* should we set skb->cloned? */
@@ -134,7 +134,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 		}
 	}
 
-	pptr = skb_network_header(skb);
+	off = skb_network_offset(skb);
 
 	spin_lock(&p->tcf_lock);
 
@@ -144,17 +144,17 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 		struct tc_pedit_key *tkey = p->tcfp_keys;
 
 		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
-			u32 *ptr;
+			u32 *ptr, _data;
 			int offset = tkey->off;
 
 			if (tkey->offmask) {
-				if (skb->len > tkey->at) {
-					 char *j = pptr + tkey->at;
-					 offset += ((*j & tkey->offmask) >>
-						   tkey->shift);
-				} else {
+				char *d, _d;
+
+				d = skb_header_pointer(skb, off + tkey->at, 1,
+						       &_d);
+				if (!d)
 					goto bad;
-				}
+				offset += (*d & tkey->offmask) >> tkey->shift;
 			}
 
 			if (offset % 4) {
@@ -169,9 +169,13 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 				goto bad;
 			}
 
-			ptr = (u32 *)(pptr+offset);
+			ptr = skb_header_pointer(skb, off + offset, 4, &_data);
+			if (!ptr)
+				goto bad;
 			/* just do it, baby */
 			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
+			if (ptr == &_data)
+				skb_store_bits(skb, off + offset, ptr, 4);
 			munged++;
 		}
 
-- 
cgit v1.1


From 8764ab2ca7ab5055e1ca80f9cfa4970c34acb804 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Fri, 4 Jun 2010 01:57:38 +0000
Subject: net: check for refcount if pop a stacked dst_entry

xfrm triggers a warning if dst_pop() drops a refcount
on a noref dst. This patch changes dst_pop() to
skb_dst_pop(). skb_dst_pop() drops the refcnt only
on a refcounted dst. Also we don't clone the child
dst_entry, so it is not refcounted and we can use
skb_dst_set_noref() in xfrm_output_one().

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 6a32915..a3cca0a 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -95,13 +95,13 @@ resume:
 			goto error_nolock;
 		}
 
-		dst = dst_pop(dst);
+		dst = skb_dst_pop(skb);
 		if (!dst) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
 			err = -EHOSTUNREACH;
 			goto error_nolock;
 		}
-		skb_dst_set(skb, dst);
+		skb_dst_set_noref(skb, dst);
 		x = dst->xfrm;
 	} while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
 
-- 
cgit v1.1


From 57f1553ee5d9f093660cc49098f494e17ed11668 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 3 Jun 2010 00:42:30 +0000
Subject: syncookies: remove Kconfig text line about disabled-by-default

syncookies default to on since
e994b7c901ded7200b525a707c6da71f2cf6d4bb
(tcp: Don't make syn cookies initial setting depend on CONFIG_SYSCTL).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8e3a1fd..7c3a7d1 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -303,7 +303,7 @@ config ARPD
 	  If unsure, say N.
 
 config SYN_COOKIES
-	bool "IP: TCP syncookie support (disabled per default)"
+	bool "IP: TCP syncookie support"
 	---help---
 	  Normal TCP/IP networking is open to an attack known as "SYN
 	  flooding". This denial-of-service attack prevents legitimate remote
@@ -328,13 +328,13 @@ config SYN_COOKIES
 	  server is really overloaded. If this happens frequently better turn
 	  them off.
 
-	  If you say Y here, note that SYN cookies aren't enabled by default;
-	  you can enable them by saying Y to "/proc file system support" and
+	  If you say Y here, you can disable SYN cookies at run time by
+	  saying Y to "/proc file system support" and
 	  "Sysctl support" below and executing the command
 
-	  echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
 
-	  at boot time after the /proc file system has been mounted.
+	  after the /proc file system has been mounted.
 
 	  If unsure, say N.
 
-- 
cgit v1.1


From ca55158c6ecb7832a6ad80ac44a14d23bab8cdfc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 3 Jun 2010 09:03:58 +0000
Subject: rps: tcp: fix rps_sock_flow_table table updates

I believe a moderate SYN flood attack can corrupt RFS flow table
(rps_sock_flow_table), making RPS/RFS much less effective.

Even in a normal situation, server handling short lived sessions suffer
from bad steering for the first data packet of a session, if another SYN
packet is received for another session.

We do following action in tcp_v4_rcv() :

	sock_rps_save_rxhash(sk, skb->rxhash);

We should _not_ do this if sk is a LISTEN socket, as about each
packet received on a LISTEN socket has a different rxhash than
previous one.
 -> RPS_NO_CPU markers are spread all over rps_sock_flow_table.

Also, it makes sense to protect sk->rxhash field changes with socket
lock (We currently can change it even if user thread owns the lock
and might use rxhash)

This patch moves sock_rps_save_rxhash() to a sock locked section,
and only for non LISTEN sockets.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 202cf09..fe193e5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1555,6 +1555,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 #endif
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		sock_rps_save_rxhash(sk, skb->rxhash);
 		TCP_CHECK_TIMER(sk);
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
@@ -1579,7 +1580,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 			}
 			return 0;
 		}
-	}
+	} else
+		sock_rps_save_rxhash(sk, skb->rxhash);
+
 
 	TCP_CHECK_TIMER(sk);
 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
@@ -1672,8 +1675,6 @@ process:
 
 	skb->dev = NULL;
 
-	sock_rps_save_rxhash(sk, skb->rxhash);
-
 	bh_lock_sock_nested(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-- 
cgit v1.1


From c44649216522cd607a4027d2ebf4a8147d3fa94c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 3 Jun 2010 05:45:47 +0000
Subject: tcp: use correct net ns in cookie_v4_check()

Its better to make a route lookup in appropriate namespace.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 5c24db4..9f6b222 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -347,7 +347,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 					       { .sport = th->dest,
 						 .dport = th->source } } };
 		security_req_classify_flow(req, &fl);
-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
 			reqsk_free(req);
 			goto out;
 		}
-- 
cgit v1.1


From 72e09ad107e78d69ff4d3b97a69f0aad2b77280f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 5 Jun 2010 03:03:30 -0700
Subject: ipv6: avoid high order allocations

With mtu=9000, mld_newpack() use order-2 GFP_ATOMIC allocations, that
are very unreliable, on machines where PAGE_SIZE=4K

Limit allocated skbs to be at most one page. (order-0 allocations)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 59f1881..ab1622d 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1356,7 +1356,10 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
 		     IPV6_TLV_PADN, 0 };
 
 	/* we assume size > sizeof(ra) here */
-	skb = sock_alloc_send_skb(sk, size + LL_ALLOCATED_SPACE(dev), 1, &err);
+	size += LL_ALLOCATED_SPACE(dev);
+	/* limit our allocations to order-0 page */
+	size = min_t(int, size, SKB_MAX_ORDER(0, 0));
+	skb = sock_alloc_send_skb(sk, size, 1, &err);
 
 	if (!skb)
 		return NULL;
-- 
cgit v1.1


From 8ffb335e8d696affc04f963bf73ce2196f80edb9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 6 Jun 2010 15:34:40 -0700
Subject: ip6mr: fix a typo in ip6mr_for_each_table()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 073071f..89c0b07 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -120,7 +120,7 @@ static void mroute_clean_tables(struct mr6_table *mrt);
 static void ipmr_expire_process(unsigned long arg);
 
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
-#define ip6mr_for_each_table(mrt, met) \
+#define ip6mr_for_each_table(mrt, net) \
 	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
 
 static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
-- 
cgit v1.1