From 42cf93cd464e0df3c85d298c647411bae6d99e6e Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Feb 2006 13:37:35 -0800
Subject: [NETFILTER]: Fix bridge netfilter related in xfrm_lookup

The bridge-netfilter code attaches a fake dst_entry with dst->ops == NULL
to purely bridged packets. When these packets are SNATed and a policy
lookup is done, xfrm_lookup crashes because it tries to dereference
dst->ops.

Change xfrm_lookup not to dereference dst->ops before checking for the
DST_NOXFRM flag and set this flag in the fake dst_entry.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 1 +
 net/xfrm/xfrm_policy.c    | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 6bb0c7e..e060aad 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -90,6 +90,7 @@ static struct rtable __fake_rtable = {
 			.dev			= &__fake_net_device,
 			.path			= &__fake_rtable.u.dst,
 			.metrics		= {[RTAX_MTU - 1] = 1500},
+			.flags			= DST_NOXFRM,
 		}
 	},
 	.rt_flags	= 0,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5e6b05a..8206025 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -782,7 +782,7 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 	int nx = 0;
 	int err;
 	u32 genid;
-	u16 family = dst_orig->ops->family;
+	u16 family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 	u32 sk_sid = security_sk_sid(sk, fl, dir);
 restart:
@@ -796,13 +796,14 @@ restart:
 		if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
 			return 0;
 
-		policy = flow_cache_lookup(fl, sk_sid, family, dir,
-					   xfrm_policy_lookup);
+		policy = flow_cache_lookup(fl, sk_sid, dst_orig->ops->family,
+					   dir, xfrm_policy_lookup);
 	}
 
 	if (!policy)
 		return 0;
 
+	family = dst_orig->ops->family;
 	policy->curlft.use_time = (unsigned long)xtime.tv_sec;
 
 	switch (policy->action) {
-- 
cgit v1.1


From 85259878499d6c428cba191bb4e415a250dcd75a Mon Sep 17 00:00:00 2001
From: Suresh Bhogavilli <sbhogavilli@verisign.com>
Date: Tue, 21 Feb 2006 13:42:22 -0800
Subject: [IPV4]: Fix garbage collection of multipath route entries

When garbage collecting route cache entries of multipath routes
in rt_garbage_collect(), entries were deleted from the hash bucket
'i' while holding a spin lock on bucket 'k' resulting in a system
hang.  Delete entries, if any, from bucket 'k' instead.

Signed-off-by: Suresh Bhogavilli <sbhogavilli@verisign.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d82c242..fca5fe0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -835,7 +835,7 @@ static int rt_garbage_collect(void)
 					int r;
 
 					rthp = rt_remove_balanced_route(
-						&rt_hash_table[i].chain,
+						&rt_hash_table[k].chain,
 						rth,
 						&r);
 					goal -= r;
-- 
cgit v1.1


From 21380b81ef8699179b535e197a95b891a7badac7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 22 Feb 2006 14:47:13 -0800
Subject: [XFRM]: Eliminate refcounting confusion by creating
 __xfrm_state_put().

We often just do an atomic_dec(&x->refcnt) on an xfrm_state object
because we know there is more than 1 reference remaining and thus
we can elide the heavier xfrm_state_put() call.

Do this behind an inline function called __xfrm_state_put() so that is
more obvious and also to allow us to more cleanly add refcount
debugging later.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c      | 2 +-
 net/xfrm/xfrm_state.c | 8 ++++----
 net/xfrm/xfrm_user.c  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index ae86d23..b2d4d1d 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1423,7 +1423,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
-		xfrm_state_put(x);
+		__xfrm_state_put(x);
 		goto out;
 	}
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index e12d0be..c656cba 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -220,14 +220,14 @@ static int __xfrm_state_delete(struct xfrm_state *x)
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&xfrm_state_lock);
 		list_del(&x->bydst);
-		atomic_dec(&x->refcnt);
+		__xfrm_state_put(x);
 		if (x->id.spi) {
 			list_del(&x->byspi);
-			atomic_dec(&x->refcnt);
+			__xfrm_state_put(x);
 		}
 		spin_unlock(&xfrm_state_lock);
 		if (del_timer(&x->timer))
-			atomic_dec(&x->refcnt);
+			__xfrm_state_put(x);
 
 		/* The number two in this test is the reference
 		 * mentioned in the comment below plus the reference
@@ -243,7 +243,7 @@ static int __xfrm_state_delete(struct xfrm_state *x)
 		 * The xfrm_state_alloc call gives a reference, and that
 		 * is what we are dropping here.
 		 */
-		atomic_dec(&x->refcnt);
+		__xfrm_state_put(x);
 		err = 0;
 	}
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ac87a09..7de1755 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -345,7 +345,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
-		xfrm_state_put(x);
+		__xfrm_state_put(x);
 		goto out;
 	}
 
-- 
cgit v1.1


From f8d0e3f11593928ac3f968c378a44e80b04488c9 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Thu, 23 Feb 2006 16:18:01 -0800
Subject: [NET] ethernet: Fix first packet goes out with MAC 00:00:00:00:00:00

When you turn off ARP on a netdevice then the first packet always goes
out with a dstMAC of all zeroes. This is because the first packet is
used to resolve ARP entries. Even though the ARP entry may be resolved
(I tried by setting a static ARP entry for a host i was pinging from),
it gets overwritten by virtue of having the netdevice disabling ARP.

Subsequent packets go out fine with correct dstMAC address (which may
be why people have ignored reporting this issue).

To cut the story short:

the culprit code is in net/ethernet/eth.c::eth_header()

----
        /*
         *      Anyway, the loopback-device should never use this
function...
         */

        if (dev->flags & (IFF_LOOPBACK|IFF_NOARP))
        {
                memset(eth->h_dest, 0, dev->addr_len);
                return ETH_HLEN;
        }

	if(daddr)
        {
                memcpy(eth->h_dest,daddr,dev->addr_len);
                return ETH_HLEN;
        }

----

Note how the h_dest is being reset when device has IFF_NOARP.

As a note:
All devices including loopback pass a daddr. loopback in fact passes
a 0 all the time ;->
This means i can delete the check totaly or i can remove the IFF_NOARP

Alexey says:
--------------------
I think, it was me who did this crap. It was so long ago I do not remember
why it was made.

I remember some troubles with dummy device. It tried to resolve
addresses, apparently, without success and generated errors instead of
blackholing. I think the problem was eventually solved at neighbour
level.

After some thinking I suspect the deletion of this chunk could change
behaviour of some parts which do not use neighbour cache f.e. packet
socket.

I think safer approach would be to move this chunk after if (daddr).
And the possibility to remove this completely could be analyzed later.
--------------------

Patch updated with Alexey's safer suggestions.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Acked-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethernet/eth.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9890fd9..c971f14 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -95,6 +95,12 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
 		saddr = dev->dev_addr;
 	memcpy(eth->h_source,saddr,dev->addr_len);
 
+	if(daddr)
+	{
+		memcpy(eth->h_dest,daddr,dev->addr_len);
+		return ETH_HLEN;
+	}
+	
 	/*
 	 *	Anyway, the loopback-device should never use this function... 
 	 */
@@ -105,12 +111,6 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
 		return ETH_HLEN;
 	}
 	
-	if(daddr)
-	{
-		memcpy(eth->h_dest,daddr,dev->addr_len);
-		return ETH_HLEN;
-	}
-	
 	return -ETH_HLEN;
 }
 
-- 
cgit v1.1


From 4da3089f2b582b21e1374ccc6df722d4361eb915 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 23 Feb 2006 16:19:26 -0800
Subject: [IPSEC]: Use TOS when doing tunnel lookups

We should use the TOS because it's one of the routing keys.  It also
means that we update the correct routing cache entry when PMTU occurs.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/xfrm4_policy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 45f7ae5..f285bbf 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -35,6 +35,7 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
 		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
 	    	    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
+	    	    xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
 		    xfrm_bundle_ok(xdst, fl, AF_INET)) {
 			dst_clone(dst);
 			break;
@@ -61,7 +62,8 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 		.nl_u = {
 			.ip4_u = {
 				.saddr = local,
-				.daddr = remote
+				.daddr = remote,
+				.tos = fl->fl4_tos
 			}
 		}
 	};
@@ -230,6 +232,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
 	fl->proto = iph->protocol;
 	fl->fl4_dst = iph->daddr;
 	fl->fl4_src = iph->saddr;
+	fl->fl4_tos = iph->tos;
 }
 
 static inline int xfrm4_garbage_collect(void)
-- 
cgit v1.1


From 0c0888908dec145aaaa40d8a49d34913573f5a27 Mon Sep 17 00:00:00 2001
From: Hugo Santos <hsantos@av.it.pt>
Date: Fri, 24 Feb 2006 13:16:25 -0800
Subject: [IPV6] ip6_tunnel: release cached dst on change of tunnel params

The included patch fixes ip6_tunnel to release the cached dst entry
when the tunnel parameters (such as tunnel endpoints) are changed so
they are used immediatly for the next encapsulated packets.

Signed-off-by: Hugo Santos <hsantos@av.it.pt>
Acked-by: Ville Nuorvala <vnuorval@tcs.hut.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index faea8a1..4859753 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -884,6 +884,7 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
 	t->parms.encap_limit = p->encap_limit;
 	t->parms.flowinfo = p->flowinfo;
 	t->parms.link = p->link;
+	ip6_tnl_dst_reset(t);
 	ip6ip6_tnl_link_config(t);
 	return 0;
 }
-- 
cgit v1.1


From d91675f9c7f5752e8657df1e1d926bd6a624434f Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yosufuji@linux-ipv6.org>
Date: Fri, 24 Feb 2006 13:18:33 -0800
Subject: [IPV6]: Do not ignore IPV6_MTU socket option.

Based on patch by Hoerdt Mickael <hoerdt@clarinet.u-strasbg.fr>.

Signed-off-by: YOSHIFUJI Hideaki <yosufuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index efa3e72..f999edd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -494,6 +494,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	struct net_device *dev;
 	struct sk_buff *frag;
 	struct rt6_info *rt = (struct rt6_info*)skb->dst;
+	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 	struct ipv6hdr *tmp_hdr;
 	struct frag_hdr *fh;
 	unsigned int mtu, hlen, left, len;
@@ -505,7 +506,12 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
+	mtu = dst_mtu(&rt->u.dst);
+	if (np && np->frag_size < mtu) {
+		if (np->frag_size)
+			mtu = np->frag_size;
+	}
+	mtu -= hlen + sizeof(struct frag_hdr);
 
 	if (skb_shinfo(skb)->frag_list) {
 		int first_len = skb_pagelen(skb);
@@ -882,7 +888,12 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+		mtu = dst_mtu(rt->u.dst.path);
+		if (np && np->frag_size < mtu) {
+			if (np->frag_size)
+				mtu = np->frag_size;
+		}
+		inet->cork.fragsize = mtu;
 		if (dst_allfrag(rt->u.dst.path))
 			inet->cork.flags |= IPCORK_ALLFRAG;
 		inet->cork.length = 0;
-- 
cgit v1.1