From a9b3cd7f323b2e57593e7215362a7b02fc933e3a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 1 Aug 2011 16:19:00 +0000
Subject: rcu: convert uses of rcu_assign_pointer(x, NULL) to RCU_INIT_POINTER

When assigning a NULL value to an RCU protected pointer, no barrier
is needed. The rcu_assign_pointer, used to handle that but will soon
change to not handle the special case.

Convert all rcu_assign_pointer of NULL value.

//smpl
@@ expression P; @@

- rcu_assign_pointer(P, NULL)
+ RCU_INIT_POINTER(P, NULL)

// </smpl>

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')
diff --git a/net/core/dev.c b/net/core/dev.c
index 17d67b5..9428766 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3094,8 +3094,8 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 {
 
 	ASSERT_RTNL();
-	rcu_assign_pointer(dev->rx_handler, NULL);
-	rcu_assign_pointer(dev->rx_handler_data, NULL);
+	RCU_INIT_POINTER(dev->rx_handler, NULL);
+	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-- 
cgit v1.1


From 33d480ce6d43326e2541fd79b3548858a174ec3c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 11 Aug 2011 19:30:52 +0000
Subject: net: cleanup some rcu_dereference_raw

RCU api had been completed and rcu_access_pointer() or
rcu_dereference_protected() are better than generic
rcu_dereference_raw()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 9428766..d22ffd7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2673,13 +2673,13 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
 		if (map->len == 1 &&
-		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
+		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
 			tcpu = map->cpus[0];
 			if (cpu_online(tcpu))
 				cpu = tcpu;
 			goto done;
 		}
-	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
 		goto done;
 	}
 
@@ -5727,8 +5727,8 @@ void netdev_run_todo(void)
 
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
-		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
-		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
+		WARN_ON(rcu_access_pointer(dev->ip_ptr));
+		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
 
 		if (dev->destructor)
@@ -5932,7 +5932,7 @@ void free_netdev(struct net_device *dev)
 	kfree(dev->_rx);
 #endif
 
-	kfree(rcu_dereference_raw(dev->ingress_queue));
+	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 
 	/* Flush device addresses */
 	dev_addr_flush(dev);
-- 
cgit v1.1


From 792df22cd0499b4e662d4618b0008fdcfef8b04e Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Sun, 14 Aug 2011 19:45:04 +0000
Subject: rps: Some minor cleanup in get_rps_cpus

Use some variables for clarity and extensibility.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index d22ffd7..6578d94 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2528,15 +2528,17 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 	const struct ipv6hdr *ip6;
 	const struct iphdr *ip;
 	u8 ip_proto;
-	u32 addr1, addr2, ihl;
+	u32 addr1, addr2;
+	u16 proto;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;
 
 	nhoff = skb_network_offset(skb);
+	proto = skb->protocol;
 
-	switch (skb->protocol) {
+	switch (proto) {
 	case __constant_htons(ETH_P_IP):
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
@@ -2548,7 +2550,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 			ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
-		ihl = ip->ihl;
+		nhoff += ip->ihl * 4;
 		break;
 	case __constant_htons(ETH_P_IPV6):
 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
@@ -2558,7 +2560,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
-		ihl = (40 >> 2);
+		nhoff += 40;
 		break;
 	default:
 		goto done;
@@ -2567,7 +2569,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 	ports.v32 = 0;
 	poff = proto_ports_offset(ip_proto);
 	if (poff >= 0) {
-		nhoff += ihl * 4 + poff;
+		nhoff += poff;
 		if (pskb_may_pull(skb, nhoff + 4)) {
 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
-- 
cgit v1.1


From bdeab991918663aed38757904219e8398214334c Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Sun, 14 Aug 2011 19:45:55 +0000
Subject: rps: Add flag to skb to indicate rxhash is based on L4 tuple

The l4_rxhash flag was added to the skb structure to indicate
that the rxhash value was computed over the 4 tuple for the
packet which includes the port information in the encapsulated
transport packet.  This is used by the stack to preserve the
rxhash value in __skb_rx_tunnel.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6578d94..e485cb3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2519,10 +2519,11 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 
 /*
  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Returns a non-zero hash number on success
- * and 0 on failure.
+ * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
+ * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
  */
-__u32 __skb_get_rxhash(struct sk_buff *skb)
+void __skb_get_rxhash(struct sk_buff *skb)
 {
 	int nhoff, hash = 0, poff;
 	const struct ipv6hdr *ip6;
@@ -2574,6 +2575,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
+			skb->l4_rxhash = 1;
 		}
 	}
 
@@ -2586,7 +2588,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		hash = 1;
 
 done:
-	return hash;
+	skb->rxhash = hash;
 }
 EXPORT_SYMBOL(__skb_get_rxhash);
 
-- 
cgit v1.1


From e971b7225bcb1f318811ef04628c441497372999 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Sun, 14 Aug 2011 19:46:12 +0000
Subject: rps: Infrastructure in __skb_get_rxhash for deep inspection

Basics for looking for ports in encapsulated packets in tunnels.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index e485cb3..4bee9a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -133,6 +133,7 @@
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
+#include <linux/if_tunnel.h>
 
 #include "net-sysfs.h"
 
@@ -2539,6 +2540,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 	nhoff = skb_network_offset(skb);
 	proto = skb->protocol;
 
+again:
 	switch (proto) {
 	case __constant_htons(ETH_P_IP):
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
@@ -2567,6 +2569,11 @@ void __skb_get_rxhash(struct sk_buff *skb)
 		goto done;
 	}
 
+	switch (ip_proto) {
+	default:
+		break;
+	}
+
 	ports.v32 = 0;
 	poff = proto_ports_offset(ip_proto);
 	if (poff >= 0) {
-- 
cgit v1.1


From c6865cb3cc6f3c2857fa4c6f5fda2945d70b1e84 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Sun, 14 Aug 2011 19:46:29 +0000
Subject: rps: Inspect GRE encapsulated packets to get flow hash

Crack open GRE packets in __skb_get_rxhash to compute 4-tuple hash on
in encapsulated packet.  Note that this is used only when the
__skb_get_rxhash is taken, in particular only when the device does
not compute provide the rxhash (ie. feature is disabled).

This was tested by creating a single GRE tunnel between two 16 core
AMD machines.  200 netperf TCP_RR streams were ran with 1 byte
request and response size.

Without patch: 157497 tps, 50/90/99% latencies 1250/1292/1364 usecs
With patch: 325896 tps, 50/90/99% latencies 603/848/1169

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4bee9a9..a8d91a5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2570,6 +2570,28 @@ again:
 	}
 
 	switch (ip_proto) {
+	case IPPROTO_GRE:
+		if (pskb_may_pull(skb, nhoff + 16)) {
+			u8 *h = skb->data + nhoff;
+			__be16 flags = *(__be16 *)h;
+
+			/*
+			 * Only look inside GRE if version zero and no
+			 * routing
+			 */
+			if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
+				proto = *(__be16 *)(h + 2);
+				nhoff += 4;
+				if (flags & GRE_CSUM)
+					nhoff += 4;
+				if (flags & GRE_KEY)
+					nhoff += 4;
+				if (flags & GRE_SEQ)
+					nhoff += 4;
+				goto again;
+			}
+		}
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.1


From 01789349ee52e4a3faf376f1485303d9723c4f1f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 16 Aug 2011 06:29:00 +0000
Subject: net: introduce IFF_UNICAST_FLT private flag

Use IFF_UNICAST_FTL to find out if driver handles unicast address
filtering. In case it does not, promisc mode is entered.

Patch also fixes following drivers:
stmmac, niu: support uc filtering and yet it propagated
	ndo_set_multicast_list
bna, benet, pxa168_eth, ks8851, ks8851_mll, ksz884x : has set
	ndo_set_rx_mode but do not support uc filtering

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index a8d91a5..6eb03fd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4522,9 +4522,7 @@ void __dev_set_rx_mode(struct net_device *dev)
 	if (!netif_device_present(dev))
 		return;
 
-	if (ops->ndo_set_rx_mode)
-		ops->ndo_set_rx_mode(dev);
-	else {
+	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 		/* Unicast addresses changes may only happen under the rtnl,
 		 * therefore calling __dev_set_promiscuity here is safe.
 		 */
@@ -4535,10 +4533,12 @@ void __dev_set_rx_mode(struct net_device *dev)
 			__dev_set_promiscuity(dev, -1);
 			dev->uc_promisc = false;
 		}
-
-		if (ops->ndo_set_multicast_list)
-			ops->ndo_set_multicast_list(dev);
 	}
+
+	if (ops->ndo_set_rx_mode)
+		ops->ndo_set_rx_mode(dev);
+	else if (ops->ndo_set_multicast_list)
+		ops->ndo_set_multicast_list(dev);
 }
 
 void dev_set_rx_mode(struct net_device *dev)
-- 
cgit v1.1


From b81693d9149c598302e8eb9c20cb20330d922c8e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 16 Aug 2011 06:29:02 +0000
Subject: net: remove ndo_set_multicast_list callback

Remove no longer used operation.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6eb03fd..ead0366 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4537,8 +4537,6 @@ void __dev_set_rx_mode(struct net_device *dev)
 
 	if (ops->ndo_set_rx_mode)
 		ops->ndo_set_rx_mode(dev);
-	else if (ops->ndo_set_multicast_list)
-		ops->ndo_set_multicast_list(dev);
 }
 
 void dev_set_rx_mode(struct net_device *dev)
@@ -4888,7 +4886,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		return -EOPNOTSUPP;
 
 	case SIOCADDMULTI:
-		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+		if (!ops->ndo_set_rx_mode ||
 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 			return -EINVAL;
 		if (!netif_device_present(dev))
@@ -4896,7 +4894,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
 
 	case SIOCDELMULTI:
-		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+		if (!ops->ndo_set_rx_mode ||
 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 			return -EINVAL;
 		if (!netif_device_present(dev))
-- 
cgit v1.1


From 1ff1986fc94ee711df3cf19d45f2abf351436a6d Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 18 Aug 2011 22:07:54 -0700
Subject: net: rps: support 802.1Q

For the 802.1Q packets, if the NIC doesn't support hw-accel-vlan-rx, RPS
won't inspect the internal 4 tuples to generate skb->rxhash, so this kind
of traffic can't get any benefit from RPS.

This patch adds the support for 802.1Q to RPS.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index ead0366..be7ee50 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2529,6 +2529,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 	int nhoff, hash = 0, poff;
 	const struct ipv6hdr *ip6;
 	const struct iphdr *ip;
+	const struct vlan_hdr *vlan;
 	u8 ip_proto;
 	u32 addr1, addr2;
 	u16 proto;
@@ -2565,6 +2566,13 @@ again:
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
 		nhoff += 40;
 		break;
+	case __constant_htons(ETH_P_8021Q):
+		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
+			goto done;
+		vlan = (const struct vlan_hdr *) (skb->data + nhoff);
+		proto = vlan->h_vlan_encapsulated_proto;
+		nhoff += sizeof(*vlan);
+		goto again;
 	default:
 		goto done;
 	}
-- 
cgit v1.1


From ae1511bf769cafeae5ab61aaf9947a16a22cbd10 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 18 Aug 2011 23:23:47 -0700
Subject: net: rps: support PPPOE session messages

Inspect the payload of PPPOE session messages for the 4 tuples to generate
skb->rxhash.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index be7ee50..c2442b4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -134,6 +134,7 @@
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
 #include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
 
 #include "net-sysfs.h"
 
@@ -2573,6 +2574,13 @@ again:
 		proto = vlan->h_vlan_encapsulated_proto;
 		nhoff += sizeof(*vlan);
 		goto again;
+	case __constant_htons(ETH_P_PPP_SES):
+		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
+			goto done;
+		proto = *((__be16 *) (skb->data + nhoff +
+				      sizeof(struct pppoe_hdr)));
+		nhoff += PPPOE_SES_HLEN;
+		goto again;
 	default:
 		goto done;
 	}
-- 
cgit v1.1


From 0dfe178239453547d4297a4583ee7847948a481b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Mon, 22 Aug 2011 12:43:22 -0700
Subject: net: vlan: goto another_round instead of calling __netif_receive_skb

Now, when vlan tag on untagged in non-accelerated path is stripped from
skb, headers are reset right away. Benefit from that and avoid calling
__netif_receive_skb recursivelly and just use another_round.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index c2442b4..a4306f7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3236,10 +3236,9 @@ ncls:
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
-		if (vlan_do_receive(&skb)) {
-			ret = __netif_receive_skb(skb);
-			goto out;
-		} else if (unlikely(!skb))
+		if (vlan_do_receive(&skb))
+			goto another_round;
+		else if (unlikely(!skb))
 			goto out;
 	}
 
-- 
cgit v1.1


From ec5efe7946280d1e84603389a1030ccec0a767ae Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 24 Aug 2011 10:41:19 +0000
Subject: rps: support IPIP encapsulation

Skip IPIP header to get proper layer-4 information.

Like GRE tunnels, this only works if rxhash is not already provided by
the device itself (ethtool -K ethX rxhash off), to allow kernel compute
a software rxhash.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index a4306f7..b668a3d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2608,6 +2608,8 @@ again:
 			}
 		}
 		break;
+	case IPPROTO_IPIP:
+		goto again;
 	default:
 		break;
 	}
-- 
cgit v1.1


From ea2ab69379a941c6f8884e290fdd28c93936a778 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Mon, 22 Aug 2011 23:44:58 +0000
Subject: net: convert core to skb paged frag APIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index b668a3d..b2e262e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1949,9 +1949,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #ifdef CONFIG_HIGHMEM
 	int i;
 	if (!(dev->features & NETIF_F_HIGHDMA)) {
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			if (PageHighMem(skb_frag_page(frag)))
 				return 1;
+		}
 	}
 
 	if (PCI_DMA_BUS_IS_PHYS) {
@@ -1960,7 +1962,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 		if (!pdev)
 			return 0;
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
 				return 1;
 		}
@@ -3474,7 +3477,7 @@ pull:
 		skb_shinfo(skb)->frags[0].size -= grow;
 
 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
-			put_page(skb_shinfo(skb)->frags[0].page);
+			skb_frag_unref(skb, 0);
 			memmove(skb_shinfo(skb)->frags,
 				skb_shinfo(skb)->frags + 1,
 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
@@ -3538,10 +3541,9 @@ void skb_gro_reset_offset(struct sk_buff *skb)
 	NAPI_GRO_CB(skb)->frag0_len = 0;
 
 	if (skb->mac_header == skb->tail &&
-	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
+	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
 		NAPI_GRO_CB(skb)->frag0 =
-			page_address(skb_shinfo(skb)->frags[0].page) +
-			skb_shinfo(skb)->frags[0].page_offset;
+			skb_frag_address(&skb_shinfo(skb)->frags[0]);
 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
 	}
 }
-- 
cgit v1.1


From 4bc71cb983fd2844e603bf633df2bb53385182d2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Sat, 3 Sep 2011 03:34:30 +0000
Subject: net: consolidate and fix ethtool_ops->get_settings calling

This patch does several things:
- introduces __ethtool_get_settings which is called from ethtool code and
  from drivers as well. Put ASSERT_RTNL there.
- dev_ethtool_get_settings() is replaced by __ethtool_get_settings()
- changes calling in drivers so rtnl locking is respected. In
  iboe_get_rate was previously ->get_settings() called unlocked. This
  fixes it. Also prb_calc_retire_blk_tmo() in af_packet.c had the same
  problem. Also fixed by calling __dev_get_by_index() instead of
  dev_get_by_index() and holding rtnl_lock for both calls.
- introduces rtnl_lock in bnx2fc_vport_create() and fcoe_vport_create()
  so bnx2fc_if_create() and fcoe_if_create() are called locked as they
  are from other places.
- use __ethtool_get_settings() in bonding code

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

v2->v3:
	-removed dev_ethtool_get_settings()
	-added ASSERT_RTNL into __ethtool_get_settings()
	-prb_calc_retire_blk_tmo - use __dev_get_by_index() and lock
	 around it and __ethtool_get_settings() call
v1->v2:
        add missing export_symbol
Reviewed-by: Ben Hutchings <bhutchings@solarflare.com> [except FCoE bits]
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index b2e262e..4b9981c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4566,30 +4566,6 @@ void dev_set_rx_mode(struct net_device *dev)
 }
 
 /**
- *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
- *	@dev: device
- *	@cmd: memory area for ethtool_ops::get_settings() result
- *
- *      The cmd arg is initialized properly (cleared and
- *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
- *
- *	Return device's ethtool_ops::get_settings() result value or
- *	-EOPNOTSUPP when device doesn't expose
- *	ethtool_ops::get_settings() operation.
- */
-int dev_ethtool_get_settings(struct net_device *dev,
-			     struct ethtool_cmd *cmd)
-{
-	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
-		return -EOPNOTSUPP;
-
-	memset(cmd, 0, sizeof(struct ethtool_cmd));
-	cmd->cmd = ETHTOOL_GSET;
-	return dev->ethtool_ops->get_settings(dev, cmd);
-}
-EXPORT_SYMBOL(dev_ethtool_get_settings);
-
-/**
  *	dev_get_flags - get flags reported to userspace
  *	@dev: device
  *
-- 
cgit v1.1


From 5dd17e08f333cde0fa11000792e33d8d39b5599f Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 20 Sep 2011 22:36:07 +0000
Subject: net: rps: fix the support for PPPOE

The upper protocol numbers of PPPOE are different, and should be treated
specially.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index bf49a47..7f4486e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
 #include <linux/cpu_rmap.h>
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
 
 #include "net-sysfs.h"
 
@@ -2556,6 +2557,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 again:
 	switch (proto) {
 	case __constant_htons(ETH_P_IP):
+ip:
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
 
@@ -2569,6 +2571,7 @@ again:
 		nhoff += ip->ihl * 4;
 		break;
 	case __constant_htons(ETH_P_IPV6):
+ipv6:
 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;
 
@@ -2591,7 +2594,14 @@ again:
 		proto = *((__be16 *) (skb->data + nhoff +
 				      sizeof(struct pppoe_hdr)));
 		nhoff += PPPOE_SES_HLEN;
-		goto again;
+		switch (proto) {
+		case __constant_htons(PPP_IP):
+			goto ip;
+		case __constant_htons(PPP_IPV6):
+			goto ipv6;
+		default:
+			goto done;
+		}
 	default:
 		goto done;
 	}
-- 
cgit v1.1


From 09994d1b09bd9b0046a4708fa50d2106610a4058 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 3 Oct 2011 04:42:46 +0000
Subject: RPS: Ensure that an expired hardware filter can be re-added later

Amir Vadai wrote:
> When a stream is paused, and its rule is expired while it is paused,
> no new rule will be configured to the HW when traffic resume.
[...]
> - When stream was resumed, traffic was steered again by RSS, and
> because current-cpu was equal to desired-cpu,  ndo_rx_flow_steer
> wasn't called and no rule was configured to the HW.

Fix this by setting the flow's current CPU only in the table for the
newly selected RX queue.

Reported-and-tested-by: Amir Vadai <amirv@dev.mellanox.co.il>
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7f4486e..70ecb86 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2670,10 +2670,7 @@ static struct rps_dev_flow *
 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	    struct rps_dev_flow *rflow, u16 next_cpu)
 {
-	u16 tcpu;
-
-	tcpu = rflow->cpu = next_cpu;
-	if (tcpu != RPS_NO_CPU) {
+	if (next_cpu != RPS_NO_CPU) {
 #ifdef CONFIG_RFS_ACCEL
 		struct netdev_rx_queue *rxqueue;
 		struct rps_dev_flow_table *flow_table;
@@ -2701,16 +2698,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 			goto out;
 		old_rflow = rflow;
 		rflow = &flow_table->flows[flow_id];
-		rflow->cpu = next_cpu;
 		rflow->filter = rc;
 		if (old_rflow->filter == rflow->filter)
 			old_rflow->filter = RPS_NO_FILTER;
 	out:
 #endif
 		rflow->last_qtail =
-			per_cpu(softnet_data, tcpu).input_queue_head;
+			per_cpu(softnet_data, next_cpu).input_queue_head;
 	}
 
+	rflow->cpu = next_cpu;
 	return rflow;
 }
 
-- 
cgit v1.1


From 2425717b27eb92b175335ca4ff0bb218cbe0cb64 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Mon, 10 Oct 2011 09:16:41 +0000
Subject: net: allow vlan traffic to be received under bond

The following configuration used to work as I expected. At least
we could use the fcoe interfaces to do MPIO and the bond0 iface
to do load balancing or failover.

       ---eth2.228-fcoe
       |
eth2 -----|
          |
          |---- bond0
          |
eth3 -----|
       |
       ---eth3.228-fcoe

This worked because of a change we added to allow inactive slaves
to rx 'exact' matches. This functionality was kept intact with the
rx_handler mechanism. However now the vlan interface attached to the
active slave never receives traffic because the bonding rx_handler
updates the skb->dev and goto's another_round. Previously, the
vlan_do_receive() logic was called before the bonding rx_handler.

Now by the time vlan_do_receive calls vlan_find_dev() the
skb->dev is set to bond0 and it is clear no vlan is attached
to this iface. The vlan lookup fails.

This patch moves the VLAN check above the rx_handler. A VLAN
tagged frame is now routed to the eth2.228-fcoe iface in the
above schematic. Untagged frames continue to the bond0 as
normal. This case also remains intact,

eth2 --> bond0 --> vlan.228

Here the skb is VLAN tagged but the vlan lookup fails on eth2
causing the bonding rx_handler to be called. On the second
pass the vlan lookup is on the bond0 iface and completes as
expected.

Putting a VLAN.228 on both the bond0 and eth2 device will
result in eth2.228 receiving the skb. I don't think this is
completely unexpected and was the result prior to the rx_handler
result.

Note, the same setup is also used for other storage traffic that
MPIO is used with eg. iSCSI and similar setups can be contrived
without storage protocols.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Reviewed-by: Jiri Pirko <jpirko@redhat.com>
Tested-by: Hans Schillstrom <hams.schillstrom@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 70ecb86..8b6118a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3231,6 +3231,17 @@ another_round:
 ncls:
 #endif
 
+	if (vlan_tx_tag_present(skb)) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		if (vlan_do_receive(&skb))
+			goto another_round;
+		else if (unlikely(!skb))
+			goto out;
+	}
+
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
@@ -3251,17 +3262,6 @@ ncls:
 		}
 	}
 
-	if (vlan_tx_tag_present(skb)) {
-		if (pt_prev) {
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = NULL;
-		}
-		if (vlan_do_receive(&skb))
-			goto another_round;
-		else if (unlikely(!skb))
-			goto out;
-	}
-
 	/* deliver only exact match when indicated */
 	null_or_dev = deliver_exact ? skb->dev : NULL;
 
-- 
cgit v1.1


From 9e903e085262ffbf1fc44a17ac06058aca03524a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 18 Oct 2011 21:00:24 +0000
Subject: net: add skb frag size accessors

To ease skb->truesize sanitization, its better to be able to localize
all references to skb frags size.

Define accessors : skb_frag_size() to fetch frag size, and
skb_frag_size_{set|add|sub}() to manipulate it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8b6118a..cbb5918 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3489,9 +3489,9 @@ pull:
 		skb->data_len -= grow;
 
 		skb_shinfo(skb)->frags[0].page_offset += grow;
-		skb_shinfo(skb)->frags[0].size -= grow;
+		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
 
-		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
+		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
 			skb_frag_unref(skb, 0);
 			memmove(skb_shinfo(skb)->frags,
 				skb_shinfo(skb)->frags + 1,
@@ -3559,7 +3559,7 @@ void skb_gro_reset_offset(struct sk_buff *skb)
 	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
 		NAPI_GRO_CB(skb)->frag0 =
 			skb_frag_address(&skb_shinfo(skb)->frags[0]);
-		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
+		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
 	}
 }
 EXPORT_SYMBOL(skb_gro_reset_offset);
-- 
cgit v1.1


From 850a545bd8a41648445bfc5541afe36ca105b0b8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 13 Oct 2011 22:25:23 +0000
Subject: net: Move rcu_barrier from rollback_registered_many to
 netdev_run_todo.

This patch moves the rcu_barrier from rollback_registered_many
(inside the rtnl_lock) into netdev_run_todo (just outside the rtnl_lock).
This allows us to gain the full benefit of sychronize_net calling
synchronize_rcu_expedited when the rtnl_lock is held.

The rcu_barrier in rollback_registered_many was originally a synchronize_net
but was promoted to be a rcu_barrier() when it was found that people were
unnecessarily hitting the 250ms wait in netdev_wait_allrefs().  Changing
the rcu_barrier back to a synchronize_net is therefore safe.

Since we only care about waiting for the rcu callbacks before we get
to netdev_wait_allrefs() it is also safe to move the wait into
netdev_run_todo.

This was tested by creating and destroying 1000 tap devices and observing
/proc/lock_stat.  /proc/lock_stat reports this change reduces the hold
times of the rtnl_lock by a factor of 10.  There was no observable
difference in the amount of time it takes to destroy a network device.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index cbb5918..09aef26 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5235,7 +5235,7 @@ static void rollback_registered_many(struct list_head *head)
 	dev = list_first_entry(head, struct net_device, unreg_list);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
-	rcu_barrier();
+	synchronize_net();
 
 	list_for_each_entry(dev, head, unreg_list)
 		dev_put(dev);
@@ -5748,6 +5748,12 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
+	/* Wait for rcu callbacks to finish before attempting to drain
+	 * the device list.  This usually avoids a 250ms wait.
+	 */
+	if (!list_empty(&list))
+		rcu_barrier();
+
 	while (!list_empty(&list)) {
 		struct net_device *dev
 			= list_first_entry(&list, struct net_device, todo_list);
-- 
cgit v1.1


From 4dc360c5e7e155373bffbb3c1f7ea0022dee650c Mon Sep 17 00:00:00 2001
From: Richard Cochran <richard.cochran@omicron.at>
Date: Wed, 19 Oct 2011 17:00:35 -0400
Subject: net: validate HWTSTAMP ioctl parameters

This patch adds a sanity check on the values provided by user space for
the hardware time stamping configuration. If the values lie outside of
the absolute limits, then the ioctl request will be denied.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 09aef26..40d4395 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -136,6 +136,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
+#include <linux/net_tstamp.h>
 
 #include "net-sysfs.h"
 
@@ -1477,6 +1478,57 @@ static inline void net_timestamp_check(struct sk_buff *skb)
 		__net_timestamp(skb);
 }
 
+static int net_hwtstamp_validate(struct ifreq *ifr)
+{
+	struct hwtstamp_config cfg;
+	enum hwtstamp_tx_types tx_type;
+	enum hwtstamp_rx_filters rx_filter;
+	int tx_type_valid = 0;
+	int rx_filter_valid = 0;
+
+	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+		return -EFAULT;
+
+	if (cfg.flags) /* reserved for future extensions */
+		return -EINVAL;
+
+	tx_type = cfg.tx_type;
+	rx_filter = cfg.rx_filter;
+
+	switch (tx_type) {
+	case HWTSTAMP_TX_OFF:
+	case HWTSTAMP_TX_ON:
+	case HWTSTAMP_TX_ONESTEP_SYNC:
+		tx_type_valid = 1;
+		break;
+	}
+
+	switch (rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		rx_filter_valid = 1;
+		break;
+	}
+
+	if (!tx_type_valid || !rx_filter_valid)
+		return -ERANGE;
+
+	return 0;
+}
+
 static inline bool is_skb_forwardable(struct net_device *dev,
 				      struct sk_buff *skb)
 {
@@ -4921,6 +4973,12 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
 		return dev_change_name(dev, ifr->ifr_newname);
 
+	case SIOCSHWTSTAMP:
+		err = net_hwtstamp_validate(ifr);
+		if (err)
+			return err;
+		/* fall through */
+
 	/*
 	 *	Unknown or private ioctl
 	 */
-- 
cgit v1.1


From f04565ddf52e401880f8ba51de0dff8ba51c99fd Mon Sep 17 00:00:00 2001
From: Mihai Maruseac <mihai.maruseac@gmail.com>
Date: Thu, 20 Oct 2011 20:45:10 +0000
Subject: dev: use name hash for dev_seq_ops

Instead of using the dev->next chain and trying to resync at each call to
dev_seq_start, use the name hash, keeping the bucket and the offset in
seq->private field.

Tests revealed the following results for ifconfig > /dev/null
	* 1000 interfaces:
		* 0.114s without patch
		* 0.089s with patch
	* 3000 interfaces:
		* 0.489s without patch
		* 0.110s with patch
	* 5000 interfaces:
		* 1.363s without patch
		* 0.250s with patch
	* 128000 interfaces (other setup):
		* ~100s without patch
		* ~30s with patch

Signed-off-by: Mihai Maruseac <mmaruseac@ixiacom.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 15 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 40d4395..ad5d702 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4093,6 +4093,60 @@ static int dev_ifconf(struct net *net, char __user *arg)
 }
 
 #ifdef CONFIG_PROC_FS
+
+#define BUCKET_SPACE (32 - NETDEV_HASHBITS)
+
+struct dev_iter_state {
+	struct seq_net_private p;
+	unsigned int pos; /* bucket << BUCKET_SPACE + offset */
+};
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
+
+static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
+{
+	struct dev_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct net_device *dev;
+	struct hlist_node *p;
+	struct hlist_head *h;
+	unsigned int count, bucket, offset;
+
+	bucket = get_bucket(state->pos);
+	offset = get_offset(state->pos);
+	h = &net->dev_name_head[bucket];
+	count = 0;
+	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
+		if (count++ == offset) {
+			state->pos = set_bucket_offset(bucket, count);
+			return dev;
+		}
+	}
+
+	return NULL;
+}
+
+static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
+{
+	struct dev_iter_state *state = seq->private;
+	struct net_device *dev;
+	unsigned int bucket;
+
+	bucket = get_bucket(state->pos);
+	do {
+		dev = dev_from_same_bucket(seq);
+		if (dev)
+			return dev;
+
+		bucket++;
+		state->pos = set_bucket_offset(bucket, 0);
+	} while (bucket < NETDEV_HASHENTRIES);
+
+	return NULL;
+}
+
 /*
  *	This is invoked by the /proc filesystem handler to display a device
  *	in detail.
@@ -4100,33 +4154,33 @@ static int dev_ifconf(struct net *net, char __user *arg)
 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
-	struct net *net = seq_file_net(seq);
-	loff_t off;
-	struct net_device *dev;
+	struct dev_iter_state *state = seq->private;
 
 	rcu_read_lock();
 	if (!*pos)
 		return SEQ_START_TOKEN;
 
-	off = 1;
-	for_each_netdev_rcu(net, dev)
-		if (off++ == *pos)
-			return dev;
+	/* check for end of the hash */
+	if (state->pos == 0 && *pos > 1)
+		return NULL;
 
-	return NULL;
+	return dev_from_new_bucket(seq);
 }
 
 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct net_device *dev = v;
+	struct net_device *dev;
+
+	++*pos;
 
 	if (v == SEQ_START_TOKEN)
-		dev = first_net_device_rcu(seq_file_net(seq));
-	else
-		dev = next_net_device_rcu(dev);
+		return dev_from_new_bucket(seq);
 
-	++*pos;
-	return dev;
+	dev = dev_from_same_bucket(seq);
+	if (dev)
+		return dev;
+
+	return dev_from_new_bucket(seq);
 }
 
 void dev_seq_stop(struct seq_file *seq, void *v)
@@ -4225,7 +4279,7 @@ static const struct seq_operations dev_seq_ops = {
 static int dev_seq_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &dev_seq_ops,
-			    sizeof(struct seq_net_private));
+			    sizeof(struct dev_iter_state));
 }
 
 static const struct file_operations dev_seq_fops = {
-- 
cgit v1.1


From d2237d35748e7f448a9c2d9dc6a85ef637466e24 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 21 Oct 2011 06:24:20 +0000
Subject: rtnetlink: Add missing manual netlink notification in
 dev_change_net_namespaces

Renato Westphal noticed that since commit a2835763e130c343ace5320c20d33c281e7097b7
"rtnetlink: handle rtnl_link netlink notifications manually" was merged
we no longer send a netlink message when a networking device is moved
from one network namespace to another.

Fix this by adding the missing manual notification in dev_change_net_namespaces.

Since all network devices that are processed by dev_change_net_namspaces are
in the initialized state the complicated tests that guard the manual
rtmsg_ifinfo calls in rollback_registered and register_netdevice are
unnecessary and we can just perform a plain notification.

Cc: stable@kernel.org
Tested-by: Renato Westphal <renatowestphal@gmail.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index b10ff0a..ae5cf2d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6115,6 +6115,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
 	/*
 	 *	Flush the unicast and multicast chains
-- 
cgit v1.1