From a9b3cd7f323b2e57593e7215362a7b02fc933e3a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 1 Aug 2011 16:19:00 +0000 Subject: rcu: convert uses of rcu_assign_pointer(x, NULL) to RCU_INIT_POINTER When assigning a NULL value to an RCU protected pointer, no barrier is needed. The rcu_assign_pointer, used to handle that but will soon change to not handle the special case. Convert all rcu_assign_pointer of NULL value. //smpl @@ expression P; @@ - rcu_assign_pointer(P, NULL) + RCU_INIT_POINTER(P, NULL) // Signed-off-by: Stephen Hemminger Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 17d67b5..9428766 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3094,8 +3094,8 @@ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); - rcu_assign_pointer(dev->rx_handler, NULL); - rcu_assign_pointer(dev->rx_handler_data, NULL); + RCU_INIT_POINTER(dev->rx_handler, NULL); + RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); -- cgit v1.1 From 33d480ce6d43326e2541fd79b3548858a174ec3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Aug 2011 19:30:52 +0000 Subject: net: cleanup some rcu_dereference_raw RCU api had been completed and rcu_access_pointer() or rcu_dereference_protected() are better than generic rcu_dereference_raw() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9428766..d22ffd7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2673,13 +2673,13 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, map = rcu_dereference(rxqueue->rps_map); if (map) { if (map->len == 1 && - !rcu_dereference_raw(rxqueue->rps_flow_table)) { + !rcu_access_pointer(rxqueue->rps_flow_table)) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; goto done; } - } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { + } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { goto done; } @@ -5727,8 +5727,8 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); - WARN_ON(rcu_dereference_raw(dev->ip_ptr)); - WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); + WARN_ON(rcu_access_pointer(dev->ip_ptr)); + WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) @@ -5932,7 +5932,7 @@ void free_netdev(struct net_device *dev) kfree(dev->_rx); #endif - kfree(rcu_dereference_raw(dev->ingress_queue)); + kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); -- cgit v1.1 From 792df22cd0499b4e662d4618b0008fdcfef8b04e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 14 Aug 2011 19:45:04 +0000 Subject: rps: Some minor cleanup in get_rps_cpus Use some variables for clarity and extensibility. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d22ffd7..6578d94 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2528,15 +2528,17 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) const struct ipv6hdr *ip6; const struct iphdr *ip; u8 ip_proto; - u32 addr1, addr2, ihl; + u32 addr1, addr2; + u16 proto; union { u32 v32; u16 v16[2]; } ports; nhoff = skb_network_offset(skb); + proto = skb->protocol; - switch (skb->protocol) { + switch (proto) { case __constant_htons(ETH_P_IP): if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; @@ -2548,7 +2550,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; - ihl = ip->ihl; + nhoff += ip->ihl * 4; break; case __constant_htons(ETH_P_IPV6): if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) @@ -2558,7 +2560,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; - ihl = (40 >> 2); + nhoff += 40; break; default: goto done; @@ -2567,7 +2569,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) ports.v32 = 0; poff = proto_ports_offset(ip_proto); if (poff >= 0) { - nhoff += ihl * 4 + poff; + nhoff += poff; if (pskb_may_pull(skb, nhoff + 4)) { ports.v32 = * (__force u32 *) (skb->data + nhoff); if (ports.v16[1] < ports.v16[0]) -- cgit v1.1 From bdeab991918663aed38757904219e8398214334c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 14 Aug 2011 19:45:55 +0000 Subject: rps: Add flag to skb to indicate rxhash is based on L4 tuple The l4_rxhash flag was added to the skb structure to indicate that the rxhash value was computed over the 4 tuple for the packet which includes the port information in the encapsulated transport packet. This is used by the stack to preserve the rxhash value in __skb_rx_tunnel. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6578d94..e485cb3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2519,10 +2519,11 @@ static inline void ____napi_schedule(struct softnet_data *sd, /* * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Returns a non-zero hash number on success - * and 0 on failure. + * and src/dst port numbers. Sets rxhash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb + * if hash is a canonical 4-tuple hash over transport ports. */ -__u32 __skb_get_rxhash(struct sk_buff *skb) +void __skb_get_rxhash(struct sk_buff *skb) { int nhoff, hash = 0, poff; const struct ipv6hdr *ip6; @@ -2574,6 +2575,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) ports.v32 = * (__force u32 *) (skb->data + nhoff); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); + skb->l4_rxhash = 1; } } @@ -2586,7 +2588,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) hash = 1; done: - return hash; + skb->rxhash = hash; } EXPORT_SYMBOL(__skb_get_rxhash); -- cgit v1.1 From e971b7225bcb1f318811ef04628c441497372999 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 14 Aug 2011 19:46:12 +0000 Subject: rps: Infrastructure in __skb_get_rxhash for deep inspection Basics for looking for ports in encapsulated packets in tunnels. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e485cb3..4bee9a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -133,6 +133,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2539,6 +2540,7 @@ void __skb_get_rxhash(struct sk_buff *skb) nhoff = skb_network_offset(skb); proto = skb->protocol; +again: switch (proto) { case __constant_htons(ETH_P_IP): if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) @@ -2567,6 +2569,11 @@ void __skb_get_rxhash(struct sk_buff *skb) goto done; } + switch (ip_proto) { + default: + break; + } + ports.v32 = 0; poff = proto_ports_offset(ip_proto); if (poff >= 0) { -- cgit v1.1 From c6865cb3cc6f3c2857fa4c6f5fda2945d70b1e84 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 14 Aug 2011 19:46:29 +0000 Subject: rps: Inspect GRE encapsulated packets to get flow hash Crack open GRE packets in __skb_get_rxhash to compute 4-tuple hash on in encapsulated packet. Note that this is used only when the __skb_get_rxhash is taken, in particular only when the device does not compute provide the rxhash (ie. feature is disabled). This was tested by creating a single GRE tunnel between two 16 core AMD machines. 200 netperf TCP_RR streams were ran with 1 byte request and response size. Without patch: 157497 tps, 50/90/99% latencies 1250/1292/1364 usecs With patch: 325896 tps, 50/90/99% latencies 603/848/1169 Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4bee9a9..a8d91a5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2570,6 +2570,28 @@ again: } switch (ip_proto) { + case IPPROTO_GRE: + if (pskb_may_pull(skb, nhoff + 16)) { + u8 *h = skb->data + nhoff; + __be16 flags = *(__be16 *)h; + + /* + * Only look inside GRE if version zero and no + * routing + */ + if (!(flags & (GRE_VERSION|GRE_ROUTING))) { + proto = *(__be16 *)(h + 2); + nhoff += 4; + if (flags & GRE_CSUM) + nhoff += 4; + if (flags & GRE_KEY) + nhoff += 4; + if (flags & GRE_SEQ) + nhoff += 4; + goto again; + } + } + break; default: break; } -- cgit v1.1 From 01789349ee52e4a3faf376f1485303d9723c4f1f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Aug 2011 06:29:00 +0000 Subject: net: introduce IFF_UNICAST_FLT private flag Use IFF_UNICAST_FTL to find out if driver handles unicast address filtering. In case it does not, promisc mode is entered. Patch also fixes following drivers: stmmac, niu: support uc filtering and yet it propagated ndo_set_multicast_list bna, benet, pxa168_eth, ks8851, ks8851_mll, ksz884x : has set ndo_set_rx_mode but do not support uc filtering Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a8d91a5..6eb03fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4522,9 +4522,7 @@ void __dev_set_rx_mode(struct net_device *dev) if (!netif_device_present(dev)) return; - if (ops->ndo_set_rx_mode) - ops->ndo_set_rx_mode(dev); - else { + if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ @@ -4535,10 +4533,12 @@ void __dev_set_rx_mode(struct net_device *dev) __dev_set_promiscuity(dev, -1); dev->uc_promisc = false; } - - if (ops->ndo_set_multicast_list) - ops->ndo_set_multicast_list(dev); } + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); + else if (ops->ndo_set_multicast_list) + ops->ndo_set_multicast_list(dev); } void dev_set_rx_mode(struct net_device *dev) -- cgit v1.1 From b81693d9149c598302e8eb9c20cb20330d922c8e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Aug 2011 06:29:02 +0000 Subject: net: remove ndo_set_multicast_list callback Remove no longer used operation. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6eb03fd..ead0366 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4537,8 +4537,6 @@ void __dev_set_rx_mode(struct net_device *dev) if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); - else if (ops->ndo_set_multicast_list) - ops->ndo_set_multicast_list(dev); } void dev_set_rx_mode(struct net_device *dev) @@ -4888,7 +4886,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return -EOPNOTSUPP; case SIOCADDMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) @@ -4896,7 +4894,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); case SIOCDELMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) -- cgit v1.1 From 1ff1986fc94ee711df3cf19d45f2abf351436a6d Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 18 Aug 2011 22:07:54 -0700 Subject: net: rps: support 802.1Q For the 802.1Q packets, if the NIC doesn't support hw-accel-vlan-rx, RPS won't inspect the internal 4 tuples to generate skb->rxhash, so this kind of traffic can't get any benefit from RPS. This patch adds the support for 802.1Q to RPS. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ead0366..be7ee50 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2529,6 +2529,7 @@ void __skb_get_rxhash(struct sk_buff *skb) int nhoff, hash = 0, poff; const struct ipv6hdr *ip6; const struct iphdr *ip; + const struct vlan_hdr *vlan; u8 ip_proto; u32 addr1, addr2; u16 proto; @@ -2565,6 +2566,13 @@ again: addr2 = (__force u32) ip6->daddr.s6_addr32[3]; nhoff += 40; break; + case __constant_htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff)) + goto done; + vlan = (const struct vlan_hdr *) (skb->data + nhoff); + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + goto again; default: goto done; } -- cgit v1.1 From ae1511bf769cafeae5ab61aaf9947a16a22cbd10 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 18 Aug 2011 23:23:47 -0700 Subject: net: rps: support PPPOE session messages Inspect the payload of PPPOE session messages for the 4 tuples to generate skb->rxhash. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index be7ee50..c2442b4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,6 +134,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2573,6 +2574,13 @@ again: proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); goto again; + case __constant_htons(ETH_P_PPP_SES): + if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff)) + goto done; + proto = *((__be16 *) (skb->data + nhoff + + sizeof(struct pppoe_hdr))); + nhoff += PPPOE_SES_HLEN; + goto again; default: goto done; } -- cgit v1.1 From 0dfe178239453547d4297a4583ee7847948a481b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 22 Aug 2011 12:43:22 -0700 Subject: net: vlan: goto another_round instead of calling __netif_receive_skb Now, when vlan tag on untagged in non-accelerated path is stripped from skb, headers are reset right away. Benefit from that and avoid calling __netif_receive_skb recursivelly and just use another_round. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c2442b4..a4306f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3236,10 +3236,9 @@ ncls: ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_do_receive(&skb)) { - ret = __netif_receive_skb(skb); - goto out; - } else if (unlikely(!skb)) + if (vlan_do_receive(&skb)) + goto another_round; + else if (unlikely(!skb)) goto out; } -- cgit v1.1 From ec5efe7946280d1e84603389a1030ccec0a767ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Aug 2011 10:41:19 +0000 Subject: rps: support IPIP encapsulation Skip IPIP header to get proper layer-4 information. Like GRE tunnels, this only works if rxhash is not already provided by the device itself (ethtool -K ethX rxhash off), to allow kernel compute a software rxhash. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a4306f7..b668a3d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2608,6 +2608,8 @@ again: } } break; + case IPPROTO_IPIP: + goto again; default: break; } -- cgit v1.1 From ea2ab69379a941c6f8884e290fdd28c93936a778 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 22 Aug 2011 23:44:58 +0000 Subject: net: convert core to skb paged frag APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ian Campbell Cc: "David S. Miller" Cc: Eric Dumazet Cc: "Michał Mirosław" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/core/dev.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b668a3d..b2e262e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1949,9 +1949,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (PageHighMem(skb_frag_page(frag))) return 1; + } } if (PCI_DMA_BUS_IS_PHYS) { @@ -1960,7 +1962,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (!pdev) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + dma_addr_t addr = page_to_phys(skb_frag_page(frag)); if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) return 1; } @@ -3474,7 +3477,7 @@ pull: skb_shinfo(skb)->frags[0].size -= grow; if (unlikely(!skb_shinfo(skb)->frags[0].size)) { - put_page(skb_shinfo(skb)->frags[0].page); + skb_frag_unref(skb, 0); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -3538,10 +3541,9 @@ void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_shinfo(skb)->frags[0].page)) { + !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { NAPI_GRO_CB(skb)->frag0 = - page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset; + skb_frag_address(&skb_shinfo(skb)->frags[0]); NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; } } -- cgit v1.1 From 4bc71cb983fd2844e603bf633df2bb53385182d2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 3 Sep 2011 03:34:30 +0000 Subject: net: consolidate and fix ethtool_ops->get_settings calling This patch does several things: - introduces __ethtool_get_settings which is called from ethtool code and from drivers as well. Put ASSERT_RTNL there. - dev_ethtool_get_settings() is replaced by __ethtool_get_settings() - changes calling in drivers so rtnl locking is respected. In iboe_get_rate was previously ->get_settings() called unlocked. This fixes it. Also prb_calc_retire_blk_tmo() in af_packet.c had the same problem. Also fixed by calling __dev_get_by_index() instead of dev_get_by_index() and holding rtnl_lock for both calls. - introduces rtnl_lock in bnx2fc_vport_create() and fcoe_vport_create() so bnx2fc_if_create() and fcoe_if_create() are called locked as they are from other places. - use __ethtool_get_settings() in bonding code Signed-off-by: Jiri Pirko v2->v3: -removed dev_ethtool_get_settings() -added ASSERT_RTNL into __ethtool_get_settings() -prb_calc_retire_blk_tmo - use __dev_get_by_index() and lock around it and __ethtool_get_settings() call v1->v2: add missing export_symbol Reviewed-by: Ben Hutchings [except FCoE bits] Acked-by: Ralf Baechle Signed-off-by: David S. Miller --- net/core/dev.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b2e262e..4b9981c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4566,30 +4566,6 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_ethtool_get_settings - call device's ethtool_ops::get_settings() - * @dev: device - * @cmd: memory area for ethtool_ops::get_settings() result - * - * The cmd arg is initialized properly (cleared and - * ethtool_cmd::cmd field set to ETHTOOL_GSET). - * - * Return device's ethtool_ops::get_settings() result value or - * -EOPNOTSUPP when device doesn't expose - * ethtool_ops::get_settings() operation. - */ -int dev_ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd) -{ - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); -} -EXPORT_SYMBOL(dev_ethtool_get_settings); - -/** * dev_get_flags - get flags reported to userspace * @dev: device * -- cgit v1.1 From 5dd17e08f333cde0fa11000792e33d8d39b5599f Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 20 Sep 2011 22:36:07 +0000 Subject: net: rps: fix the support for PPPOE The upper protocol numbers of PPPOE are different, and should be treated specially. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bf49a47..7f4486e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2556,6 +2557,7 @@ void __skb_get_rxhash(struct sk_buff *skb) again: switch (proto) { case __constant_htons(ETH_P_IP): +ip: if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; @@ -2569,6 +2571,7 @@ again: nhoff += ip->ihl * 4; break; case __constant_htons(ETH_P_IPV6): +ipv6: if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; @@ -2591,7 +2594,14 @@ again: proto = *((__be16 *) (skb->data + nhoff + sizeof(struct pppoe_hdr))); nhoff += PPPOE_SES_HLEN; - goto again; + switch (proto) { + case __constant_htons(PPP_IP): + goto ip; + case __constant_htons(PPP_IPV6): + goto ipv6; + default: + goto done; + } default: goto done; } -- cgit v1.1 From 09994d1b09bd9b0046a4708fa50d2106610a4058 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 3 Oct 2011 04:42:46 +0000 Subject: RPS: Ensure that an expired hardware filter can be re-added later Amir Vadai wrote: > When a stream is paused, and its rule is expired while it is paused, > no new rule will be configured to the HW when traffic resume. [...] > - When stream was resumed, traffic was steered again by RSS, and > because current-cpu was equal to desired-cpu, ndo_rx_flow_steer > wasn't called and no rule was configured to the HW. Fix this by setting the flow's current CPU only in the table for the newly selected RX queue. Reported-and-tested-by: Amir Vadai Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7f4486e..70ecb86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2670,10 +2670,7 @@ static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - u16 tcpu; - - tcpu = rflow->cpu = next_cpu; - if (tcpu != RPS_NO_CPU) { + if (next_cpu != RPS_NO_CPU) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -2701,16 +2698,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; - rflow->cpu = next_cpu; rflow->filter = rc; if (old_rflow->filter == rflow->filter) old_rflow->filter = RPS_NO_FILTER; out: #endif rflow->last_qtail = - per_cpu(softnet_data, tcpu).input_queue_head; + per_cpu(softnet_data, next_cpu).input_queue_head; } + rflow->cpu = next_cpu; return rflow; } -- cgit v1.1 From 2425717b27eb92b175335ca4ff0bb218cbe0cb64 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 10 Oct 2011 09:16:41 +0000 Subject: net: allow vlan traffic to be received under bond The following configuration used to work as I expected. At least we could use the fcoe interfaces to do MPIO and the bond0 iface to do load balancing or failover. ---eth2.228-fcoe | eth2 -----| | |---- bond0 | eth3 -----| | ---eth3.228-fcoe This worked because of a change we added to allow inactive slaves to rx 'exact' matches. This functionality was kept intact with the rx_handler mechanism. However now the vlan interface attached to the active slave never receives traffic because the bonding rx_handler updates the skb->dev and goto's another_round. Previously, the vlan_do_receive() logic was called before the bonding rx_handler. Now by the time vlan_do_receive calls vlan_find_dev() the skb->dev is set to bond0 and it is clear no vlan is attached to this iface. The vlan lookup fails. This patch moves the VLAN check above the rx_handler. A VLAN tagged frame is now routed to the eth2.228-fcoe iface in the above schematic. Untagged frames continue to the bond0 as normal. This case also remains intact, eth2 --> bond0 --> vlan.228 Here the skb is VLAN tagged but the vlan lookup fails on eth2 causing the bonding rx_handler to be called. On the second pass the vlan lookup is on the bond0 iface and completes as expected. Putting a VLAN.228 on both the bond0 and eth2 device will result in eth2.228 receiving the skb. I don't think this is completely unexpected and was the result prior to the rx_handler result. Note, the same setup is also used for other storage traffic that MPIO is used with eg. iSCSI and similar setups can be contrived without storage protocols. Signed-off-by: John Fastabend Acked-by: Jesse Gross Reviewed-by: Jiri Pirko Tested-by: Hans Schillstrom Signed-off-by: David S. Miller --- net/core/dev.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 70ecb86..8b6118a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3231,6 +3231,17 @@ another_round: ncls: #endif + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb)) + goto another_round; + else if (unlikely(!skb)) + goto out; + } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { @@ -3251,17 +3262,6 @@ ncls: } } - if (vlan_tx_tag_present(skb)) { - if (pt_prev) { - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = NULL; - } - if (vlan_do_receive(&skb)) - goto another_round; - else if (unlikely(!skb)) - goto out; - } - /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; -- cgit v1.1 From 9e903e085262ffbf1fc44a17ac06058aca03524a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Oct 2011 21:00:24 +0000 Subject: net: add skb frag size accessors To ease skb->truesize sanitization, its better to be able to localize all references to skb frags size. Define accessors : skb_frag_size() to fetch frag size, and skb_frag_size_{set|add|sub}() to manipulate it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8b6118a..cbb5918 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3489,9 +3489,9 @@ pull: skb->data_len -= grow; skb_shinfo(skb)->frags[0].page_offset += grow; - skb_shinfo(skb)->frags[0].size -= grow; + skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - if (unlikely(!skb_shinfo(skb)->frags[0].size)) { + if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { skb_frag_unref(skb, 0); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, @@ -3559,7 +3559,7 @@ void skb_gro_reset_offset(struct sk_buff *skb) !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(&skb_shinfo(skb)->frags[0]); - NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); } } EXPORT_SYMBOL(skb_gro_reset_offset); -- cgit v1.1 From 850a545bd8a41648445bfc5541afe36ca105b0b8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Oct 2011 22:25:23 +0000 Subject: net: Move rcu_barrier from rollback_registered_many to netdev_run_todo. This patch moves the rcu_barrier from rollback_registered_many (inside the rtnl_lock) into netdev_run_todo (just outside the rtnl_lock). This allows us to gain the full benefit of sychronize_net calling synchronize_rcu_expedited when the rtnl_lock is held. The rcu_barrier in rollback_registered_many was originally a synchronize_net but was promoted to be a rcu_barrier() when it was found that people were unnecessarily hitting the 250ms wait in netdev_wait_allrefs(). Changing the rcu_barrier back to a synchronize_net is therefore safe. Since we only care about waiting for the rcu callbacks before we get to netdev_wait_allrefs() it is also safe to move the wait into netdev_run_todo. This was tested by creating and destroying 1000 tap devices and observing /proc/lock_stat. /proc/lock_stat reports this change reduces the hold times of the rtnl_lock by a factor of 10. There was no observable difference in the amount of time it takes to destroy a network device. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index cbb5918..09aef26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5235,7 +5235,7 @@ static void rollback_registered_many(struct list_head *head) dev = list_first_entry(head, struct net_device, unreg_list); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - rcu_barrier(); + synchronize_net(); list_for_each_entry(dev, head, unreg_list) dev_put(dev); @@ -5748,6 +5748,12 @@ void netdev_run_todo(void) __rtnl_unlock(); + /* Wait for rcu callbacks to finish before attempting to drain + * the device list. This usually avoids a 250ms wait. + */ + if (!list_empty(&list)) + rcu_barrier(); + while (!list_empty(&list)) { struct net_device *dev = list_first_entry(&list, struct net_device, todo_list); -- cgit v1.1 From 4dc360c5e7e155373bffbb3c1f7ea0022dee650c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 19 Oct 2011 17:00:35 -0400 Subject: net: validate HWTSTAMP ioctl parameters This patch adds a sanity check on the values provided by user space for the hardware time stamping configuration. If the values lie outside of the absolute limits, then the ioctl request will be denied. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- net/core/dev.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 09aef26..40d4395 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -136,6 +136,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -1477,6 +1478,57 @@ static inline void net_timestamp_check(struct sk_buff *skb) __net_timestamp(skb); } +static int net_hwtstamp_validate(struct ifreq *ifr) +{ + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags) /* reserved for future extensions */ + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + tx_type_valid = 1; + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + rx_filter_valid = 1; + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; +} + static inline bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { @@ -4921,6 +4973,12 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + /* fall through */ + /* * Unknown or private ioctl */ -- cgit v1.1 From f04565ddf52e401880f8ba51de0dff8ba51c99fd Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 20 Oct 2011 20:45:10 +0000 Subject: dev: use name hash for dev_seq_ops Instead of using the dev->next chain and trying to resync at each call to dev_seq_start, use the name hash, keeping the bucket and the offset in seq->private field. Tests revealed the following results for ifconfig > /dev/null * 1000 interfaces: * 0.114s without patch * 0.089s with patch * 3000 interfaces: * 0.489s without patch * 0.110s with patch * 5000 interfaces: * 1.363s without patch * 0.250s with patch * 128000 interfaces (other setup): * ~100s without patch * ~30s with patch Signed-off-by: Mihai Maruseac Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 40d4395..ad5d702 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4093,6 +4093,60 @@ static int dev_ifconf(struct net *net, char __user *arg) } #ifdef CONFIG_PROC_FS + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS) + +struct dev_iter_state { + struct seq_net_private p; + unsigned int pos; /* bucket << BUCKET_SPACE + offset */ +}; + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq) +{ + struct dev_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_node *p; + struct hlist_head *h; + unsigned int count, bucket, offset; + + bucket = get_bucket(state->pos); + offset = get_offset(state->pos); + h = &net->dev_name_head[bucket]; + count = 0; + hlist_for_each_entry_rcu(dev, p, h, name_hlist) { + if (count++ == offset) { + state->pos = set_bucket_offset(bucket, count); + return dev; + } + } + + return NULL; +} + +static inline struct net_device *dev_from_new_bucket(struct seq_file *seq) +{ + struct dev_iter_state *state = seq->private; + struct net_device *dev; + unsigned int bucket; + + bucket = get_bucket(state->pos); + do { + dev = dev_from_same_bucket(seq); + if (dev) + return dev; + + bucket++; + state->pos = set_bucket_offset(bucket, 0); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + /* * This is invoked by the /proc filesystem handler to display a device * in detail. @@ -4100,33 +4154,33 @@ static int dev_ifconf(struct net *net, char __user *arg) void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct net *net = seq_file_net(seq); - loff_t off; - struct net_device *dev; + struct dev_iter_state *state = seq->private; rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; - off = 1; - for_each_netdev_rcu(net, dev) - if (off++ == *pos) - return dev; + /* check for end of the hash */ + if (state->pos == 0 && *pos > 1) + return NULL; - return NULL; + return dev_from_new_bucket(seq); } void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = v; + struct net_device *dev; + + ++*pos; if (v == SEQ_START_TOKEN) - dev = first_net_device_rcu(seq_file_net(seq)); - else - dev = next_net_device_rcu(dev); + return dev_from_new_bucket(seq); - ++*pos; - return dev; + dev = dev_from_same_bucket(seq); + if (dev) + return dev; + + return dev_from_new_bucket(seq); } void dev_seq_stop(struct seq_file *seq, void *v) @@ -4225,7 +4279,7 @@ static const struct seq_operations dev_seq_ops = { static int dev_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); + sizeof(struct dev_iter_state)); } static const struct file_operations dev_seq_fops = { -- cgit v1.1 From d2237d35748e7f448a9c2d9dc6a85ef637466e24 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Oct 2011 06:24:20 +0000 Subject: rtnetlink: Add missing manual netlink notification in dev_change_net_namespaces Renato Westphal noticed that since commit a2835763e130c343ace5320c20d33c281e7097b7 "rtnetlink: handle rtnl_link netlink notifications manually" was merged we no longer send a netlink message when a networking device is moved from one network namespace to another. Fix this by adding the missing manual notification in dev_change_net_namespaces. Since all network devices that are processed by dev_change_net_namspaces are in the initialized state the complicated tests that guard the manual rtmsg_ifinfo calls in rollback_registered and register_netdevice are unnecessary and we can just perform a plain notification. Cc: stable@kernel.org Tested-by: Renato Westphal Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index b10ff0a..ae5cf2d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6115,6 +6115,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* * Flush the unicast and multicast chains -- cgit v1.1