From a9b3cd7f323b2e57593e7215362a7b02fc933e3a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Mon, 1 Aug 2011 16:19:00 +0000
Subject: rcu: convert uses of rcu_assign_pointer(x, NULL) to RCU_INIT_POINTER

When assigning a NULL value to an RCU protected pointer, no barrier is
needed. rcu_assign_pointer() used to handle that special case, but it
will soon change to no longer do so. Convert all rcu_assign_pointer()
calls with a NULL value.

//smpl
@@
expression P;
@@

- rcu_assign_pointer(P, NULL)
+ RCU_INIT_POINTER(P, NULL)
//

Signed-off-by: Stephen Hemminger
Acked-by: Paul E. McKenney
Signed-off-by: David S. Miller
---
 net/core/dev.c       | 4 ++--
 net/core/fib_rules.c | 2 +-
 net/core/filter.c    | 2 +-
 net/core/net-sysfs.c | 4 ++--
 net/core/netpoll.c   | 4 ++--
 net/core/sock.c      | 4 ++--
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 17d67b5..9428766 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3094,8 +3094,8 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 {
 	ASSERT_RTNL();
-	rcu_assign_pointer(dev->rx_handler, NULL);
-	rcu_assign_pointer(dev->rx_handler_data, NULL);
+	RCU_INIT_POINTER(dev->rx_handler, NULL);
+	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index e7ab0c0..0657b57 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -487,7 +487,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (ops->nr_goto_rules > 0) {
 		list_for_each_entry(tmp, &ops->rules_list, list) {
 			if (rtnl_dereference(tmp->ctarget) == rule) {
-				rcu_assign_pointer(tmp->ctarget, NULL);
+				RCU_INIT_POINTER(tmp->ctarget, NULL);
 				ops->unresolved_rules++;
 			}
 		}

diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..8fcc2d7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -645,7 +645,7 @@ int sk_detach_filter(struct sock *sk)
 	filter = rcu_dereference_protected(sk->sk_filter,
 					   sock_owned_by_user(sk));
 	if (filter) {
-		rcu_assign_pointer(sk->sk_filter, NULL);
+		RCU_INIT_POINTER(sk->sk_filter, NULL);
 		sk_filter_uncharge(sk, filter);
 		ret = 0;
 	}

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1683e5d..b1ab887 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -987,10 +987,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
 	}

 	if (nonempty)
-		rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+		RCU_INIT_POINTER(dev->xps_maps, new_dev_maps);
 	else {
 		kfree(new_dev_maps);
-		rcu_assign_pointer(dev->xps_maps, NULL);
+		RCU_INIT_POINTER(dev->xps_maps, NULL);
 	}

 	if (dev_maps)

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index adf84dd..d676a56 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -760,7 +760,7 @@ int __netpoll_setup(struct netpoll *np)
 	}

 	/* last thing to do is link it to the net device structure */
-	rcu_assign_pointer(ndev->npinfo, npinfo);
+	RCU_INIT_POINTER(ndev->npinfo, npinfo);

 	return 0;

@@ -901,7 +901,7 @@ void __netpoll_cleanup(struct netpoll *np)
 	if (ops->ndo_netpoll_cleanup)
 		ops->ndo_netpoll_cleanup(np->dev);

-	rcu_assign_pointer(np->dev->npinfo, NULL);
+	RCU_INIT_POINTER(np->dev->npinfo, NULL);

 	/* avoid racing with NAPI reading npinfo */
 	synchronize_rcu_bh();

diff --git a/net/core/sock.c b/net/core/sock.c
index bc745d0..9997026 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -387,7 +387,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
-		rcu_assign_pointer(sk->sk_dst_cache, NULL);
+		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
 	}
@@ -1158,7 +1158,7 @@ static void __sk_free(struct sock *sk)
 		       atomic_read(&sk->sk_wmem_alloc) == 0);
 	if (filter) {
 		sk_filter_uncharge(sk, filter);
-		rcu_assign_pointer(sk->sk_filter, NULL);
+		RCU_INIT_POINTER(sk->sk_filter, NULL);
 	}

 	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
--
cgit v1.1
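For reference, the write-side pattern this conversion relies on can be sketched
as follows. This is a minimal illustration with hypothetical names (struct foo,
gp, publish, retract), not code from the series:

/* Publishing a structure needs the barrier inside rcu_assign_pointer()
 * so readers never see the pointer before the pointee is initialised;
 * storing NULL publishes nothing, so no barrier is required. */
struct foo {
	int val;
};
static struct foo __rcu *gp;

static void publish(struct foo *p)
{
	p->val = 42;			/* must be visible before the pointer */
	rcu_assign_pointer(gp, p);	/* barrier orders init vs. publication */
}

static void retract(struct foo *old)
{
	RCU_INIT_POINTER(gp, NULL);	/* NULL store: no barrier needed */
	synchronize_rcu();		/* wait out existing readers */
	kfree(old);
}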
From 9de79c127cccecb11ae6a21ab1499e87aa222880 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 8 Aug 2011 20:56:14 +0000
Subject: net: fix potential neighbour race in dst_ifdown()

Followup of commit f2c31e32b378a (fix NULL dereferences in check_peer_redir()).

We need to make sure the dst neighbour doesn't change in dst_ifdown().

Fix some sparse errors.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dst.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dst.c b/net/core/dst.c
index 14b33baf..d5e2c4c 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -171,7 +171,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 	dst_init_metrics(dst, dst_default_metrics, true);
 	dst->expires = 0UL;
 	dst->path = dst;
-	dst->_neighbour = NULL;
+	RCU_INIT_POINTER(dst->_neighbour, NULL);
 #ifdef CONFIG_XFRM
 	dst->xfrm = NULL;
 #endif
@@ -229,11 +229,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 	smp_rmb();

 again:
-	neigh = dst->_neighbour;
+	neigh = rcu_dereference_protected(dst->_neighbour, 1);
 	child = dst->child;

 	if (neigh) {
-		dst->_neighbour = NULL;
+		RCU_INIT_POINTER(dst->_neighbour, NULL);
 		neigh_release(neigh);
 	}

@@ -360,14 +360,19 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 	if (!unregister) {
 		dst->input = dst->output = dst_discard;
 	} else {
+		struct neighbour *neigh;
+
 		dst->dev = dev_net(dst->dev)->loopback_dev;
 		dev_hold(dst->dev);
 		dev_put(dev);
-		if (dst->_neighbour && dst->_neighbour->dev == dev) {
-			dst->_neighbour->dev = dst->dev;
+		rcu_read_lock();
+		neigh = dst_get_neighbour(dst);
+		if (neigh && neigh->dev == dev) {
+			neigh->dev = dst->dev;
 			dev_hold(dst->dev);
 			dev_put(dev);
 		}
+		rcu_read_unlock();
 	}
 }
--
cgit v1.1

From e7c379d2a0dcb8c30cb580184a0df11805464703 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Wed, 10 Aug 2011 06:09:45 +0000
Subject: rtnetlink: remove initialization of dev->real_num_tx_queues

dev->real_num_tx_queues is correctly set already in alloc_netdev_mqs.

Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 net/core/rtnetlink.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 99d9e95..39f8dd6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1604,7 +1604,6 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
 	dev_net_set(dev, net);
 	dev->rtnl_link_ops = ops;
 	dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
-	dev->real_num_tx_queues = real_num_queues;

 	if (tb[IFLA_MTU])
 		dev->mtu = nla_get_u32(tb[IFLA_MTU]);
--
cgit v1.1
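The rcu_dereference_protected(ptr, 1) form used in dst_destroy() above silences
sparse when the caller knows no concurrent update is possible (teardown, last
reference). Where a lock provides that guarantee, the condition can name it so
lockdep verifies the claim. A minimal sketch with hypothetical names (struct
item, cur, replace_item):

struct item {
	int payload;
};
static struct item __rcu *cur;
static DEFINE_SPINLOCK(cur_lock);

static void replace_item(struct item *newi)
{
	struct item *old;

	spin_lock(&cur_lock);
	/* lockdep_is_held() documents *and* checks the protection;
	 * a bare "1" would mean "trust me, nothing else can run". */
	old = rcu_dereference_protected(cur, lockdep_is_held(&cur_lock));
	rcu_assign_pointer(cur, newi);
	spin_unlock(&cur_lock);

	if (old) {
		synchronize_rcu();	/* readers may still hold "old" */
		kfree(old);
	}
}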
From cd28ca0a3dd17c68d24b839602a0e6268ad28b5d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 9 Aug 2011 08:15:58 +0000
Subject: neigh: reduce arp latency

Remove the artificial HZ latency on arp resolution. Instead of firing a
timer in one jiffy (up to 10 ms if HZ=100), let's send the ARP message
immediately.

Before patch :

# arp -d 192.168.20.108 ; ping -c 3 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 56(84) bytes of data.
64 bytes from 192.168.20.108: icmp_seq=1 ttl=64 time=9.91 ms
64 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.065 ms
64 bytes from 192.168.20.108: icmp_seq=3 ttl=64 time=0.061 ms

After patch :

$ arp -d 192.168.20.108 ; ping -c 3 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 56(84) bytes of data.
64 bytes from 192.168.20.108: icmp_seq=1 ttl=64 time=0.152 ms
64 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.064 ms
64 bytes from 192.168.20.108: icmp_seq=3 ttl=64 time=0.074 ms

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/neighbour.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8fab9b0..4002261 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -844,6 +844,19 @@ static void neigh_invalidate(struct neighbour *neigh)
 	skb_queue_purge(&neigh->arp_queue);
 }

+static void neigh_probe(struct neighbour *neigh)
+	__releases(neigh->lock)
+{
+	struct sk_buff *skb = skb_peek(&neigh->arp_queue);
+	/* keep skb alive even if arp_queue overflows */
+	if (skb)
+		skb = skb_copy(skb, GFP_ATOMIC);
+	write_unlock(&neigh->lock);
+	neigh->ops->solicit(neigh, skb);
+	atomic_inc(&neigh->probes);
+	kfree_skb(skb);
+}
+
 /* Called when a timer expires for a neighbour entry. */

 static void neigh_timer_handler(unsigned long arg)
@@ -920,14 +933,7 @@ static void neigh_timer_handler(unsigned long arg)
 		neigh_hold(neigh);
 	}
 	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
-		struct sk_buff *skb = skb_peek(&neigh->arp_queue);
-		/* keep skb alive even if arp_queue overflows */
-		if (skb)
-			skb = skb_copy(skb, GFP_ATOMIC);
-		write_unlock(&neigh->lock);
-		neigh->ops->solicit(neigh, skb);
-		atomic_inc(&neigh->probes);
-		kfree_skb(skb);
+		neigh_probe(neigh);
 	} else {
 out:
 		write_unlock(&neigh->lock);
@@ -942,7 +948,7 @@ out:
 int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 {
 	int rc;
-	unsigned long now;
+	bool immediate_probe = false;

 	write_lock_bh(&neigh->lock);

@@ -950,14 +956,16 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
 		goto out_unlock_bh;

-	now = jiffies;
-
 	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
 		if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
+			unsigned long next, now = jiffies;
+
 			atomic_set(&neigh->probes, neigh->parms->ucast_probes);
 			neigh->nud_state = NUD_INCOMPLETE;
-			neigh->updated = jiffies;
-			neigh_add_timer(neigh, now + 1);
+			neigh->updated = now;
+			next = now + max(neigh->parms->retrans_time, HZ/2);
+			neigh_add_timer(neigh, next);
+			immediate_probe = true;
 		} else {
 			neigh->nud_state = NUD_FAILED;
 			neigh->updated = jiffies;
@@ -989,7 +997,11 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 		rc = 1;
 	}
 out_unlock_bh:
-	write_unlock_bh(&neigh->lock);
+	if (immediate_probe)
+		neigh_probe(neigh);
+	else
+		write_unlock(&neigh->lock);
+	local_bh_enable();
 	return rc;
 }
 EXPORT_SYMBOL(__neigh_event_send);
--
cgit v1.1
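A sketch of the timing change, as a hypothetical helper (not in the patch),
to make the arithmetic explicit:

/* Old behaviour: arm the timer at now + 1 jiffy and send the first ARP
 * request only when it fires -- up to one clock tick (10 ms at HZ=100)
 * of added latency. New behaviour: probe immediately, so the timer only
 * needs to cover the *retransmit*, clamped to at least half a second. */
static unsigned long first_retransmit_deadline(const struct neighbour *neigh)
{
	unsigned long now = jiffies;

	/* old code was effectively: return now + 1; */
	return now + max(neigh->parms->retrans_time, HZ / 2);
}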
From 33d480ce6d43326e2541fd79b3548858a174ec3c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 11 Aug 2011 19:30:52 +0000
Subject: net: cleanup some rcu_dereference_raw

The RCU API has been completed, and rcu_access_pointer() or
rcu_dereference_protected() are better fits than the generic
rcu_dereference_raw().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c       | 10 +++++-----
 net/core/fib_rules.c | 2 +-
 net/core/net-sysfs.c | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 9428766..d22ffd7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2673,13 +2673,13 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
 		if (map->len == 1 &&
-		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
+		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
 			tcpu = map->cpus[0];
 			if (cpu_online(tcpu))
 				cpu = tcpu;
 			goto done;
 		}
-	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
 		goto done;
 	}

@@ -5727,8 +5727,8 @@ void netdev_run_todo(void)
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
-		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
-		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
+		WARN_ON(rcu_access_pointer(dev->ip_ptr));
+		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);

 		if (dev->destructor)
@@ -5932,7 +5932,7 @@ void free_netdev(struct net_device *dev)
 	kfree(dev->_rx);
 #endif

-	kfree(rcu_dereference_raw(dev->ingress_queue));
+	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

 	/* Flush device addresses */
 	dev_addr_flush(dev);

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 0657b57..67c5c28 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -545,7 +545,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	frh->flags = rule->flags;

 	if (rule->action == FR_ACT_GOTO &&
-	    rcu_dereference_raw(rule->ctarget) == NULL)
+	    rcu_access_pointer(rule->ctarget) == NULL)
 		frh->flags |= FIB_RULE_UNRESOLVED;

 	if (rule->iifname[0]) {

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b1ab887..56e42ab 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -712,13 +712,13 @@ static void rx_queue_release(struct kobject *kobj)
 	struct rps_dev_flow_table *flow_table;

-	map = rcu_dereference_raw(queue->rps_map);
+	map = rcu_dereference_protected(queue->rps_map, 1);
 	if (map) {
 		RCU_INIT_POINTER(queue->rps_map, NULL);
 		kfree_rcu(map, rcu);
 	}

-	flow_table = rcu_dereference_raw(queue->rps_flow_table);
+	flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
 	if (flow_table) {
 		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
 		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
--
cgit v1.1
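A rough rule of thumb for the three accessors used above, sketched with a
hypothetical pointer (struct cfg, cfgp are illustrative names only):

struct cfg {
	unsigned int flags;
};
static struct cfg __rcu *cfgp;

static void accessor_examples(void)
{
	struct cfg *c;

	/* 1) Only testing the pointer, never dereferencing it:
	 *    rcu_access_pointer() needs no RCU read section. */
	if (!rcu_access_pointer(cfgp))
		return;

	/* 2) Reader path: must hold rcu_read_lock(). */
	rcu_read_lock();
	c = rcu_dereference(cfgp);
	if (c)
		pr_info("flags=%u\n", c->flags);
	rcu_read_unlock();

	/* 3) Teardown with updates excluded by construction: the "1"
	 *    condition asserts that nothing can race with us. */
	c = rcu_dereference_protected(cfgp, 1);
	kfree(c);
}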
From 792df22cd0499b4e662d4618b0008fdcfef8b04e Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Sun, 14 Aug 2011 19:45:04 +0000
Subject: rps: Some minor cleanup in get_rps_cpus

Use some variables for clarity and extensibility.

Signed-off-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/core/dev.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d22ffd7..6578d94 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2528,15 +2528,17 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 	const struct ipv6hdr *ip6;
 	const struct iphdr *ip;
 	u8 ip_proto;
-	u32 addr1, addr2, ihl;
+	u32 addr1, addr2;
+	u16 proto;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;

 	nhoff = skb_network_offset(skb);
+	proto = skb->protocol;

-	switch (skb->protocol) {
+	switch (proto) {
 	case __constant_htons(ETH_P_IP):
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
@@ -2548,7 +2550,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
-		ihl = ip->ihl;
+		nhoff += ip->ihl * 4;
 		break;
 	case __constant_htons(ETH_P_IPV6):
 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
@@ -2558,7 +2560,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
-		ihl = (40 >> 2);
+		nhoff += 40;
 		break;
 	default:
 		goto done;
@@ -2567,7 +2569,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 	ports.v32 = 0;
 	poff = proto_ports_offset(ip_proto);
 	if (poff >= 0) {
-		nhoff += ihl * 4 + poff;
+		nhoff += poff;
 		if (pskb_may_pull(skb, nhoff + 4)) {
 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
--
cgit v1.1
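The function this cleanup touches boils down to hashing the two addresses and
the sorted port pair. A condensed sketch of that core, assuming a
jhash_3words()-style mixer and a hashrnd seed as in the kernel of this era
(flow_hash and its parameters are illustrative, not the exact kernel code):

static u32 flow_hash(u32 addr1, u32 addr2, u16 sport, u16 dport, u32 hashrnd)
{
	union {
		u32 v32;
		u16 v16[2];
	} ports = { .v16 = { sport, dport } };

	/* sort addresses and ports so both directions of a flow
	 * produce the same hash value */
	if (addr2 < addr1)
		swap(addr1, addr2);
	if (ports.v16[1] < ports.v16[0])
		swap(ports.v16[0], ports.v16[1]);

	return jhash_3words(addr1, addr2, ports.v32, hashrnd);
}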
From bdeab991918663aed38757904219e8398214334c Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Sun, 14 Aug 2011 19:45:55 +0000
Subject: rps: Add flag to skb to indicate rxhash is based on L4 tuple

The l4_rxhash flag was added to the skb structure to indicate that the
rxhash value was computed over the 4 tuple for the packet, which includes
the port information in the encapsulated transport packet. This is used
by the stack to preserve the rxhash value in __skb_rx_tunnel.

Signed-off-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/core/dev.c    | 10 ++++++----
 net/core/skbuff.c | 1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6578d94..e485cb3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2519,10 +2519,11 @@ static inline void ____napi_schedule(struct softnet_data *sd,

 /*
  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Returns a non-zero hash number on success
- * and 0 on failure.
+ * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
  */
-__u32 __skb_get_rxhash(struct sk_buff *skb)
+void __skb_get_rxhash(struct sk_buff *skb)
 {
 	int nhoff, hash = 0, poff;
 	const struct ipv6hdr *ip6;
@@ -2574,6 +2575,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
+			skb->l4_rxhash = 1;
 		}
 	}

@@ -2586,7 +2588,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb)
 		hash = 1;

 done:
-	return hash;
+	skb->rxhash = hash;
 }
 EXPORT_SYMBOL(__skb_get_rxhash);

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 27002df..edb66f3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -529,6 +529,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->mac_header = old->mac_header;
 	skb_dst_copy(new, old);
 	new->rxhash = old->rxhash;
+	new->l4_rxhash = old->l4_rxhash;
 #ifdef CONFIG_XFRM
 	new->sp = secpath_get(old->sp);
 #endif
--
cgit v1.1

From e971b7225bcb1f318811ef04628c441497372999 Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Sun, 14 Aug 2011 19:46:12 +0000
Subject: rps: Infrastructure in __skb_get_rxhash for deep inspection

Basics for looking for ports in encapsulated packets in tunnels.

Signed-off-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/core/dev.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index e485cb3..4bee9a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -133,6 +133,7 @@
 #include
 #include
 #include
+#include

 #include "net-sysfs.h"

@@ -2540,6 +2540,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 	nhoff = skb_network_offset(skb);
 	proto = skb->protocol;

+again:
 	switch (proto) {
 	case __constant_htons(ETH_P_IP):
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
@@ -2569,6 +2569,11 @@ void __skb_get_rxhash(struct sk_buff *skb)
 		goto done;
 	}

+	switch (ip_proto) {
+	default:
+		break;
+	}
+
 	ports.v32 = 0;
 	poff = proto_ports_offset(ip_proto);
 	if (poff >= 0) {
--
cgit v1.1
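Changing __skb_get_rxhash() to return void works because callers go through a
lazy wrapper of roughly this shape (reconstructed from memory of this era's
include/linux/skbuff.h; details may differ):

static inline __u32 skb_get_rxhash(struct sk_buff *skb)
{
	/* compute on first use; fills skb->rxhash and, when the ports
	 * were reachable, skb->l4_rxhash */
	if (!skb->rxhash)
		__skb_get_rxhash(skb);

	return skb->rxhash;
}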
From c6865cb3cc6f3c2857fa4c6f5fda2945d70b1e84 Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Sun, 14 Aug 2011 19:46:29 +0000
Subject: rps: Inspect GRE encapsulated packets to get flow hash

Crack open GRE packets in __skb_get_rxhash to compute the 4-tuple hash
on the encapsulated packet. Note that this path is taken only when
__skb_get_rxhash is used, in particular only when the device does not
provide the rxhash (i.e. the feature is disabled).

This was tested by creating a single GRE tunnel between two 16 core AMD
machines. 200 netperf TCP_RR streams were run with 1 byte request and
response size.

Without patch: 157497 tps, 50/90/99% latencies 1250/1292/1364 usecs
With patch:    325896 tps, 50/90/99% latencies 603/848/1169 usecs

Signed-off-by: Tom Herbert
Signed-off-by: David S. Miller
---
 net/core/dev.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4bee9a9..a8d91a5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2570,6 +2570,28 @@ again:
 	}

 	switch (ip_proto) {
+	case IPPROTO_GRE:
+		if (pskb_may_pull(skb, nhoff + 16)) {
+			u8 *h = skb->data + nhoff;
+			__be16 flags = *(__be16 *)h;
+
+			/*
+			 * Only look inside GRE if version zero and no
+			 * routing
+			 */
+			if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
+				proto = *(__be16 *)(h + 2);
+				nhoff += 4;
+				if (flags & GRE_CSUM)
+					nhoff += 4;
+				if (flags & GRE_KEY)
+					nhoff += 4;
+				if (flags & GRE_SEQ)
+					nhoff += 4;
+				goto again;
+			}
+		}
+		break;
 	default:
 		break;
 	}
--
cgit v1.1
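Why the offsets in the hunk above add up, as a sketch of the version-0 GRE
header layout (the struct and its name are illustrative, not a kernel type):

/* A version-0 GRE header is 4 bytes of flags + EtherType, followed by
 * optional 4-byte fields announced by the flag bits, in this order:
 * checksum+reserved (GRE_CSUM), key (GRE_KEY), sequence (GRE_SEQ).
 * Worst case is 4 + 3*4 = 16 bytes, hence pskb_may_pull(skb, nhoff + 16). */
struct gre_v0_hdr_sketch {
	__be16 flags;		/* GRE_CSUM / GRE_KEY / GRE_SEQ / version */
	__be16 protocol;	/* EtherType of the encapsulated packet,
				 * read via *(__be16 *)(h + 2) above */
	/* optional fields follow here when the flags say so */
};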
From 01789349ee52e4a3faf376f1485303d9723c4f1f Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Tue, 16 Aug 2011 06:29:00 +0000
Subject: net: introduce IFF_UNICAST_FLT private flag

Use IFF_UNICAST_FLT to find out if a driver handles unicast address
filtering. In case it does not, promisc mode is entered.

Patch also fixes the following drivers:
- stmmac, niu: support uc filtering and yet propagated
  ndo_set_multicast_list
- bna, benet, pxa168_eth, ks8851, ks8851_mll, ksz884x: have set
  ndo_set_rx_mode but do not support uc filtering

Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 net/core/dev.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index a8d91a5..6eb03fd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4522,9 +4522,7 @@ void __dev_set_rx_mode(struct net_device *dev)
 	if (!netif_device_present(dev))
 		return;

-	if (ops->ndo_set_rx_mode)
-		ops->ndo_set_rx_mode(dev);
-	else {
+	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 		/* Unicast addresses changes may only happen under the rtnl,
 		 * therefore calling __dev_set_promiscuity here is safe.
 		 */
@@ -4535,10 +4533,12 @@ void __dev_set_rx_mode(struct net_device *dev)
 			__dev_set_promiscuity(dev, -1);
 			dev->uc_promisc = false;
 		}
-
-		if (ops->ndo_set_multicast_list)
-			ops->ndo_set_multicast_list(dev);
 	}
+
+	if (ops->ndo_set_rx_mode)
+		ops->ndo_set_rx_mode(dev);
+	else if (ops->ndo_set_multicast_list)
+		ops->ndo_set_multicast_list(dev);
 }

 void dev_set_rx_mode(struct net_device *dev)
--
cgit v1.1
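From the driver side, opting in looks roughly like the following sketch
(foo_* names and the adapter type are hypothetical):

static void foo_set_rx_mode(struct net_device *dev)
{
	/* walk dev->uc and dev->mc and program the hardware's
	 * unicast/multicast filter tables accordingly */
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_set_rx_mode = foo_set_rx_mode,
	/* ... other ops ... */
};

static void foo_setup(struct net_device *dev)
{
	dev->netdev_ops = &foo_netdev_ops;
	/* without this flag, the core falls back to promiscuous mode
	 * whenever secondary unicast addresses are configured */
	dev->priv_flags |= IFF_UNICAST_FLT;
}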
From b81693d9149c598302e8eb9c20cb20330d922c8e Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Tue, 16 Aug 2011 06:29:02 +0000
Subject: net: remove ndo_set_multicast_list callback

Remove no longer used operation.

Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 net/core/dev.c            | 6 ++----
 net/core/dev_addr_lists.c | 4 ++--
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6eb03fd..ead0366 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4537,8 +4537,6 @@ void __dev_set_rx_mode(struct net_device *dev)

 	if (ops->ndo_set_rx_mode)
 		ops->ndo_set_rx_mode(dev);
-	else if (ops->ndo_set_multicast_list)
-		ops->ndo_set_multicast_list(dev);
 }

 void dev_set_rx_mode(struct net_device *dev)
@@ -4888,7 +4886,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		return -EOPNOTSUPP;

 	case SIOCADDMULTI:
-		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+		if (!ops->ndo_set_rx_mode ||
 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 			return -EINVAL;
 		if (!netif_device_present(dev))
@@ -4896,7 +4894,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);

 	case SIOCDELMULTI:
-		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+		if (!ops->ndo_set_rx_mode ||
 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 			return -EINVAL;
 		if (!netif_device_present(dev))

diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e2e6693..283d1b8 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -591,8 +591,8 @@ EXPORT_SYMBOL(dev_mc_del_global);
 *	addresses that have no users left. The source device must be
 *	locked by netif_tx_lock_bh.
 *
- *	This function is intended to be called from the dev->set_multicast_list
- *	or dev->set_rx_mode function of layered software devices.
+ *	This function is intended to be called from the ndo_set_rx_mode
+ *	function of layered software devices.
 */
 int dev_mc_sync(struct net_device *to, struct net_device *from)
 {
--
cgit v1.1

From 1ff1986fc94ee711df3cf19d45f2abf351436a6d Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Thu, 18 Aug 2011 22:07:54 -0700
Subject: net: rps: support 802.1Q

For 802.1Q packets, if the NIC doesn't support hw-accel-vlan-rx, RPS
won't inspect the internal 4 tuples to generate skb->rxhash, so this
kind of traffic can't get any benefit from RPS. This patch adds support
for 802.1Q to RPS.

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ead0366..be7ee50 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2529,6 +2529,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 	int nhoff, hash = 0, poff;
 	const struct ipv6hdr *ip6;
 	const struct iphdr *ip;
+	const struct vlan_hdr *vlan;
 	u8 ip_proto;
 	u32 addr1, addr2;
 	u16 proto;
@@ -2565,6 +2566,13 @@ again:
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
 		nhoff += 40;
 		break;
+	case __constant_htons(ETH_P_8021Q):
+		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
+			goto done;
+		vlan = (const struct vlan_hdr *) (skb->data + nhoff);
+		proto = vlan->h_vlan_encapsulated_proto;
+		nhoff += sizeof(*vlan);
+		goto again;
 	default:
 		goto done;
 	}
--
cgit v1.1

From ae1511bf769cafeae5ab61aaf9947a16a22cbd10 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Thu, 18 Aug 2011 23:23:47 -0700
Subject: net: rps: support PPPOE session messages

Inspect the payload of PPPOE session messages for the 4 tuples to
generate skb->rxhash.

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index be7ee50..c2442b4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -134,6 +134,7 @@
 #include
 #include
 #include
+#include

 #include "net-sysfs.h"

@@ -2573,6 +2574,13 @@ again:
 		proto = vlan->h_vlan_encapsulated_proto;
 		nhoff += sizeof(*vlan);
 		goto again;
+	case __constant_htons(ETH_P_PPP_SES):
+		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
+			goto done;
+		proto = *((__be16 *) (skb->data + nhoff +
+				      sizeof(struct pppoe_hdr)));
+		nhoff += PPPOE_SES_HLEN;
+		goto again;
 	default:
 		goto done;
 	}
--
cgit v1.1

From 6461be3a54f802e00d5dcba3537271f92a90eaf3 Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Fri, 19 Aug 2011 04:44:18 +0000
Subject: net: Preserve ooo_okay when copying skb header

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index edb66f3..e27334e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -529,6 +529,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->mac_header = old->mac_header;
 	skb_dst_copy(new, old);
 	new->rxhash = old->rxhash;
+	new->ooo_okay = old->ooo_okay;
 	new->l4_rxhash = old->l4_rxhash;
 #ifdef CONFIG_XFRM
 	new->sp = secpath_get(old->sp);
--
cgit v1.1

From 0dfe178239453547d4297a4583ee7847948a481b Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Mon, 22 Aug 2011 12:43:22 -0700
Subject: net: vlan: goto another_round instead of calling __netif_receive_skb

Now, when the vlan tag is stripped from the skb in the non-accelerated
path, headers are reset right away. Benefit from that and avoid calling
__netif_receive_skb recursively; just use another_round.

Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 net/core/dev.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index c2442b4..a4306f7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3236,10 +3236,9 @@ ncls:
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
-		if (vlan_do_receive(&skb)) {
-			ret = __netif_receive_skb(skb);
-			goto out;
-		} else if (unlikely(!skb))
+		if (vlan_do_receive(&skb))
+			goto another_round;
+		else if (unlikely(!skb))
 			goto out;
 	}
--
cgit v1.1

From ec5efe7946280d1e84603389a1030ccec0a767ae Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 24 Aug 2011 10:41:19 +0000
Subject: rps: support IPIP encapsulation

Skip the IPIP header to get proper layer-4 information. Like GRE
tunnels, this only works if rxhash is not already provided by the
device itself (ethtool -K ethX rxhash off), to let the kernel compute
a software rxhash.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index a4306f7..b668a3d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2608,6 +2608,8 @@ again:
 			}
 		}
 		break;
+	case IPPROTO_IPIP:
+		goto again;
 	default:
 		break;
 	}
--
cgit v1.1
From ea2ab69379a941c6f8884e290fdd28c93936a778 Mon Sep 17 00:00:00 2001
From: Ian Campbell
Date: Mon, 22 Aug 2011 23:44:58 +0000
Subject: net: convert core to skb paged frag APIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ian Campbell
Cc: "David S. Miller"
Cc: Eric Dumazet
Cc: "Michał Mirosław"
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller
---
 net/core/datagram.c | 8 ++++----
 net/core/dev.c      | 16 +++++++++-------
 net/core/kmap_skb.h | 2 +-
 net/core/pktgen.c   | 3 +--
 net/core/skbuff.c   | 29 +++++++++++++++--------------
 net/core/sock.c     | 12 +++++-------
 net/core/user_dma.c | 2 +-
 7 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 18ac112..6449bed 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -332,7 +332,7 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
 			int err;
 			u8 *vaddr;
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			struct page *page = frag->page;
+			struct page *page = skb_frag_page(frag);

 			if (copy > len)
 				copy = len;
@@ -418,7 +418,7 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
 			int err;
 			u8 *vaddr;
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			struct page *page = frag->page;
+			struct page *page = skb_frag_page(frag);

 			if (copy > len)
 				copy = len;
@@ -508,7 +508,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 			int err;
 			u8 *vaddr;
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			struct page *page = frag->page;
+			struct page *page = skb_frag_page(frag);

 			if (copy > len)
 				copy = len;
@@ -594,7 +594,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 			int err = 0;
 			u8 *vaddr;
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			struct page *page = frag->page;
+			struct page *page = skb_frag_page(frag);

 			if (copy > len)
 				copy = len;

diff --git a/net/core/dev.c b/net/core/dev.c
index b668a3d..b2e262e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1949,9 +1949,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #ifdef CONFIG_HIGHMEM
 	int i;
 	if (!(dev->features & NETIF_F_HIGHDMA)) {
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			if (PageHighMem(skb_frag_page(frag)))
 				return 1;
+		}
 	}

 	if (PCI_DMA_BUS_IS_PHYS) {
@@ -1960,7 +1962,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 		if (!pdev)
 			return 0;
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
 				return 1;
 		}
@@ -3474,7 +3477,7 @@ pull:
 		skb_shinfo(skb)->frags[0].size -= grow;

 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
-			put_page(skb_shinfo(skb)->frags[0].page);
+			skb_frag_unref(skb, 0);
 			memmove(skb_shinfo(skb)->frags,
 				skb_shinfo(skb)->frags + 1,
 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
@@ -3538,10 +3541,9 @@ void skb_gro_reset_offset(struct sk_buff *skb)
 	NAPI_GRO_CB(skb)->frag0_len = 0;

 	if (skb->mac_header == skb->tail &&
-	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
+	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
 		NAPI_GRO_CB(skb)->frag0 =
-			page_address(skb_shinfo(skb)->frags[0].page) +
-			skb_shinfo(skb)->frags[0].page_offset;
+			skb_frag_address(&skb_shinfo(skb)->frags[0]);
 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
 	}
 }

diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h
index 283c2b99..81e1ed7 100644
--- a/net/core/kmap_skb.h
+++ b/net/core/kmap_skb.h
@@ -7,7 +7,7 @@ static inline void *kmap_skb_frag(const skb_frag_t *frag)
 	local_bh_disable();
 #endif

-	return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
+	return kmap_atomic(skb_frag_page(frag), KM_SKB_DATA_SOFTIRQ);
 }

 static inline void kunmap_skb_frag(void *vaddr)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index e35a6fb..796044a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2602,8 +2602,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
 				if (!pkt_dev->page)
 					break;
 			}
-			skb_shinfo(skb)->frags[i].page = pkt_dev->page;
-			get_page(pkt_dev->page);
+			skb_frag_set_page(skb, i, pkt_dev->page);
 			skb_shinfo(skb)->frags[i].page_offset = 0;
 			/*last fragment, fill rest of data*/
 			if (i == (frags - 1))

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e27334e..296afd0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -326,7 +326,7 @@ static void skb_release_data(struct sk_buff *skb)
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
 			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-				put_page(skb_shinfo(skb)->frags[i].page);
+				skb_frag_unref(skb, i);
 		}

 		/*
@@ -809,7 +809,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
-			get_page(skb_shinfo(n)->frags[i].page);
+			skb_frag_ref(skb, i);
 		}
 		skb_shinfo(n)->nr_frags = i;
 	}
@@ -901,7 +901,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 	}
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		get_page(skb_shinfo(skb)->frags[i].page);
+		skb_frag_ref(skb, i);

 	if (skb_has_frag_list(skb))
 		skb_clone_fraglist(skb);
@@ -1181,7 +1181,7 @@ drop_pages:
 		skb_shinfo(skb)->nr_frags = i;

 		for (; i < nfrags; i++)
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_frag_unref(skb, i);

 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
@@ -1350,7 +1350,7 @@ pull_pages:
 	k = 0;
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		if (skb_shinfo(skb)->frags[i].size <= eat) {
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_frag_unref(skb, i);
 			eat -= skb_shinfo(skb)->frags[i].size;
 		} else {
 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -1609,7 +1609,8 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

-		if (__splice_segment(f->page, f->page_offset, f->size,
+		if (__splice_segment(skb_frag_page(f),
+				     f->page_offset, f->size,
 				     offset, len, skb, spd, 0, sk, pipe))
 			return 1;
 	}
@@ -2154,7 +2155,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 			 *    where splitting is expensive.
 			 * 2. Split is accurately. We make this.
 			 */
-			get_page(skb_shinfo(skb)->frags[i].page);
+			skb_frag_ref(skb, i);
 			skb_shinfo(skb1)->frags[0].page_offset += len - pos;
 			skb_shinfo(skb1)->frags[0].size -= len - pos;
 			skb_shinfo(skb)->frags[i].size = len - pos;
@@ -2229,7 +2230,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 	 * commit all, so that we don't have to undo partial changes
 	 */
 	if (!to ||
-	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+			      fragfrom->page_offset)) {
 		merge = -1;
 	} else {
 		merge = to - 1;
@@ -2276,7 +2278,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 			to++;

 		} else {
-			get_page(fragfrom->page);
+			__skb_frag_ref(fragfrom);
 			fragto->page = fragfrom->page;
 			fragto->page_offset = fragfrom->page_offset;
 			fragto->size = todo;
@@ -2298,7 +2300,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 		fragto = &skb_shinfo(tgt)->frags[merge];

 		fragto->size += fragfrom->size;
-		put_page(fragfrom->page);
+		__skb_frag_unref(fragfrom);
 	}

 	/* Reposition in the original skb */
@@ -2543,8 +2545,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 		left = PAGE_SIZE - frag->page_offset;
 		copy = (length > left)? left : length;

-		ret = getfrag(from, (page_address(frag->page) +
-			    frag->page_offset + frag->size),
+		ret = getfrag(from, skb_frag_address(frag) + frag->size,
 			      offset, copy, 0, skb);
 		if (ret < 0)
 			return -EFAULT;
@@ -2696,7 +2697,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)

 		while (pos < offset + len && i < nfrags) {
 			*frag = skb_shinfo(skb)->frags[i];
-			get_page(frag->page);
+			__skb_frag_ref(frag);
 			size = frag->size;

 			if (pos < offset) {
@@ -2919,7 +2920,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)

 			if (copy > len)
 				copy = len;
-			sg_set_page(&sg[elt], frag->page, copy,
+			sg_set_page(&sg[elt], skb_frag_page(frag), copy,
 				    frag->page_offset+offset-start);
 			elt++;
 			if (!(len -= copy))

diff --git a/net/core/sock.c b/net/core/sock.c
index 9997026..b29ab61 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1533,7 +1533,6 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 		skb_shinfo(skb)->nr_frags = npages;
 		for (i = 0; i < npages; i++) {
 			struct page *page;
-			skb_frag_t *frag;

 			page = alloc_pages(sk->sk_allocation, 0);
 			if (!page) {
@@ -1543,12 +1542,11 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 				goto failure;
 			}

-			frag = &skb_shinfo(skb)->frags[i];
-			frag->page = page;
-			frag->page_offset = 0;
-			frag->size = (data_len >= PAGE_SIZE ?
-				      PAGE_SIZE :
-				      data_len);
+			__skb_fill_page_desc(skb, i,
+					     page, 0,
+					     (data_len >= PAGE_SIZE ?
+					      PAGE_SIZE :
+					      data_len));
 			data_len -= PAGE_SIZE;
 		}

diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 25d717e..34e9664 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -78,7 +78,7 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
 		copy = end - offset;
 		if (copy > 0) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			struct page *page = frag->page;
+			struct page *page = skb_frag_page(frag);

 			if (copy > len)
 				copy = len;
--
cgit v1.1
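The conversion works because the new helpers wrap every direct touch of
frag->page. Sketches of their contracts (the _sketch suffix marks these as
illustrations; see include/linux/skbuff.h for the real definitions):

static inline struct page *skb_frag_page_sketch(const skb_frag_t *frag)
{
	return frag->page;	/* indirection point for later layout changes */
}

static inline void *skb_frag_address_sketch(const skb_frag_t *frag)
{
	/* lowmem pages only; highmem callers keep using kmap_skb_frag() */
	return page_address(skb_frag_page_sketch(frag)) + frag->page_offset;
}

static inline void skb_frag_ref_sketch(struct sk_buff *skb, int f)
{
	get_page(skb_frag_page_sketch(&skb_shinfo(skb)->frags[f]));
}

static inline void skb_frag_unref_sketch(struct sk_buff *skb, int f)
{
	put_page(skb_frag_page_sketch(&skb_shinfo(skb)->frags[f]));
}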
From c37e0c993055d8c4fd82202331d06e6ef9bfec4b Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 30 Aug 2011 23:31:58 +0000
Subject: net: linkwatch: allow vlans to get carrier changes faster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a time lag in IFF_RUNNING flag consistency between vlan and
real devices when the real device has a problem, such as a broken link
or cable. This degrades availability, e.g. failover in HA systems using
vlan is delayed, because detection of the problem at the real device is
delayed.

We can avoid the linkwatch delay (~1 sec) for devices linked to other
ones, since the delay has already been applied for the real device.

Based on a previous patch from Mitsuo Hayasaka

Reported-by: Mitsuo Hayasaka
Signed-off-by: Eric Dumazet
Cc: Herbert Xu
Cc: Patrick McHardy
Cc: "Michał Mirosław"
Cc: Tom Herbert
Cc: Stephen Hemminger
Cc: Jesse Gross
Tested-by: Mitsuo Hayasaka
Signed-off-by: David S. Miller
---
 net/core/link_watch.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 357bd4e..c3519c6 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -78,8 +78,13 @@ static void rfc2863_policy(struct net_device *dev)

 static bool linkwatch_urgent_event(struct net_device *dev)
 {
-	return netif_running(dev) && netif_carrier_ok(dev) &&
-		qdisc_tx_changing(dev);
+	if (!netif_running(dev))
+		return false;
+
+	if (dev->ifindex != dev->iflink)
+		return true;
+
+	return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
 }
--
cgit v1.1
From 4bc71cb983fd2844e603bf633df2bb53385182d2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Sat, 3 Sep 2011 03:34:30 +0000
Subject: net: consolidate and fix ethtool_ops->get_settings calling

This patch does several things:
- introduces __ethtool_get_settings which is called from ethtool code
  and from drivers as well. Put ASSERT_RTNL there.
- dev_ethtool_get_settings() is replaced by __ethtool_get_settings()
- changes calling in drivers so rtnl locking is respected. In
  iboe_get_rate, ->get_settings() was previously called unlocked. This
  fixes it. Also prb_calc_retire_blk_tmo() in af_packet.c had the same
  problem. Also fixed by calling __dev_get_by_index() instead of
  dev_get_by_index() and holding rtnl_lock for both calls.
- introduces rtnl_lock in bnx2fc_vport_create() and fcoe_vport_create()
  so bnx2fc_if_create() and fcoe_if_create() are called locked as they
  are from other places.
- use __ethtool_get_settings() in bonding code

Signed-off-by: Jiri Pirko

v2->v3:
- removed dev_ethtool_get_settings()
- added ASSERT_RTNL into __ethtool_get_settings()
- prb_calc_retire_blk_tmo - use __dev_get_by_index() and lock around it
  and the __ethtool_get_settings() call
v1->v2:
- add missing export_symbol

Reviewed-by: Ben Hutchings [except FCoE bits]
Acked-by: Ralf Baechle
Signed-off-by: David S. Miller
---
 net/core/dev.c       | 24 ------------------------
 net/core/ethtool.c   | 20 +++++++++++++++-----
 net/core/net-sysfs.c | 4 ++--
 3 files changed, 17 insertions(+), 31 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index b2e262e..4b9981c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4566,30 +4566,6 @@ void dev_set_rx_mode(struct net_device *dev)
 }

 /**
- *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
- *	@dev: device
- *	@cmd: memory area for ethtool_ops::get_settings() result
- *
- *	The cmd arg is initialized properly (cleared and
- *	ethtool_cmd::cmd field set to ETHTOOL_GSET).
- *
- *	Return device's ethtool_ops::get_settings() result value or
- *	-EOPNOTSUPP when device doesn't expose
- *	ethtool_ops::get_settings() operation.
- */
-int dev_ethtool_get_settings(struct net_device *dev,
-			     struct ethtool_cmd *cmd)
-{
-	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
-		return -EOPNOTSUPP;
-
-	memset(cmd, 0, sizeof(struct ethtool_cmd));
-	cmd->cmd = ETHTOOL_GSET;
-	return dev->ethtool_ops->get_settings(dev, cmd);
-}
-EXPORT_SYMBOL(dev_ethtool_get_settings);
-
-/**
  *	dev_get_flags - get flags reported to userspace
  *	@dev: device
  *

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6cdba5f..f444817 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -569,15 +569,25 @@ int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }

-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
-	struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
-	int err;
+	ASSERT_RTNL();

-	if (!dev->ethtool_ops->get_settings)
+	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
 		return -EOPNOTSUPP;

-	err = dev->ethtool_ops->get_settings(dev, &cmd);
+	memset(cmd, 0, sizeof(struct ethtool_cmd));
+	cmd->cmd = ETHTOOL_GSET;
+	return dev->ethtool_ops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(__ethtool_get_settings);
+
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+	int err;
+	struct ethtool_cmd cmd;
+
+	err = __ethtool_get_settings(dev, &cmd);
 	if (err < 0)
 		return err;

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 56e42ab..7604a63 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -147,7 +147,7 @@ static ssize_t show_speed(struct device *dev,
 	if (netif_running(netdev)) {
 		struct ethtool_cmd cmd;
-		if (!dev_ethtool_get_settings(netdev, &cmd))
+		if (!__ethtool_get_settings(netdev, &cmd))
 			ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
 	}
 	rtnl_unlock();
@@ -165,7 +165,7 @@ static ssize_t show_duplex(struct device *dev,
 	if (netif_running(netdev)) {
 		struct ethtool_cmd cmd;
-		if (!dev_ethtool_get_settings(netdev, &cmd))
+		if (!__ethtool_get_settings(netdev, &cmd))
 			ret = sprintf(buf, "%s\n",
 				      cmd.duplex ? "full" : "half");
 	}
--
cgit v1.1
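A sketch of a caller under the new contract: __ethtool_get_settings() now
requires the RTNL, which the ASSERT_RTNL() above enforces (report_link_speed
is a hypothetical example, not part of the patch):

static int report_link_speed(struct net_device *dev)
{
	struct ethtool_cmd cmd;
	int err;

	rtnl_lock();	/* required: the op may touch device state */
	err = __ethtool_get_settings(dev, &cmd);
	rtnl_unlock();
	if (err < 0)
		return err;

	pr_info("%s: %u Mb/s\n", dev->name, ethtool_cmd_speed(&cmd));
	return 0;
}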
From 16e5726269611b71c930054ffe9b858c1cea88eb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 19 Sep 2011 05:52:27 +0000
Subject: af_unix: dont send SCM_CREDENTIALS by default

Since commit 7361c36c5224 (af_unix: Allow credentials to work across
user and pid namespaces) af_unix performance dropped a lot.

This is because we now take a reference on pid and cred in each write(),
and release them in read(), usually done from another process,
eventually from another cpu. This triggers false sharing.

# Events: 154K cycles
#
# Overhead  Command    Shared Object      Symbol
# ........  .........  .................  .........................
#
    10.40%  hackbench  [kernel.kallsyms]  [k] put_pid
     8.60%  hackbench  [kernel.kallsyms]  [k] unix_stream_recvmsg
     7.87%  hackbench  [kernel.kallsyms]  [k] unix_stream_sendmsg
     6.11%  hackbench  [kernel.kallsyms]  [k] do_raw_spin_lock
     4.95%  hackbench  [kernel.kallsyms]  [k] unix_scm_to_skb
     4.87%  hackbench  [kernel.kallsyms]  [k] pid_nr_ns
     4.34%  hackbench  [kernel.kallsyms]  [k] cred_to_ucred
     2.39%  hackbench  [kernel.kallsyms]  [k] unix_destruct_scm
     2.24%  hackbench  [kernel.kallsyms]  [k] sub_preempt_count
     1.75%  hackbench  [kernel.kallsyms]  [k] fget_light
     1.51%  hackbench  [kernel.kallsyms]  [k] __mutex_lock_interruptible_slowpath
     1.42%  hackbench  [kernel.kallsyms]  [k] sock_alloc_send_pskb

This patch includes SCM_CREDENTIALS information in an af_unix
message/skb only if requested by the sender [man 7 unix for details on
how to include ancillary data using the sendmsg() system call].

Note: This might break buggy applications that expected SCM_CREDENTIALS
from an unaware write() system call, with the receiver not using the
SO_PASSCRED socket option.

If SOCK_PASSCRED is set on the source or destination socket, we still
include credentials for mere write() syscalls.

Performance boost in hackbench: more than 50% gain on a 16 thread
machine (2 quad-core cpus, 2 threads per core)

hackbench 20 thread 2000

4.228 sec instead of 9.102 sec

Signed-off-by: Eric Dumazet
Acked-by: Tim Chen
Signed-off-by: David S. Miller
---
 net/core/scm.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/scm.c b/net/core/scm.c
index 811b53f..ff52ad0 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -173,7 +173,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 			if (err)
 				goto error;

-			if (pid_vnr(p->pid) != p->creds.pid) {
+			if (!p->pid || pid_vnr(p->pid) != p->creds.pid) {
 				struct pid *pid;
 				err = -ESRCH;
 				pid = find_get_pid(p->creds.pid);
@@ -183,8 +183,9 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 				p->pid = pid;
 			}

-			if ((p->cred->euid != p->creds.uid) ||
-			    (p->cred->egid != p->creds.gid)) {
+			if (!p->cred ||
+			    (p->cred->euid != p->creds.uid) ||
+			    (p->cred->egid != p->creds.gid)) {
 				struct cred *cred;
 				err = -ENOMEM;
 				cred = prepare_creds();
@@ -193,7 +194,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 				cred->uid = cred->euid = p->creds.uid;
 				cred->gid = cred->egid = p->creds.gid;

-				put_cred(p->cred);
+				if (p->cred)
+					put_cred(p->cred);
 				p->cred = cred;
 			}
 			break;
--
cgit v1.1
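What "requested by the sender" means in practice, as a userspace sketch: a
sender that wants the receiver to see SCM_CREDENTIALS must now attach them
explicitly via sendmsg() ancillary data (unless SO_PASSCRED is set on either
socket). send_with_creds is a hypothetical helper:

#define _GNU_SOURCE		/* for struct ucred */
#include <sys/socket.h>
#include <string.h>
#include <unistd.h>

static ssize_t send_with_creds(int fd, const char *buf, size_t len)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(struct ucred))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct ucred cred = {
		.pid = getpid(), .uid = getuid(), .gid = getgid(),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_CREDENTIALS;	/* the explicit request */
	cm->cmsg_len = CMSG_LEN(sizeof(cred));
	memcpy(CMSG_DATA(cm), &cred, sizeof(cred));

	return sendmsg(fd, &msg, 0);
}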
From 5dd17e08f333cde0fa11000792e33d8d39b5599f Mon Sep 17 00:00:00 2001
From: Changli Gao
Date: Tue, 20 Sep 2011 22:36:07 +0000
Subject: net: rps: fix the support for PPPOE

The upper protocol numbers of PPPOE are different, and should be
treated specially.

Signed-off-by: Changli Gao
Signed-off-by: David S. Miller
---
 net/core/dev.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index bf49a47..7f4486e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
 #include
 #include
 #include
+#include

 #include "net-sysfs.h"

@@ -2556,6 +2557,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
 again:
 	switch (proto) {
 	case __constant_htons(ETH_P_IP):
+ip:
 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;

@@ -2569,6 +2571,7 @@ again:
 		nhoff += ip->ihl * 4;
 		break;
 	case __constant_htons(ETH_P_IPV6):
+ipv6:
 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;

@@ -2591,7 +2594,14 @@ again:
 		proto = *((__be16 *) (skb->data + nhoff +
 				      sizeof(struct pppoe_hdr)));
 		nhoff += PPPOE_SES_HLEN;
-		goto again;
+		switch (proto) {
+		case __constant_htons(PPP_IP):
+			goto ip;
+		case __constant_htons(PPP_IPV6):
+			goto ipv6;
+		default:
+			goto done;
+		}
 	default:
 		goto done;
 	}
--
cgit v1.1

From 09994d1b09bd9b0046a4708fa50d2106610a4058 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Mon, 3 Oct 2011 04:42:46 +0000
Subject: RPS: Ensure that an expired hardware filter can be re-added later

Amir Vadai wrote:
> When a stream is paused, and its rule is expired while it is paused,
> no new rule will be configured to the HW when traffic resume.
[...]
> - When stream was resumed, traffic was steered again by RSS, and
>   because current-cpu was equal to desired-cpu, ndo_rx_flow_steer
>   wasn't called and no rule was configured to the HW.

Fix this by setting the flow's current CPU only in the table for the
newly selected RX queue.

Reported-and-tested-by: Amir Vadai
Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/dev.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7f4486e..70ecb86 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2670,10 +2670,7 @@ static struct rps_dev_flow *
 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	    struct rps_dev_flow *rflow, u16 next_cpu)
 {
-	u16 tcpu;
-
-	tcpu = rflow->cpu = next_cpu;
-	if (tcpu != RPS_NO_CPU) {
+	if (next_cpu != RPS_NO_CPU) {
 #ifdef CONFIG_RFS_ACCEL
 		struct netdev_rx_queue *rxqueue;
 		struct rps_dev_flow_table *flow_table;
@@ -2701,16 +2698,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 			goto out;
 		old_rflow = rflow;
 		rflow = &flow_table->flows[flow_id];
-		rflow->cpu = next_cpu;
 		rflow->filter = rc;
 		if (old_rflow->filter == rflow->filter)
 			old_rflow->filter = RPS_NO_FILTER;
 	out:
 #endif
 		rflow->last_qtail =
-			per_cpu(softnet_data, tcpu).input_queue_head;
+			per_cpu(softnet_data, next_cpu).input_queue_head;
 	}

+	rflow->cpu = next_cpu;
 	return rflow;
 }
--
cgit v1.1
From 8083f0fc969d9b5353061a7a6f963405057e26b1 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Fri, 7 Oct 2011 03:30:20 +0000
Subject: net: use sock_valbool_flag to set/clear SOCK_RXQ_OVFL

There's no point in open-coding sock_valbool_flag().

Signed-off-by: Johannes Berg
Acked-by: Neil Horman
Signed-off-by: David S. Miller
---
 net/core/sock.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index b29ab61..83c462d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -738,10 +738,7 @@ set_rcvbuf:
 		/* We implement the SO_SNDLOWAT etc to
 		   not be settable (1003.1g 5.3) */
 	case SO_RXQ_OVFL:
-		if (valbool)
-			sock_set_flag(sk, SOCK_RXQ_OVFL);
-		else
-			sock_reset_flag(sk, SOCK_RXQ_OVFL);
+		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 		break;

 	default:
 		ret = -ENOPROTOOPT;
--
cgit v1.1
From 87fb4b7b533073eeeaed0b6bf7c2328995f6c075 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 13 Oct 2011 07:28:54 +0000
Subject: net: more accurate skb truesize

skb truesize currently accounts for the sk_buff struct and part of the
skb head. kmalloc() roundings are also ignored.

Considering that skb_shared_info is larger than sk_buff, it's time to
take it into account for better memory accounting.

This patch introduces the SKB_TRUESIZE(X) macro to centralize various
assumptions into a single place.

At skb alloc phase, we put skb_shared_info struct at the exact end of
the skb head, to allow a better use of memory (lowering number of
reallocations), since kmalloc() gives us power-of-two memory blocks.

Unless SLUB/SLAB debug is active, both skb->head and skb_shared_info
are aligned to cache lines, as before.

Note: This patch might trigger performance regressions because of
misconfigured protocol stacks, hitting per socket or global memory
limits that were previously not reached. But it's a necessary step for
a more accurate memory accounting.

Signed-off-by: Eric Dumazet
CC: Andi Kleen
CC: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 18 ++++++++++++++----
 net/core/sock.c   | 2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5b2c5f1..a7f855d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -184,11 +184,20 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		goto out;
 	prefetchw(skb);

-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-					 gfp_mask, node);
+	/* We do our best to align skb_shared_info on a separate cache
+	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+	 * Both skb->head and skb_shared_info are cache line aligned.
+	 */
+	size = SKB_DATA_ALIGN(size);
+	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	data = kmalloc_node_track_caller(size, gfp_mask, node);
 	if (!data)
 		goto nodata;
+	/* kmalloc(size) might give us more room than requested.
+	 * Put skb_shared_info exactly at the end of allocated zone,
+	 * to allow max possible filling before reallocation.
+	 */
+	size = SKB_WITH_OVERHEAD(ksize(data));
 	prefetchw(data + size);

 	/*
@@ -197,7 +206,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
-	skb->truesize = size + sizeof(struct sk_buff);
+	/* Account for allocated memory : skb + skb->head */
+	skb->truesize = SKB_TRUESIZE(size);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;

diff --git a/net/core/sock.c b/net/core/sock.c
index 83c462d..5a08762 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -207,7 +207,7 @@ static struct lock_class_key af_callback_keys[AF_MAX];
 * not depend upon such differences.
 */
 #define _SK_MEM_PACKETS		256
-#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
+#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
--
cgit v1.1
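The arithmetic the patch centralizes, sketched from memory of this era's
include/linux/skbuff.h (exact definitions may differ slightly):

/* round a head size up to a cache line */
#define SKB_DATA_ALIGN(X)	ALIGN(X, SMP_CACHE_BYTES)

/* usable head bytes once skb_shared_info is carved off the end
 * of the kmalloc()ed block */
#define SKB_WITH_OVERHEAD(X)	\
	((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* true memory cost of X bytes of payload: payload + sk_buff struct
 * + the shared info block, each cache-line aligned */
#define SKB_TRUESIZE(X) ((X) +					\
	SKB_DATA_ALIGN(sizeof(struct sk_buff)) +		\
	SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))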
From 5f8444a3fa617076f8da51a3e8ecce01a5d7f738 Mon Sep 17 00:00:00 2001
From: Greg Rose
Date: Sat, 8 Oct 2011 03:05:24 +0000
Subject: if_link: Add additional parameter to IFLA_VF_INFO for spoof checking

Add a configuration setting for drivers to turn spoof checking on or
off for discrete VFs.

v2 - Fix indentation problem, wrap the ifla_vf_info structure in
#ifdef __KERNEL__ to prevent user space from accessing it, and change
the function parameter for the spoof check setting netdev op from u8
to bool.
v3 - Preset spoof check setting to -1 so that user space tools such as
ip can detect that the driver didn't report a spoofcheck setting.
Prevents incorrect display of spoof check settings for drivers that
don't report it.

Signed-off-by: Greg Rose
Signed-off-by: Jeff Kirsher
---
 net/core/rtnetlink.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 39f8dd6..9083e82 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -731,7 +731,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev)
 		size += num_vfs *
 			(nla_total_size(sizeof(struct ifla_vf_mac)) +
 			 nla_total_size(sizeof(struct ifla_vf_vlan)) +
-			 nla_total_size(sizeof(struct ifla_vf_tx_rate)));
+			 nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
+			 nla_total_size(sizeof(struct ifla_vf_spoofchk)));
 		return size;
 	} else
 		return 0;
@@ -954,13 +955,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			struct ifla_vf_mac vf_mac;
 			struct ifla_vf_vlan vf_vlan;
 			struct ifla_vf_tx_rate vf_tx_rate;
+			struct ifla_vf_spoofchk vf_spoofchk;
+
+			/*
+			 * Not all SR-IOV capable drivers support the
+			 * spoofcheck query. Preset to -1 so the user
+			 * space tool can detect that the driver didn't
+			 * report anything.
+			 */
+			ivi.spoofchk = -1;
 			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
 				break;
-			vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
+			vf_mac.vf =
+				vf_vlan.vf =
+				vf_tx_rate.vf =
+				vf_spoofchk.vf = ivi.vf;
+
 			memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
 			vf_vlan.vlan = ivi.vlan;
 			vf_vlan.qos = ivi.qos;
 			vf_tx_rate.rate = ivi.tx_rate;
+			vf_spoofchk.setting = ivi.spoofchk;
 			vf = nla_nest_start(skb, IFLA_VF_INFO);
 			if (!vf) {
 				nla_nest_cancel(skb, vfinfo);
@@ -968,7 +983,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			}
 			NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
 			NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
-			NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
+			NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+				&vf_tx_rate);
+			NLA_PUT(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+				&vf_spoofchk);
 			nla_nest_end(skb, vf);
 		}
 		nla_nest_end(skb, vfinfo);
@@ -1202,6 +1220,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
 				ivt->rate);
 			break;
 		}
+		case IFLA_VF_SPOOFCHK: {
+			struct ifla_vf_spoofchk *ivs;
+			ivs = nla_data(vf);
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_spoofchk)
+				err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+							       ivs->setting);
+			break;
+		}
 		default:
 			err = -EINVAL;
 			break;
--
cgit v1.1
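A sketch of the driver hook this attribute reaches; the signature (vf index,
bool setting) follows the v2 note in the changelog above, while the adapter
type and bounds logic are hypothetical:

struct foo_adapter {
	int num_vfs;
	struct {
		bool spoofchk_enabled;
	} vfinfo[8];
};

static int foo_set_vf_spoofchk(struct net_device *dev, int vf, bool setting)
{
	struct foo_adapter *adapter = netdev_priv(dev);

	if (vf >= adapter->num_vfs)
		return -EINVAL;

	adapter->vfinfo[vf].spoofchk_enabled = setting;
	/* ...program the anti-spoof MAC/VLAN filter in hardware here... */
	return 0;
}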
A VLAN tagged frame is now routed to the eth2.228-fcoe iface in the above schematic. Untagged frames continue to the bond0 as normal. This case also remains intact, eth2 --> bond0 --> vlan.228 Here the skb is VLAN tagged but the vlan lookup fails on eth2 causing the bonding rx_handler to be called. On the second pass the vlan lookup is on the bond0 iface and completes as expected. Putting a VLAN.228 on both the bond0 and eth2 device will result in eth2.228 receiving the skb. I don't think this is completely unexpected and was the result prior to the rx_handler result. Note, the same setup is also used for other storage traffic that MPIO is used with eg. iSCSI and similar setups can be contrived without storage protocols. Signed-off-by: John Fastabend Acked-by: Jesse Gross Reviewed-by: Jiri Pirko Tested-by: Hans Schillstrom Signed-off-by: David S. Miller --- net/core/dev.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 70ecb86..8b6118a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3231,6 +3231,17 @@ another_round: ncls: #endif + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb)) + goto another_round; + else if (unlikely(!skb)) + goto out; + } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { @@ -3251,17 +3262,6 @@ ncls: } } - if (vlan_tx_tag_present(skb)) { - if (pt_prev) { - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = NULL; - } - if (vlan_do_receive(&skb)) - goto another_round; - else if (unlikely(!skb)) - goto out; - } - /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; -- cgit v1.1 From 9e903e085262ffbf1fc44a17ac06058aca03524a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Oct 2011 21:00:24 +0000 Subject: net: add skb frag size accessors To ease skb->truesize sanitization, its better to be able to localize all references to skb frags size. Define accessors : skb_frag_size() to fetch frag size, and skb_frag_size_{set|add|sub}() to manipulate it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 16 ++++++------ net/core/dev.c | 6 ++--- net/core/pktgen.c | 12 ++++----- net/core/skbuff.c | 72 ++++++++++++++++++++++++++++------------------------- net/core/user_dma.c | 4 +-- 5 files changed, 57 insertions(+), 53 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 6449bed..68bbf9f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,14 +324,14 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { int err; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (copy > len) @@ -410,14 +410,14 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { int err; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (copy > len) @@ -500,14 +500,14 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { int err; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (copy > len) @@ -585,15 +585,15 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { __wsum csum2; int err = 0; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (copy > len) diff --git a/net/core/dev.c b/net/core/dev.c index 8b6118a..cbb5918 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3489,9 +3489,9 @@ pull: skb->data_len -= grow; skb_shinfo(skb)->frags[0].page_offset += grow; - skb_shinfo(skb)->frags[0].size -= grow; + skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - if (unlikely(!skb_shinfo(skb)->frags[0].size)) { + if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { skb_frag_unref(skb, 0); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, @@ -3559,7 +3559,7 @@ void skb_gro_reset_offset(struct sk_buff *skb) !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(&skb_shinfo(skb)->frags[0]); - NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); } } EXPORT_SYMBOL(skb_gro_reset_offset); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 796044a..38d6577 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2606,13 +2606,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, skb_shinfo(skb)->frags[i].page_offset = 0; /*last fragment, fill rest of data*/ if (i == (frags - 1)) - skb_shinfo(skb)->frags[i].size = - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + skb_frag_size_set(&skb_shinfo(skb)->frags[i], + (datalen < PAGE_SIZE ? 
datalen : PAGE_SIZE)); else - skb_shinfo(skb)->frags[i].size = frag_len; - datalen -= skb_shinfo(skb)->frags[i].size; - skb->len += skb_shinfo(skb)->frags[i].size; - skb->data_len += skb_shinfo(skb)->frags[i].size; + skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); i++; skb_shinfo(skb)->nr_frags = i; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a7f855d..ce357d9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -659,7 +659,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); memcpy(page_address(page), - vaddr + f->page_offset, f->size); + vaddr + f->page_offset, skb_frag_size(f)); kunmap_skb_frag(vaddr); page->private = (unsigned long)head; head = page; @@ -1190,14 +1190,14 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) goto drop_pages; for (; i < nfrags; i++) { - int end = offset + skb_shinfo(skb)->frags[i].size; + int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); if (end < len) { offset = end; continue; } - skb_shinfo(skb)->frags[i++].size = len - offset; + skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); drop_pages: skb_shinfo(skb)->nr_frags = i; @@ -1306,9 +1306,11 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Estimate size of pulled pages. */ eat = delta; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size >= eat) + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size >= eat) goto pull_pages; - eat -= skb_shinfo(skb)->frags[i].size; + eat -= size; } /* If we need update frag list, we are in troubles. 
@@ -1371,14 +1373,16 @@ pull_pages: eat = delta; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size <= eat) { skb_frag_unref(skb, i); - eat -= skb_shinfo(skb)->frags[i].size; + eat -= size; } else { skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; if (eat) { skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; + skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); eat = 0; } k++; @@ -1433,7 +1437,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { u8 *vaddr; @@ -1632,7 +1636,7 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(skb_frag_page(f), - f->page_offset, f->size, + f->page_offset, skb_frag_size(f), offset, len, skb, spd, 0, sk, pipe)) return 1; } @@ -1742,7 +1746,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) WARN_ON(start > offset + len); - end = start + frag->size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u8 *vaddr; @@ -1815,7 +1819,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { __wsum csum2; u8 *vaddr; @@ -1890,7 +1894,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { __wsum csum2; u8 *vaddr; @@ -2163,7 +2167,7 @@ static inline void skb_split_no_header(struct sk_buff *skb, skb->data_len = len - pos; for (i = 0; i < nfrags; i++) { - int size = skb_shinfo(skb)->frags[i].size; + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + size > len) { skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; @@ -2179,8 +2183,8 @@ static inline void skb_split_no_header(struct sk_buff *skb, */ skb_frag_ref(skb, i); skb_shinfo(skb1)->frags[0].page_offset += len - pos; - skb_shinfo(skb1)->frags[0].size -= len - pos; - skb_shinfo(skb)->frags[i].size = len - pos; + skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); + skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; } k++; @@ -2258,7 +2262,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) } else { merge = to - 1; - todo -= fragfrom->size; + todo -= skb_frag_size(fragfrom); if (todo < 0) { if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) @@ -2268,8 +2272,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[merge]; - fragto->size += shiftlen; - fragfrom->size -= shiftlen; + skb_frag_size_add(fragto, shiftlen); + skb_frag_size_sub(fragfrom, shiftlen); fragfrom->page_offset += shiftlen; goto onlymerged; @@ -2293,9 +2297,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[to]; - if (todo >= fragfrom->size) { + if (todo >= skb_frag_size(fragfrom)) { *fragto = *fragfrom; - 
todo -= fragfrom->size; + todo -= skb_frag_size(fragfrom); from++; to++; @@ -2303,10 +2307,10 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) __skb_frag_ref(fragfrom); fragto->page = fragfrom->page; fragto->page_offset = fragfrom->page_offset; - fragto->size = todo; + skb_frag_size_set(fragto, todo); fragfrom->page_offset += todo; - fragfrom->size -= todo; + skb_frag_size_sub(fragfrom, todo); todo = 0; to++; @@ -2321,7 +2325,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragfrom = &skb_shinfo(skb)->frags[0]; fragto = &skb_shinfo(tgt)->frags[merge]; - fragto->size += fragfrom->size; + skb_frag_size_add(fragto, skb_frag_size(fragfrom)); __skb_frag_unref(fragfrom); } @@ -2419,7 +2423,7 @@ next_skb: while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = frag->size + st->stepped_offset; + block_limit = skb_frag_size(frag) + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) @@ -2437,7 +2441,7 @@ next_skb: } st->frag_idx++; - st->stepped_offset += frag->size; + st->stepped_offset += skb_frag_size(frag); } if (st->frag_data) { @@ -2567,13 +2571,13 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, left = PAGE_SIZE - frag->page_offset; copy = (length > left)? left : length; - ret = getfrag(from, skb_frag_address(frag) + frag->size, + ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), offset, copy, 0, skb); if (ret < 0) return -EFAULT; /* copy was successful so update the size parameters */ - frag->size += copy; + skb_frag_size_add(frag, copy); skb->len += copy; skb->data_len += copy; offset += copy; @@ -2720,11 +2724,11 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; __skb_frag_ref(frag); - size = frag->size; + size = skb_frag_size(frag); if (pos < offset) { frag->page_offset += offset - pos; - frag->size -= offset - pos; + skb_frag_size_sub(frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; @@ -2733,7 +2737,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) i++; pos += size; } else { - frag->size -= pos + size - (offset + len); + skb_frag_size_sub(frag, pos + size - (offset + len)); goto skip_fraglist; } @@ -2813,7 +2817,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } while (--i); frag->page_offset += offset; - frag->size -= offset; + skb_frag_size_sub(frag, offset); skb->truesize -= skb->data_len; skb->len -= skb->data_len; @@ -2865,7 +2869,7 @@ merge: unsigned int eat = offset - headlen; skbinfo->frags[0].page_offset += eat; - skbinfo->frags[0].size -= eat; + skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; offset = headlen; @@ -2936,7 +2940,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 34e9664..2d7cf3d 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -71,13 +71,13 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan, /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); copy = end - offset; if (copy > 0) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (copy > len) -- cgit v1.1 From 3d153a7c8b23031df35e61377c0600a6c96a8b0b Mon Sep 17 00:00:00 2001 From: Andy Fleming Date: Thu, 13 Oct 2011 04:33:54 +0000 Subject: net: Allow skb_recycle_check to be done in stages skb_recycle_check resets the skb if it's eligible for recycling. However, a driver may sometimes want to do additional work with the skb after eligibility has been determined but before the skb is reset. We do this by splitting the eligibility check from the skb reset, creating two inline functions to accomplish that task. Signed-off-by: Andy Fleming Acked-by: David Daney Signed-off-by: David S. Miller --- net/core/skbuff.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce357d9..e271040 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -485,6 +485,30 @@ void consume_skb(struct sk_buff *skb) EXPORT_SYMBOL(consume_skb); /** + * skb_recycle - clean up an skb for reuse + * @skb: buffer + * + * Recycles the skb to be reused as a receive buffer. This + * function does any necessary reference count dropping, and + * cleans up the skbuff as if it just came from __alloc_skb(). + */ +void skb_recycle(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + + skb_release_head_state(skb); + + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = skb->head + NET_SKB_PAD; + skb_reset_tail_pointer(skb); +} +EXPORT_SYMBOL(skb_recycle); + +/** * skb_recycle_check - check if skb can be reused for receive * @skb: buffer * @skb_size: minimum receive buffer size @@ -498,33 +522,10 @@ EXPORT_SYMBOL(consume_skb); */ bool skb_recycle_check(struct sk_buff *skb, int skb_size) { - struct skb_shared_info *shinfo; - - if (irqs_disabled()) - return false; - - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) - return false; - - if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) - return false; - - skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); - if (skb_end_pointer(skb) - skb->head < skb_size) - return false; - - if (skb_shared(skb) || skb_cloned(skb)) + if (!skb_is_recycleable(skb, skb_size)) return false; - skb_release_head_state(skb); - - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->data = skb->head + NET_SKB_PAD; - skb_reset_tail_pointer(skb); + skb_recycle(skb); return true; } -- cgit v1.1 From 850a545bd8a41648445bfc5541afe36ca105b0b8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Oct 2011 22:25:23 +0000 Subject: net: Move rcu_barrier from rollback_registered_many to netdev_run_todo. This patch moves the rcu_barrier from rollback_registered_many (inside the rtnl_lock) into netdev_run_todo (just outside the rtnl_lock). This allows us to gain the full benefit of synchronize_net calling synchronize_rcu_expedited when the rtnl_lock is held.
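For reference, synchronize_net() at this point in the tree behaves roughly as sketched below (paraphrased from net/core/dev.c of this era, not a verbatim quote), which is why keeping the wait under the lock on the expedited path matters:

void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();	/* fast path while holding rtnl */
	else
		synchronize_rcu();
}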
The rcu_barrier in rollback_registered_many was originally a synchronize_net but was promoted to be a rcu_barrier() when it was found that people were unnecessarily hitting the 250ms wait in netdev_wait_allrefs(). Changing the rcu_barrier back to a synchronize_net is therefore safe. Since we only care about waiting for the rcu callbacks before we get to netdev_wait_allrefs() it is also safe to move the wait into netdev_run_todo. This was tested by creating and destroying 1000 tap devices and observing /proc/lock_stat. /proc/lock_stat reports this change reduces the hold times of the rtnl_lock by a factor of 10. There was no observable difference in the amount of time it takes to destroy a network device. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cbb5918..09aef26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5235,7 +5235,7 @@ static void rollback_registered_many(struct list_head *head) dev = list_first_entry(head, struct net_device, unreg_list); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - rcu_barrier(); + synchronize_net(); list_for_each_entry(dev, head, unreg_list) dev_put(dev); @@ -5748,6 +5748,12 @@ void netdev_run_todo(void) __rtnl_unlock(); + /* Wait for rcu callbacks to finish before attempting to drain + * the device list. This usually avoids a 250ms wait. + */ + if (!list_empty(&list)) + rcu_barrier(); + while (!list_empty(&list)) { struct net_device *dev = list_first_entry(&list, struct net_device, todo_list); -- cgit v1.1 From 4dc360c5e7e155373bffbb3c1f7ea0022dee650c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 19 Oct 2011 17:00:35 -0400 Subject: net: validate HWTSTAMP ioctl parameters This patch adds a sanity check on the values provided by user space for the hardware time stamping configuration. If the values lie outside of the absolute limits, then the ioctl request will be denied. Signed-off-by: Richard Cochran Signed-off-by: David S. 
Miller --- net/core/dev.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 09aef26..40d4395 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -136,6 +136,7 @@ #include #include #include +#include <linux/net_tstamp.h> #include "net-sysfs.h" @@ -1477,6 +1478,57 @@ static inline void net_timestamp_check(struct sk_buff *skb) __net_timestamp(skb); } +static int net_hwtstamp_validate(struct ifreq *ifr) +{ + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags) /* reserved for future extensions */ + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + tx_type_valid = 1; + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + rx_filter_valid = 1; + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; +} + static inline bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) { @@ -4921,6 +4973,12 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + /* fall through */ + /* * Unknown or private ioctl */ -- cgit v1.1 From 4f25af27827080c3163e59c7af1ca84a05ce121c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Oct 2011 21:04:20 +0000 Subject: filter: use unsigned int to silence static checker warning This is just a cleanup. My testing version of Smatch warns about this: net/core/filter.c +380 check_load_and_stores(6) warn: check 'flen' for negative values. flen comes from the user. We try to clamp the values here between 1 and BPF_MAXINSNS but the clamp doesn't work because it could be negative. This is a bug, but it's not exploitable. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 8fcc2d7..5dea452 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -436,7 +436,7 @@ error: * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, int flen) +int sk_chk_filter(struct sock_filter *filter, unsigned int flen) { /* * Valid instructions are initialized to non-0. -- cgit v1.1 From e049f28883126c689cf95859480d9ee4ab23b7fa Mon Sep 17 00:00:00 2001 From: "roy.qing.li@gmail.com" Date: Mon, 17 Oct 2011 22:32:42 +0000 Subject: neigh: fix rcu splat in neigh_update() When dst_get_neighbour() is used to fetch a neighbour, the caller needs rcu_read_lock() protection, since dst_get_neighbour() uses rcu_dereference() internally.
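The required calling pattern is sketched below (a minimal illustration of the rule; the hunk further down applies exactly this to neigh_update()):

	struct neighbour *n;

	rcu_read_lock();
	n = dst_get_neighbour(dst);	/* rcu_dereference() happens inside */
	if (n)
		n->output(n, skb);	/* neighbour only guaranteed valid here */
	rcu_read_unlock();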
The bug was reported by Ari Savolainen [ 105.612095] [ 105.612096] =================================================== [ 105.612100] [ INFO: suspicious rcu_dereference_check() usage. ] [ 105.612101] --------------------------------------------------- [ 105.612103] include/net/dst.h:91 invoked rcu_dereference_check() without protection! [ 105.612105] [ 105.612106] other info that might help us debug this: [ 105.612106] [ 105.612108] [ 105.612108] rcu_scheduler_active = 1, debug_locks = 0 [ 105.612110] 1 lock held by dnsmasq/2618: [ 105.612111] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 [ 105.612120] [ 105.612121] stack backtrace: [ 105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41 [ 105.612125] Call Trace: [ 105.612129] [] lockdep_rcu_dereference+0xbb/0xc0 [ 105.612132] [] neigh_update+0x4f9/0x5f0 [ 105.612135] [] ? neigh_lookup+0xe1/0x220 [ 105.612139] [] arp_req_set+0xb8/0x230 [ 105.612142] [] arp_ioctl+0x1bf/0x310 [ 105.612146] [] ? lock_hrtimer_base.isra.26+0x30/0x60 [ 105.612150] [] inet_ioctl+0x85/0x90 [ 105.612154] [] sock_do_ioctl+0x30/0x70 [ 105.612157] [] sock_ioctl+0x73/0x280 [ 105.612162] [] do_vfs_ioctl+0x98/0x570 [ 105.612165] [] ? fget_light+0x340/0x3a0 [ 105.612168] [] sys_ioctl+0x4f/0x80 [ 105.612172] [] system_call_fastpath+0x16/0x1b Reported-by: Ari Savolainen Signed-off-by: RongQing Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4344964..909ecb3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1168,10 +1168,14 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); + + rcu_read_lock(); /* On shaper/eql skb->dst->neighbour != neigh :( */ if (dst && (n2 = dst_get_neighbour(dst)) != NULL) n1 = n2; n1->output(n1, skb); + rcu_read_unlock(); + write_lock_bh(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); -- cgit v1.1 From a0bec1cd8f7ac966f2885f7e56ec44df87bab738 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 18 Oct 2011 22:55:11 +0000 Subject: net: do not take an additional reference in skb_frag_set_page I audited all of the callers in the tree and only one of them (pktgen) expects it to do so. Taking this reference is pretty obviously confusing and error prone. In particular I looked at the following commits which switched callers of (__)skb_frag_set_page to the skb paged fragment api: 6a930b9f163d7e6d9ef692e05616c4ede65038ec cxgb3: convert to SKB paged frag API. 5dc3e196ea21e833128d51eb5b788a070fea1f28 myri10ge: convert to SKB paged frag API. 0e0634d20dd670a89af19af2a686a6cce943ac14 vmxnet3: convert to SKB paged frag API. 86ee8130a46769f73f8f423f99dbf782a09f9233 virtionet: convert to SKB paged frag API. 4a22c4c919c201c2a7f4ee09e672435a3072d875 sfc: convert to SKB paged frag API. 18324d690d6a5028e3c174fc1921447aedead2b8 cassini: convert to SKB paged frag API. b061b39e3ae18ad75466258cf2116e18fa5bbd80 benet: convert to SKB paged frag API. b7b6a688d217936459ff5cf1087b2361db952509 bnx2: convert to SKB paged frag API. 804cf14ea5ceca46554d5801e2817bba8116b7e5 net: xfrm: convert to SKB frag APIs ea2ab69379a941c6f8884e290fdd28c93936a778 net: convert core to skb paged frag APIs Signed-off-by: Ian Campbell Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 38d6577..6bbf008 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2602,6 +2602,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, if (!pkt_dev->page) break; } + get_page(pkt_dev->page); skb_frag_set_page(skb, i, pkt_dev->page); skb_shinfo(skb)->frags[i].page_offset = 0; /*last fragment, fill rest of data*/ -- cgit v1.1 From 33136d12be00596a8320d4fe88f95c44f67afbdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Oct 2011 17:00:21 -0400 Subject: pktgen: remove ndelay() call Daniel Turull reported inaccuracies in pktgen when using low packet rates, because we call ndelay(val) with values bigger than 20000. Instead of calling ndelay() for delays < 100us, we can instead loop calling ktime_now() only. Reported-by: Daniel Turull Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6bbf008..0001c24 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2145,9 +2145,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } start_time = ktime_now(); - if (remaining < 100000) - ndelay(remaining); /* really small just spin */ - else { + if (remaining < 100000) { + /* for small delays (<100us), just loop until limit is reached */ + do { + end_time = ktime_now(); + } while (ktime_lt(end_time, spin_until)); + } else { /* see do_nanosleep */ hrtimer_init_sleeper(&t, current); do { @@ -2162,8 +2165,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) hrtimer_cancel(&t.timer); } while (t.task && pkt_dev->running && !signal_pending(current)); __set_current_state(TASK_RUNNING); + end_time = ktime_now(); } - end_time = ktime_now(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); -- cgit v1.1 From a8605c6063f785858c1bc431d0bfe66c41e71cfa Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 19 Oct 2011 23:01:49 +0000 Subject: net: add opaque struct around skb frag page I've split this bit out of the skb frag destructor patch since it helps enforce the use of the fragment API. Signed-off-by: Ian Campbell Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e271040..ca4db40 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -668,14 +668,14 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) /* skb frags release userspace buffers */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); + skb_frag_unref(skb, i); uarg->callback(uarg); /* skb frags point to kernel buffers */ for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { - skb_shinfo(skb)->frags[i - 1].page_offset = 0; - skb_shinfo(skb)->frags[i - 1].page = head; + __skb_fill_page_desc(skb, i-1, head, 0, + skb_shinfo(skb)->frags[i - 1].size); head = (struct page *)head->private; } -- cgit v1.1 From f04565ddf52e401880f8ba51de0dff8ba51c99fd Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 20 Oct 2011 20:45:10 +0000 Subject: dev: use name hash for dev_seq_ops Instead of using the dev->next chain and trying to resync at each call to dev_seq_start, use the name hash, keeping the bucket and the offset in seq->private field. Tests revealed the following results for ifconfig > /dev/null * 1000 interfaces: * 0.114s without patch * 0.089s with patch * 3000 interfaces: * 0.489s without patch * 0.110s with patch * 5000 interfaces: * 1.363s without patch * 0.250s with patch * 128000 interfaces (other setup): * ~100s without patch * ~30s with patch Signed-off-by: Mihai Maruseac Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 40d4395..ad5d702 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4093,6 +4093,60 @@ static int dev_ifconf(struct net *net, char __user *arg) } #ifdef CONFIG_PROC_FS + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS) + +struct dev_iter_state { + struct seq_net_private p; + unsigned int pos; /* bucket << BUCKET_SPACE + offset */ +}; + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq) +{ + struct dev_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_node *p; + struct hlist_head *h; + unsigned int count, bucket, offset; + + bucket = get_bucket(state->pos); + offset = get_offset(state->pos); + h = &net->dev_name_head[bucket]; + count = 0; + hlist_for_each_entry_rcu(dev, p, h, name_hlist) { + if (count++ == offset) { + state->pos = set_bucket_offset(bucket, count); + return dev; + } + } + + return NULL; +} + +static inline struct net_device *dev_from_new_bucket(struct seq_file *seq) +{ + struct dev_iter_state *state = seq->private; + struct net_device *dev; + unsigned int bucket; + + bucket = get_bucket(state->pos); + do { + dev = dev_from_same_bucket(seq); + if (dev) + return dev; + + bucket++; + state->pos = set_bucket_offset(bucket, 0); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + /* * This is invoked by the /proc filesystem handler to display a device * in detail. 
@@ -4100,33 +4154,33 @@ static int dev_ifconf(struct net *net, char __user *arg) void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct net *net = seq_file_net(seq); - loff_t off; - struct net_device *dev; + struct dev_iter_state *state = seq->private; rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; - off = 1; - for_each_netdev_rcu(net, dev) - if (off++ == *pos) - return dev; + /* check for end of the hash */ + if (state->pos == 0 && *pos > 1) + return NULL; - return NULL; + return dev_from_new_bucket(seq); } void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = v; + struct net_device *dev; + + ++*pos; if (v == SEQ_START_TOKEN) - dev = first_net_device_rcu(seq_file_net(seq)); - else - dev = next_net_device_rcu(dev); + return dev_from_new_bucket(seq); - ++*pos; - return dev; + dev = dev_from_same_bucket(seq); + if (dev) + return dev; + + return dev_from_new_bucket(seq); } void dev_seq_stop(struct seq_file *seq, void *v) @@ -4225,7 +4279,7 @@ static const struct seq_operations dev_seq_ops = { static int dev_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); + sizeof(struct dev_iter_state)); } static const struct file_operations dev_seq_fops = { -- cgit v1.1 From cf533ea53ebfae41be15b103d78e7ebec30b9969 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Oct 2011 05:22:42 -0400 Subject: tcp: add const qualifiers where possible Adding const qualifiers to pointers can ease code review, and spot some bugs. It might allow compiler to optimize code further. For example, is it legal to temporary write a null cksum into tcphdr in tcp_md5_hash_header() ? I am afraid a sniffer could catch the temporary null value... Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/secure_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 45329d7..025233d 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr, +__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) { u32 secret[MD5_MESSAGE_BYTES / 4]; -- cgit v1.1 From da92b194cc36b5dc1fbd85206aeeffd80bee0c39 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 21 Oct 2011 00:49:15 +0000 Subject: net: hold sock reference while processing tx timestamps The pair of functions, * skb_clone_tx_timestamp() * skb_complete_tx_timestamp() were designed to allow timestamping in PHY devices. The first function, called during the MAC driver's hard_xmit method, identifies PTP protocol packets, clones them, and gives them to the PHY device driver. The PHY driver may hold onto the packet and deliver it at a later time using the second function, which adds the packet to the socket's error queue. As pointed out by Johannes, nothing prevents the socket from disappearing while the cloned packet is sitting in the PHY driver awaiting a timestamp. This patch fixes the issue by taking a reference on the socket for each such packet. In addition, the comments regarding the usage of these function are expanded to highlight the rule that PHY drivers must use skb_complete_tx_timestamp() to release the packet, in order to release the socket reference, too. 
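A hypothetical PHY driver's deferred completion path then looks like the sketch below (the foo_* names are illustrative, not a real driver): every exit must hand the clone to skb_complete_tx_timestamp(), since that is now the only correct way to drop the socket reference taken by skb_clone_tx_timestamp().

/* Sketch of a PHY driver's deferred timestamp completion; foo_read_tx_stamp
 * is a hypothetical helper that fetches the hardware stamp, if any. */
static void foo_txtstamp(struct phy_device *phydev,
			 struct sk_buff *skb, int type)
{
	struct skb_shared_hwtstamps hwts = { };

	if (!foo_read_tx_stamp(phydev, skb, &hwts.hwtstamp)) {
		/* No stamp available: passing NULL still frees the clone
		 * and releases the socket reference. */
		skb_complete_tx_timestamp(skb, NULL);
		return;
	}
	skb_complete_tx_timestamp(skb, &hwts);
}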
These functions first appeared in v2.6.36. Reported-by: Johannes Berg Signed-off-by: Richard Cochran Cc: Signed-off-by: Eric Dumazet Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/core/timestamping.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 98a5264..82fb288 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -57,9 +57,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) case PTP_CLASS_V2_VLAN: phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) + if (!clone) { + sock_put(sk); return; + } clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } @@ -77,8 +81,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock_exterr_skb *serr; int err; - if (!hwtstamps) + if (!hwtstamps) { + sock_put(sk); + kfree_skb(skb); return; + } *skb_hwtstamps(skb) = *hwtstamps; serr = SKB_EXT_ERR(skb); @@ -87,6 +94,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; skb->sk = NULL; err = sock_queue_err_skb(sk, skb); + sock_put(sk); if (err) kfree_skb(skb); } -- cgit v1.1 From d2237d35748e7f448a9c2d9dc6a85ef637466e24 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Oct 2011 06:24:20 +0000 Subject: rtnetlink: Add missing manual netlink notification in dev_change_net_namespaces Renato Westphal noticed that since commit a2835763e130c343ace5320c20d33c281e7097b7 "rtnetlink: handle rtnl_link netlink notifications manually" was merged we no longer send a netlink message when a networking device is moved from one network namespace to another. Fix this by adding the missing manual notification in dev_change_net_namespaces. Since all network devices that are processed by dev_change_net_namspaces are in the initialized state the complicated tests that guard the manual rtmsg_ifinfo calls in rollback_registered and register_netdevice are unnecessary and we can just perform a plain notification. Cc: stable@kernel.org Tested-by: Renato Westphal Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b10ff0a..ae5cf2d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6115,6 +6115,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* * Flush the unicast and multicast chains -- cgit v1.1
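As a closing illustration, the restored notification can be observed from user space with a minimal RTNETLINK listener along these lines (an illustrative sketch, error handling trimmed); moving a device out of its namespace, e.g. via ip link set eth0 netns <pid>, should once again produce an RTM_DELLINK in the old namespace:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK,
				  .nl_groups = RTMGRP_LINK };
	char buf[8192];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nh;

		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
		     nh = NLMSG_NEXT(nh, len)) {
			if (nh->nlmsg_type == RTM_DELLINK)
				printf("RTM_DELLINK\n");
			else if (nh->nlmsg_type == RTM_NEWLINK)
				printf("RTM_NEWLINK\n");
		}
	}
}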