From 4ed377e36ec2f385484d12e516faf88516fad31c Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 21 Sep 2013 06:32:34 +0200 Subject: net: neighbour: use source address of last enqueued packet for solicitation Currently we always use the first member of the arp_queue to determine the sender IP address of the ARP packet (or, in the case of IPv6, the source address of the ndisc packet). This skb is fixed as long as the queue is not drained by a complete purge because of a timeout or by a successful response. If the first packet enqueued on the arp_queue is from a local application with a manually set source address, and the system being discovered performs some kind of uRPF check on the source address in the ARP packet, the resolving process hangs until a timeout and then restarts. This hurts communication with the participating network node. This could be mitigated a bit if we use the latest enqueued skb's source address for the resolving process, which is not as static as the arp_queue's head. This change of the source address could result in better recovery from a failed solicitation. Cc: "David S. Miller" Cc: Julian Anastasov Reviewed-by: Julian Anastasov Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6072610..ca15f32 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -867,7 +867,7 @@ static void neigh_invalidate(struct neighbour *neigh) static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_copy(skb, GFP_ATOMIC); -- cgit v1.1 From 7863c054d1b4fd35f76c13e2e918f7f483fe48f4 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:06 +0200 Subject: net: use lists as arguments instead of bool upper Currently we use bool upper to specify whether we want to work with the upper or the lower list. It is, however, harder to read and debug, and it occupies a lot more code. Fix this by just passing the correct upper/lower_dev_list list_head pointer instead of bool upper, and working with it internally. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 54 ++++++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2..9be7937 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4385,12 +4385,9 @@ struct netdev_adjacent { static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, - bool upper) + struct list_head *dev_list) { struct netdev_adjacent *adj; - struct list_head *dev_list; - - dev_list = upper ?
&dev->upper_dev_list : &dev->lower_dev_list; list_for_each_entry(adj, dev_list, list) { if (adj->dev == adj_dev) @@ -4402,13 +4399,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, struct net_device *udev) { - return __netdev_find_adj(dev, udev, true); + return __netdev_find_adj(dev, udev, &dev->upper_dev_list); } static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, struct net_device *ldev) { - return __netdev_find_adj(dev, ldev, false); + return __netdev_find_adj(dev, ldev, &dev->lower_dev_list); } /** @@ -4514,12 +4511,12 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, - bool neighbour, bool master, - bool upper) + struct list_head *dev_list, + bool neighbour, bool master) { struct netdev_adjacent *adj; - adj = __netdev_find_adj(dev, adj_dev, upper); + adj = __netdev_find_adj(dev, adj_dev, dev_list); if (adj) { BUG_ON(neighbour); @@ -4538,19 +4535,14 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, dev_hold(adj_dev); pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", - adj_dev->name, upper ? "upper" : "lower", dev->name, - adj_dev->name); + adj_dev->name, dev_list == &dev->upper_dev_list ? + "upper" : "lower", dev->name, adj_dev->name); - if (!upper) { - list_add_tail_rcu(&adj->list, &dev->lower_dev_list); - return 0; - } - - /* Ensure that master upper link is always the first item in list. */ + /* Ensure that master link is always the first item in list. */ if (master) - list_add_rcu(&adj->list, &dev->upper_dev_list); + list_add_rcu(&adj->list, dev_list); else - list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + list_add_tail_rcu(&adj->list, dev_list); return 0; } @@ -4559,27 +4551,25 @@ static inline int __netdev_upper_dev_insert(struct net_device *dev, struct net_device *udev, bool master, bool neighbour) { - return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, - true); + return __netdev_adjacent_dev_insert(dev, udev, &dev->upper_dev_list, + neighbour, master); } static inline int __netdev_lower_dev_insert(struct net_device *dev, struct net_device *ldev, bool neighbour) { - return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, - false); + return __netdev_adjacent_dev_insert(dev, ldev, &dev->lower_dev_list, + neighbour, false); } void __netdev_adjacent_dev_remove(struct net_device *dev, - struct net_device *adj_dev, bool upper) + struct net_device *adj_dev, + struct list_head *dev_list) { struct netdev_adjacent *adj; - if (upper) - adj = __netdev_find_upper(dev, adj_dev); - else - adj = __netdev_find_lower(dev, adj_dev); + adj = __netdev_find_adj(dev, adj_dev, dev_list); if (!adj) BUG(); @@ -4591,8 +4581,8 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, list_del_rcu(&adj->list); pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", - adj_dev->name, upper ? "upper" : "lower", dev->name, - adj_dev->name); + adj_dev->name, dev_list == &dev->upper_dev_list ? 
+ "upper" : "lower", dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); } @@ -4600,13 +4590,13 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, static inline void __netdev_upper_dev_remove(struct net_device *dev, struct net_device *udev) { - return __netdev_adjacent_dev_remove(dev, udev, true); + return __netdev_adjacent_dev_remove(dev, udev, &dev->upper_dev_list); } static inline void __netdev_lower_dev_remove(struct net_device *dev, struct net_device *ldev) { - return __netdev_adjacent_dev_remove(dev, ldev, false); + return __netdev_adjacent_dev_remove(dev, ldev, &dev->lower_dev_list); } int __netdev_adjacent_dev_insert_link(struct net_device *dev, -- cgit v1.1 From 2f268f129c2d1a05d297fe3ee34d393f862d2b22 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:07 +0200 Subject: net: add adj_list to save only neighbours Currently, we distinguish neighbours (first-level linked devices) from non-neighbours by the neighbour bool in the netdev_adjacent. This could be quite time-consuming in case we would like to traverse *only* through neighbours - cause we'd have to traverse through all devices and check for this flag, and in a (quite common) scenario where we have lots of vlans on top of bridge, which is on top of a bond - the bonding would have to go through all those vlans to get its upper neighbour linked devices. This situation is really unpleasant, cause there are already a lot of cases when a device with slaves needs to go through them in hot path. To fix this, introduce a new upper/lower device lists structure - adj_list, which contains only the neighbours. It works always in pair with the all_adj_list structure (renamed from upper/lower_dev_list), i.e. both of them contain the same links, only that all_adj_list contains also non-neighbour device links. It's really a small change visible, currently, only for __netdev_adjacent_dev_insert/remove(), and doesn't change the main linked logic at all. Also, add some comments a fix a name collision in netdev_for_each_upper_dev_rcu() and rework the naming by the following rules: netdev_(all_)(upper|lower)_* If "all_" is present, then we work with the whole list of upper/lower devices, otherwise - only with direct neighbours. Uninline functions - to get better stack traces. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. 
Miller --- net/core/dev.c | 203 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 103 insertions(+), 100 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9be7937..9a395e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4373,9 +4373,6 @@ struct netdev_adjacent { /* upper master flag, there can only be one master device per list */ bool master; - /* indicates that this dev is our first-level lower/upper device */ - bool neighbour; - /* counter for the number of times this device was added to us */ u16 ref_nr; @@ -4385,29 +4382,17 @@ struct netdev_adjacent { static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, - struct list_head *dev_list) + struct list_head *adj_list) { struct netdev_adjacent *adj; - list_for_each_entry(adj, dev_list, list) { + list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } -static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *udev) -{ - return __netdev_find_adj(dev, udev, &dev->upper_dev_list); -} - -static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, - struct net_device *ldev) -{ - return __netdev_find_adj(dev, ldev, &dev->lower_dev_list); -} - /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4422,7 +4407,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_upper(dev, upper_dev); + return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4437,7 +4422,7 @@ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->upper_dev_list); + return !list_empty(&dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); @@ -4454,10 +4439,10 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ASSERT_RTNL(); - if (list_empty(&dev->upper_dev_list)) + if (list_empty(&dev->adj_list.upper)) return NULL; - upper = list_first_entry(&dev->upper_dev_list, + upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; @@ -4465,15 +4450,15 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list +/* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. 
*/ -struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *upper; @@ -4481,14 +4466,14 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - if (&upper->list == &dev->upper_dev_list) + if (&upper->list == &dev->all_adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } -EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device @@ -4501,7 +4486,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; - upper = list_first_or_null_rcu(&dev->upper_dev_list, + upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; @@ -4512,14 +4497,13 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, - bool neighbour, bool master) + bool master) { struct netdev_adjacent *adj; adj = __netdev_find_adj(dev, adj_dev, dev_list); if (adj) { - BUG_ON(neighbour); adj->ref_nr++; return 0; } @@ -4530,13 +4514,11 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; - adj->neighbour = neighbour; adj->ref_nr = 1; - dev_hold(adj_dev); - pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", - adj_dev->name, dev_list == &dev->upper_dev_list ? - "upper" : "lower", dev->name, adj_dev->name); + + pr_debug("dev_hold for %s, because of link added from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); /* Ensure that master link is always the first item in list. */ if (master) @@ -4547,22 +4529,6 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; } -static inline int __netdev_upper_dev_insert(struct net_device *dev, - struct net_device *udev, - bool master, bool neighbour) -{ - return __netdev_adjacent_dev_insert(dev, udev, &dev->upper_dev_list, - neighbour, master); -} - -static inline int __netdev_lower_dev_insert(struct net_device *dev, - struct net_device *ldev, - bool neighbour) -{ - return __netdev_adjacent_dev_insert(dev, ldev, &dev->lower_dev_list, - neighbour, false); -} - void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) @@ -4571,73 +4537,102 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, adj = __netdev_find_adj(dev, adj_dev, dev_list); - if (!adj) + if (!adj) { + pr_err("tried to remove device %s from %s\n", + dev->name, adj_dev->name); BUG(); + } if (adj->ref_nr > 1) { + pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, + adj->ref_nr-1); adj->ref_nr--; return; } list_del_rcu(&adj->list); - pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", - adj_dev->name, dev_list == &dev->upper_dev_list ? 
- "upper" : "lower", dev->name, adj_dev->name); + pr_debug("dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); } -static inline void __netdev_upper_dev_remove(struct net_device *dev, - struct net_device *udev) -{ - return __netdev_adjacent_dev_remove(dev, udev, &dev->upper_dev_list); -} - -static inline void __netdev_lower_dev_remove(struct net_device *dev, - struct net_device *ldev) -{ - return __netdev_adjacent_dev_remove(dev, ldev, &dev->lower_dev_list); -} - -int __netdev_adjacent_dev_insert_link(struct net_device *dev, - struct net_device *upper_dev, - bool master, bool neighbour) +int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + bool master) { int ret; - ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, master); if (ret) return ret; - ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, false); if (ret) { - __netdev_upper_dev_remove(dev, upper_dev); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); return ret; } return 0; } -static inline int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *udev) +int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return __netdev_adjacent_dev_insert_link(dev, udev, false, false); + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + false); } -static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, - struct net_device *udev, - bool master) +void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { - return __netdev_adjacent_dev_insert_link(dev, udev, master, true); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, down_list); } void __netdev_adjacent_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - __netdev_upper_dev_remove(dev, upper_dev); - __netdev_lower_dev_remove(upper_dev, dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); +} + +int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + bool master) +{ + int ret = __netdev_adjacent_dev_link(dev, upper_dev); + + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + master); + if (ret) { + __netdev_adjacent_dev_unlink(dev, upper_dev); + return ret; + } + + return 0; } +void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) @@ -4651,10 +4646,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. 
*/ - if (__netdev_find_upper(upper_dev, dev)) + if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_upper(dev, upper_dev)) + if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) @@ -4665,12 +4660,14 @@ static int __netdev_upper_dev_link(struct net_device *dev, return ret; /* Now that we linked these devs, make all the upper_dev's - * upper_dev_list visible to every dev's lower_dev_list and vice + * all_adj_list.upper visible to every dev's all_adj_list.lower and vice * versa, and don't forget the devices themselves. All of these * links are non-neighbours. */ - list_for_each_entry(i, &dev->lower_dev_list, list) { - list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); ret = __netdev_adjacent_dev_link(i->dev, j->dev); if (ret) goto rollback_mesh; @@ -4678,14 +4675,18 @@ } /* add dev to every upper_dev's upper device */ - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); ret = __netdev_adjacent_dev_link(dev, i->dev); if (ret) goto rollback_upper_mesh; } /* add upper_dev to every dev's lower device */ - list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); ret = __netdev_adjacent_dev_link(i->dev, upper_dev); if (ret) goto rollback_lower_mesh; @@ -4696,7 +4697,7 @@ rollback_lower_mesh: to_i = i; - list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { if (i == to_i) break; __netdev_adjacent_dev_unlink(i->dev, upper_dev); @@ -4706,7 +4707,7 @@ rollback_lower_mesh: rollback_upper_mesh: to_i = i; - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { if (i == to_i) break; __netdev_adjacent_dev_unlink(dev, i->dev); @@ -4717,8 +4718,8 @@ rollback_upper_mesh: rollback_mesh: to_i = i; to_j = j; - list_for_each_entry(i, &dev->lower_dev_list, list) { - list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { if (i == to_i && j == to_j) break; __netdev_adjacent_dev_unlink(i->dev, j->dev); @@ -4727,7 +4728,7 @@ rollback_mesh: break; } - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } @@ -4781,23 +4782,23 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct netdev_adjacent *i, *j; ASSERT_RTNL(); - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower * devices from all upper_dev's upper devices and vice * versa, to maintain the graph relationship. */ - list_for_each_entry(i, &dev->lower_dev_list, list) - list_for_each_entry(j, &upper_dev->upper_dev_list, list) + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(i->dev, j->dev); /* also remove the devices themselves from the lower/upper device * list */ - list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(i, &dev->all_adj_list.lower, list) __netdev_adjacent_dev_unlink(i->dev, upper_dev); - list_for_each_entry(i, &upper_dev->upper_dev_list, list) + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); @@ -6059,8 +6060,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); - INIT_LIST_HEAD(&dev->upper_dev_list); - INIT_LIST_HEAD(&dev->lower_dev_list); + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->all_adj_list.upper); + INIT_LIST_HEAD(&dev->all_adj_list.lower); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); -- cgit v1.1
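To make the new pairing concrete, here is a rough sketch of the include/linux/netdevice.h side of this change, which is not visible in this net/core-limited view; the field layout is assumed from the INIT_LIST_HEAD() calls above:

	struct net_device {
		...
		/* directly linked (first-level neighbour) devices only */
		struct {
			struct list_head upper;
			struct list_head lower;
		} adj_list;

		/* all linked devices, neighbours and non-neighbours alike */
		struct {
			struct list_head upper;
			struct list_head lower;
		} all_adj_list;
		...
	};

Every neighbour link therefore lives on both pairs, while the links created for the non-neighbour mesh live only on all_adj_list.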
From 5249dec7380cb64928d2ae6201028b4da1dceb1e Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:08 +0200 Subject: net: add RCU variant to search for netdev_adjacent link Currently we have only the RTNL flavour; however, we can traverse the lists while holding only RCU, so add an RCU search. The new RCU variant takes a list_head * as an argument, so that it can be used universally afterwards. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9a395e0..9290f09 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4380,6 +4380,19 @@ struct netdev_adjacent { struct rcu_head rcu; }; +static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + list_for_each_entry_rcu(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; + } + return NULL; +} + static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, struct list_head *adj_list) -- cgit v1.1 From 402dae9614557296e84543008a8e582c28fb1db3 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:09 +0200 Subject: net: add netdev_adjacent->private and allow to use it Currently, even though we can access any linked device, we can't attach anything to it, and being able to do so is vital to managing linked devices properly. To fix this, add a new void *private to netdev_adjacent and functions setting/getting it (per link), so that we can save, for example, bonding's slave structures there, per slave device. netdev_master_upper_dev_link_private(dev, upper_dev, private) links dev to upper dev and populates the neighbour link only with private. netdev_lower_dev_get_private{,_rcu}() returns the private, if found. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S.
Miller --- net/core/dev.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9290f09..c69ab74 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4376,6 +4376,9 @@ struct netdev_adjacent { /* counter for the number of times this device was added to us */ u16 ref_nr; + /* private field for the users */ + void *private; + struct list_head list; struct rcu_head rcu; }; @@ -4510,7 +4513,7 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, - bool master) + void *private, bool master) { struct netdev_adjacent *adj; @@ -4528,6 +4531,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; + adj->private = private; dev_hold(adj_dev); pr_debug("dev_hold for %s, because of link added from %s to %s\n", @@ -4574,15 +4578,17 @@ int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, - bool master) + void *private, bool master) { int ret; - ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, master); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, + master); if (ret) return ret; - ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, false); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, + false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, up_list); return ret; @@ -4597,7 +4603,7 @@ int __netdev_adjacent_dev_link(struct net_device *dev, return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower, - false); + NULL, false); } void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, @@ -4619,7 +4625,7 @@ void __netdev_adjacent_dev_unlink(struct net_device *dev, int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, - bool master) + void *private, bool master) { int ret = __netdev_adjacent_dev_link(dev, upper_dev); @@ -4629,7 +4635,7 @@ int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, - master); + private, master); if (ret) { __netdev_adjacent_dev_unlink(dev, upper_dev); return ret; @@ -4648,7 +4654,8 @@ void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, } static int __netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, bool master) + struct net_device *upper_dev, bool master, + void *private) { struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -4668,7 +4675,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + master); if (ret) return ret; @@ -4759,7 +4767,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4777,10 +4785,18 @@ EXPORT_SYMBOL(netdev_upper_dev_link); int netdev_master_upper_dev_link(struct net_device *dev, struct 
net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, true); + return __netdev_upper_dev_link(dev, upper_dev, true, NULL); } EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); + /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -4818,6 +4834,36 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void *netdev_lower_dev_get_private_rcu(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu); + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.1 From 31088a113c2a948856ed2047d8c21c217b13e85d Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:12 +0200 Subject: net: add for_each iterators through neighbour lower link's private Add a possibility to iterate through netdev_adjacent's private, currently only for lower neighbours. Add both RCU and RTNL/other locking variants of iterators, and make the non-RCU variant safe from removal. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c69ab74..0aa844aa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4466,7 +4466,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * @@ -4492,6 +4493,63 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); /** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + if (iter) + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + if (iter) + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * -- cgit v1.1
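A minimal usage sketch for the new accessor, not part of the patch: handle_slave() is a hypothetical consumer, and the loop assumes every lower neighbour was linked with a non-NULL private (a NULL private would end the walk early, since NULL is also the end-of-list return value):

	struct list_head *iter = dev->adj_list.lower.next;
	void *priv;

	ASSERT_RTNL();	/* or the caller's own locking, per the comment above */
	while ((priv = netdev_lower_get_next_private(dev, &iter)) != NULL)
		handle_slave(priv);	/* hypothetical per-neighbour work */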
From b6ccba4c681fdaf0070e580bf951badf7edc860b Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:23 +0200 Subject: net: add a possibility to get private from netdev_adjacent->list It will be useful for getting the first/last element. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0aa844aa..acc1181 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4466,6 +4466,16 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + /** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device -- cgit v1.1
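For example, a hedged sketch of the kind of first/last access this helper is meant for, fetching the first and last lower neighbour's private under RTNL:

	void *first = NULL, *last = NULL;

	ASSERT_RTNL();
	if (!list_empty(&dev->adj_list.lower)) {
		first = netdev_adjacent_get_private(dev->adj_list.lower.next);
		last = netdev_adjacent_get_private(dev->adj_list.lower.prev);
	}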
From 842d67a7b34ea735155812ecf0671a481284f358 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:31 +0200 Subject: net: expose the master link to sysfs, and remove it from bond Currently, we can have only one master upper neighbour, so it would be useful to create a symlink to it in the sysfs device directory, the way that bonding now does it, for every device. Lower devices from bridge/team/etc will automagically get it, so we could rely on it. Also, remove the same functionality from bonding. CC: Jay Vosburgh CC: Andy Gospodarek CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index acc1181..de443ee 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4584,6 +4584,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, void *private, bool master) { struct netdev_adjacent *adj; + int ret; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4606,12 +4607,23 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj_dev->name, dev->name, adj_dev->name); /* Ensure that master link is always the first item in list. */ - if (master) + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto free_adj; + list_add_rcu(&adj->list, dev_list); - else + } else { list_add_tail_rcu(&adj->list, dev_list); + } return 0; + +free_adj: + kfree(adj); + + return ret; } void __netdev_adjacent_dev_remove(struct net_device *dev, @@ -4635,6 +4647,9 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, return; } + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + list_del_rcu(&adj->list); pr_debug("dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); -- cgit v1.1 From 5831d66e8097aedfa3bc35941cf265ada2352317 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 25 Sep 2013 09:20:32 +0200 Subject: net: create sysfs symlinks for neighbour devices Also, remove the same functionality from bonding - it will already be done for any device that links to its lower/upper neighbour. The links will be created for the dev's kobject, and will look like lower_eth0 for lower device eth0 and upper_bridge0 for upper device bridge0. CC: Jay Vosburgh CC: Andy Gospodarek CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index de443ee..25ab6fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4584,6 +4584,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, void *private, bool master) { struct netdev_adjacent *adj; + char linkname[IFNAMSIZ+7]; int ret; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4606,12 +4607,26 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, pr_debug("dev_hold for %s, because of link added from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), linkname); + if (ret) + goto free_adj; + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), linkname); + if (ret) + goto free_adj; + } + /* Ensure that master link is always the first item in list.
*/ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) - goto free_adj; + goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { @@ -4620,6 +4635,15 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; +remove_symlinks: + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } + free_adj: kfree(adj); @@ -4631,6 +4655,7 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, struct list_head *dev_list) { struct netdev_adjacent *adj; + char linkname[IFNAMSIZ+7]; adj = __netdev_find_adj(dev, adj_dev, dev_list); @@ -4650,6 +4675,14 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } + list_del_rcu(&adj->list); pr_debug("dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); -- cgit v1.1 From 62748f32d501f5d3712a7c372bbb92abc7c62bc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2013 08:20:52 -0700 Subject: net: introduce SO_MAX_PACING_RATE As mentioned in commit afe4fd062416b ("pkt_sched: fq: Fair Queue packet scheduler"), this patch adds a new socket option. SO_MAX_PACING_RATE offers the application the ability to cap the rate computed by the transport layer. The value is in bytes per second; for example: u32 val = 1000000; setsockopt(sockfd, SOL_SOCKET, SO_MAX_PACING_RATE, &val, sizeof(val)); To be effectively paced, a flow must use the FQ packet scheduler. Note that a packet scheduler takes the headers into account for its computations. The effective payload rate depends on the MSS and on retransmits, if any. I chose to make this pacing rate a SOL_SOCKET option instead of a TCP one because this can be used by other protocols. Signed-off-by: Eric Dumazet Cc: Steinar H. Gunderson Cc: Michael Kerrisk Signed-off-by: David S. Miller
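Expanding the inline snippet above into a fuller userspace sketch; this is illustrative only, and the fallback value 47 is assumed from the asm-generic uapi header of this series:

	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef SO_MAX_PACING_RATE
	#define SO_MAX_PACING_RATE 47	/* assumed asm-generic value */
	#endif

	static int cap_pacing_rate(int sockfd)
	{
		unsigned int val = 1000000;	/* bytes per second */
		socklen_t len = sizeof(val);

		if (setsockopt(sockfd, SOL_SOCKET, SO_MAX_PACING_RATE,
			       &val, sizeof(val)) < 0)
			return -1;
		/* read the cap back through the matching getsockopt case */
		if (getsockopt(sockfd, SOL_SOCKET, SO_MAX_PACING_RATE,
			       &val, &len) < 0)
			return -1;
		printf("max pacing rate: %u bytes/sec\n", val);
		return 0;
	}

As the message notes, the cap is stored unconditionally, but actual pacing only happens once the flow goes through the FQ packet scheduler.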
--- net/core/sock.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5b6beba..2bd9b3f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -914,6 +914,13 @@ set_rcvbuf: } break; #endif + + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + default: ret = -ENOPROTOOPT; break; @@ -1177,6 +1184,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; #endif + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + default: return -ENOPROTOOPT; } @@ -2319,6 +2330,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_ll_usec = sysctl_net_busy_read; #endif + sk->sk_max_pacing_rate = ~0U; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.1 From a528c219df2e865e178c538c7178961dfed5a13c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 25 Sep 2013 12:02:44 +0200 Subject: dev: update __dev_notify_flags() to send rtnl msg This patch only prepares for the next one; there is no functional change. Now, __dev_notify_flags() can also be used to notify flags changes via rtnetlink. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 11 ++++++----- net/core/rtnetlink.c | 3 +-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 25ab6fe..594a6b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5235,10 +5235,14 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -5274,10 +5278,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags) return ret; changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2a0e21d..4aedf03 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1647,9 +1647,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, ~0U); return 0; } EXPORT_SYMBOL(rtnl_configure_link); -- cgit v1.1 From 991fb3f74c142e1a1891ff4f7e9a6285a79a8ea1 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 25 Sep 2013 12:02:45 +0200 Subject: dev: always advertise rx_flags changes via netlink When flags IFF_PROMISC and IFF_ALLMULTI are changed, netlink messages are not consistent. For example, if a multicast daemon is running (flag IFF_ALLMULTI set in dev->flags but not dev->gflags, ie not exported to userspace) and then a user sets it via netlink (flag IFF_ALLMULTI set in dev->flags and dev->gflags, ie exported to userspace), no netlink message is sent.
Same for IFF_PROMISC and because dev->promiscuity is exported via IFLA_PROMISCUITY, we may send a netlink message after each change of this counter. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 60 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 594a6b0..81340ed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,7 +4988,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; @@ -5031,6 +5031,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -5050,7 +5052,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) @@ -5059,22 +5061,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc) } EXPORT_SYMBOL(dev_set_promiscuity); -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); @@ -5097,9 +5086,30 @@ int dev_set_allmulti(struct net_device *dev, int inc) if (dev->flags ^ old_flags) { dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); } return 0; } + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + return __dev_set_allmulti(dev, inc, true); +} EXPORT_SYMBOL(dev_set_allmulti); /* @@ -5124,10 +5134,10 @@ void __dev_set_rx_mode(struct net_device *dev) * therefore calling __dev_set_promiscuity here is safe. 
*/ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -5216,9 +5226,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -5229,7 +5243,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + __dev_set_allmulti(dev, inc, false); } return ret; @@ -5271,13 +5285,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, int dev_change_flags(struct net_device *dev, unsigned int flags) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes); return ret; } -- cgit v1.1 From 357afe9c46c951c34769e39cabdf8d1637e2eecc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 2 Oct 2013 13:39:24 +0200 Subject: flow_dissector: factor out the ports extraction in skb_flow_get_ports Factor out the code that extracts the ports from skb_flow_dissect and add a new function skb_flow_get_ports which can be re-used. Suggested-by: Veaceslav Falico Signed-off-by: Nikolay Aleksandrov Acked-by: Eric Dumazet Reviewed-by: Veaceslav Falico Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 39 ++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 8d7d0dd..f8e25ac 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -25,9 +25,35 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); } +/** + * skb_flow_get_ports - extract the upper layer ports and return them + * @skb: buffer to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +{ + int poff = proto_ports_offset(ip_proto); + + if (poff >= 0) { + __be32 *ports, _ports; + + ports = skb_header_pointer(skb, thoff + poff, + sizeof(_ports), &_ports); + if (ports) + return *ports; + } + + return 0; +} +EXPORT_SYMBOL(skb_flow_get_ports); + bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) { - int poff, nhoff = skb_network_offset(skb); + int nhoff = skb_network_offset(skb); u8 ip_proto; __be16 proto = skb->protocol; @@ -150,16 +176,7 @@ ipv6: } flow->ip_proto = ip_proto; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - __be32 *ports, _ports; - - ports = skb_header_pointer(skb, nhoff + poff, - sizeof(_ports), &_ports); - if (ports) - flow->ports = *ports; - } - + flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; return true; -- cgit v1.1 From 5cde282938915f36a2e6769b51c24c4159654859 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 5 Oct 2013 19:26:05 -0700 Subject: net: Separate the close_list and the unreg_list v2 Separate the unreg_list and the close_list in dev_close_many, preventing dev_close_many from permuting the unreg_list. The permutations of the unreg_list have resulted in cases where the loopback device is accessed after it has been freed, in code such as dst_ifdown, resulting in subtle memory corruption. This is the second bug from sharing the storage between the close_list and the unreg_list. The issues that crop up with sharing are apparently too subtle to show up in normal testing or usage, so let's forget about being clever and use two separate lists. v2: Make all callers pass in a close_list to dev_close_many Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller
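Distilled from the diff below, the new calling convention is that every caller links the device onto a dedicated close_list rather than reusing unreg_list; a single-device close now looks like:

	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_close_many(&single);
	list_del(&single);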
--- net/core/dev.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c25db20..fa0b2b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1307,7 +1307,7 @@ static int __dev_close_many(struct list_head *head) ASSERT_RTNL(); might_sleep(); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1323,7 +1323,7 @@ static int __dev_close_many(struct list_head *head) dev_deactivate_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* @@ -1351,7 +1351,7 @@ static int __dev_close(struct net_device *dev) /* Temporarily disable netpoll until the interface is down */ netpoll_rx_disable(dev); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); @@ -1362,21 +1362,20 @@ static int dev_close_many(struct list_head *head) { struct net_device *dev, *tmp; - LIST_HEAD(tmp_list); - list_for_each_entry_safe(dev, tmp, head, unreg_list) + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) - list_move(&dev->unreg_list, &tmp_list); + list_del_init(&dev->close_list); __dev_close_many(head); - list_for_each_entry(dev, head, unreg_list) { + list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_DOWN, dev); + list_del_init(&dev->close_list); } - /* rollback_registered_many needs the complete original list */ - list_splice(&tmp_list, head); return 0; } @@ -1397,7 +1396,7 @@ int dev_close(struct net_device *dev) /* Block netpoll rx while the interface is going down */ netpoll_rx_disable(dev); - list_add(&dev->unreg_list, &single); + list_add(&dev->close_list, &single); dev_close_many(&single); list_del(&single); @@ -5439,6 +5438,7 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { struct net_device *dev, *tmp; + LIST_HEAD(close_head); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -5461,7 +5461,9 @@ static void rollback_registered_many(struct list_head *head) } /* If device is running, close it first. */ - dev_close_many(head); + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ @@ -6257,6 +6259,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); -- cgit v1.1 From e1af5e445ef8582e8f690fadcd63797db1e62663 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Tue, 8 Oct 2013 11:05:19 +0800 Subject: cgroup: netprio: remove unnecessary task_netprioidx Since the tasks have been migrated to the cgroup, there is no need to call task_netprioidx to get the task's cgroup id. Signed-off-by: Gao feng Acked-by: Neil Horman Signed-off-by: David S. Miller
--- net/core/netprio_cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index d9cd627..9b7cf6c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -222,11 +222,10 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p; - void *v; + void *v = (void *)(unsigned long)css->cgroup->id; cgroup_taskset_for_each(p, css, tset) { task_lock(p); - v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } -- cgit v1.1 From 8a29111c7ca68d928dfab58636f3f6acf0ac04f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2013 09:02:23 -0700 Subject: net: gro: allow to build full sized skb skb_gro_receive() is currently limited to 16 or 17 MSS per GRO skb, typically 24616 bytes, because it fills up to MAX_SKB_FRAGS frags. It's relatively easy to extend the skb using frag_list to allow more frags to be appended into the last sk_buff. This still builds very efficient skbs, and allows reaching 45 MSS per skb. (A 45 MSS GRO packet uses one skb plus a frag_list containing 2 additional sk_buffs.) High speed TCP flows benefit from this extension by lowering TCP stack cpu usage (fewer packets stored in the receive queue, fewer ACK packets processed). Forwarding setups could be hurt, as such skbs will need to be linearized, although it's not a new problem, as GRO could already provide skbs with a frag_list. We could make the 65536-byte threshold a tunable to mitigate this. (The first time we need to linearize an skb in skb_needs_linearize(), we could lower the tunable to ~16*1460 so that following skb_gro_receive() calls build smaller skbs.) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
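The forwarding concern above maps onto the stack's existing linearization logic; roughly, and as a paraphrase of the current skb_needs_linearize() handling rather than anything this patch adds:

	/* a GRO skb built this way may carry a frag_list, so a device
	 * without NETIF_F_FRAGLIST support forces a costly linearize */
	if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
		err = __skb_linearize(skb);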
--- net/core/skbuff.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d81cff1..8ead744 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2936,32 +2936,30 @@ EXPORT_SYMBOL_GPL(skb_segment); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct sk_buff *p = *head; - struct sk_buff *nskb; - struct skb_shared_info *skbinfo = skb_shinfo(skb); - struct skb_shared_info *pinfo = skb_shinfo(p); - unsigned int headroom; - unsigned int len = skb_gro_len(skb); + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); + struct sk_buff *nskb, *lp, *p = *head; + unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int headroom; - if (p->len + len >= 65536) + if (unlikely(p->len + len >= 65536)) return -E2BIG; - if (pinfo->frag_list) - goto merge; - else if (headlen <= offset) { + lp = NAPI_GRO_CB(p)->last ?: p; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; - offset -= headlen; - if (nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; + offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; @@ -2992,7 +2990,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - return -E2BIG; + goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + @@ -3010,7 +3008,10 @@ ... delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; - } else if (skb_gro_len(p) != pinfo->gso_size) + } + if (pinfo->frag_list) + goto merge; + if (skb_gro_len(p) != pinfo->gso_size) return -E2BIG; headroom = skb_headroom(p); @@ -3062,16 +3063,24 @@ merge: __skb_pull(skb, offset); - NAPI_GRO_CB(p)->last->next = skb; + if (!NAPI_GRO_CB(p)->last) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; skb_header_release(skb); + lp = p; done: NAPI_GRO_CB(p)->count++; p->data_len += len; p->truesize += delta_truesize; p->len += len; - + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -- cgit v1.1 From 400dfd3ae899849b27d398ca7894e1b44430887f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Oct 2013 16:27:07 -0700 Subject: net: refactor sk_page_frag_refill() While working on virtio_net's new allocation strategy to increase the payload/truesize ratio, we found that refactoring sk_page_frag_refill() was needed. This patch splits sk_page_frag_refill() into two parts, adding skb_page_frag_refill(), which can be used without a socket. While we are at it, add a minimum frag size of 32 for sk_page_frag_refill(). Michael will either use netdev_alloc_frag() from softirq context, or skb_page_frag_refill() from process context in refill_work() (GFP_KERNEL allocations). Signed-off-by: Eric Dumazet Cc: Michael Dalton Signed-off-by: David S. Miller
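A hedged usage sketch of the new socket-less variant from process context, along the lines of what the virtio_net refill path could do; the buffer hand-off shown is illustrative:

	struct page_frag frag = { .page = NULL };

	if (skb_page_frag_refill(len, &frag, GFP_KERNEL)) {	/* len <= PAGE_SIZE */
		void *buf = page_address(frag.page) + frag.offset;

		get_page(frag.page);	/* reference for the consumer of buf */
		frag.offset += len;
		/* post buf/len to the device, or copy payload into it */
	}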
Miller --- net/core/sock.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index fd6afa2..440afdc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1847,7 +1847,17 @@ EXPORT_SYMBOL(sock_alloc_send_skb); /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) -bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @prio: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) { int order; @@ -1856,16 +1866,16 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag->offset = 0; return true; } - if (pfrag->offset < pfrag->size) + if (pfrag->offset + sz <= pfrag->size) return true; put_page(pfrag->page); } /* We restrict high order allocations to users that can afford to wait */ - order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; + order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; do { - gfp_t gfp = sk->sk_allocation; + gfp_t gfp = prio; if (order) gfp |= __GFP_COMP | __GFP_NOWARN; @@ -1877,6 +1887,15 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } } while (--order >= 0); + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + sk_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return false; -- cgit v1.1 From 030737bcc3c404e273e97dbe06fe9561699a411b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Oct 2013 11:42:54 -0700 Subject: net: generalize skb_segment() While implementing GSO/TSO support for IPIP, I found skb_segment() was assuming the network header was immediately following the mac header. It's not really true when inet_gso_segment() is stacked: by the time tcp_gso_segment() is called, the network header points to the inner IP header. Let's instead assume nothing and pick the current offsets found in the original skb; we have the skb_headers_offset_update() helper for that. Also move the csum_start update inside skb_headers_offset_update(). Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/skbuff.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8ead744..0ab32fa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -903,6 +903,9 @@ EXPORT_SYMBOL(skb_clone); static void skb_headers_offset_update(struct sk_buff *skb, int off) { + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += off; /* {transport,network,mac}_header and tail are relative to skb->head */ skb->transport_header += off; skb->network_header += off; @@ -1109,9 +1112,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #endif skb->tail += off; skb_headers_offset_update(skb, nhead); - /* Only adjust this if it actually is csum_start rather than csum */ - if (skb->ip_summed == CHECKSUM_PARTIAL) - skb->csum_start += nhead; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -1176,7 +1176,6 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, NUMA_NO_NODE); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; - int off; if (!n) return NULL; @@ -1200,11 +1199,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, copy_skb_header(n, skb); - off = newheadroom - oldheadroom; - if (n->ip_summed == CHECKSUM_PARTIAL) - n->csum_start += off; - - skb_headers_offset_update(n, off); + skb_headers_offset_update(n, newheadroom - oldheadroom); return n; } @@ -2837,14 +2832,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) __copy_skb_header(nskb, skb); nskb->mac_len = skb->mac_len; - /* nskb and skb might have different headroom */ - if (nskb->ip_summed == CHECKSUM_PARTIAL) - nskb->csum_start += skb_headroom(nskb) - headroom; - - skb_reset_mac_header(nskb); - skb_set_network_header(nskb, skb->mac_len); - nskb->transport_header = (nskb->network_header + - skb_network_header_len(skb)); + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, -- cgit v1.1 From 3347c960295583eee3fd58e5c539fb1972fbc005 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Oct 2013 11:42:56 -0700 Subject: ipv4: gso: make inet_gso_segment() stackable In order to support GSO on IPIP, we need to make inet_gso_segment() stackable. It should not assume network header starts right after mac header. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1b6eadf..0918aad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2377,6 +2377,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, } SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + skb_reset_mac_header(skb); skb_reset_mac_len(skb); -- cgit v1.1 From cb32f511a70be8967ac9025cf49c44324ced9a39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Oct 2013 11:42:57 -0700 Subject: ipip: add GSO/TSO support Now that inet_gso_segment() is stackable, it's relatively easy to implement GSO/TSO support for IPIP. Performance results, when segmentation is done after the tunnel device (as no NIC is yet enabled for TSO IPIP support): Before patch : lpq83:~# ./netperf -H 7.7.9.84 -Cc MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 3357.88 5.09 3.70 2.983 2.167 After patch : lpq83:~# ./netperf -H 7.7.9.84 -Cc MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.9.84 () port 0 AF_INET Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 7710.19 4.52 6.62 1.152 1.687 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 78e9d92..8cab774 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -81,6 +81,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", -- cgit v1.1 From a48e42920ff38bc90bbf75143fff4555723d4540 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 19 Oct 2013 21:48:55 +0200 Subject: net: introduce new macro net_get_random_once net_get_random_once is a new macro which handles the initialization of secret keys. It is possible to call it in the fast path. Only the initialization depends on the spinlock and is rather slow. Otherwise it should get used just before the key is used, to delay the entropy extraction as late as possible to get better randomness. It returns true if the key got initialized. The usage of static_keys for net_get_random_once is a bit uncommon so it needs some further explanation why this actually works: === In the simple non-HAVE_JUMP_LABEL case we actually have === no constraints to use static_key_(true|false) on keys initialized with STATIC_KEY_INIT_(FALSE|TRUE). So this path just expands in favor of the likely case that the initialization is already done. The key is initialized like this: ___done_key = { .enabled = ATOMIC_INIT(0) } The check if (!static_key_true(&___done_key)) \ expands into (pseudo code) if (!likely(___done_key > 0)), so we take the fast path as soon as ___done_key is increased from the helper function.
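Modelled in plain C, the non-HAVE_JUMP_LABEL fast path just described amounts to the following; the fake_* names are illustrative stand-ins for the kernel primitives, not the real implementation:

#include <stdatomic.h>
#include <stdbool.h>

struct fake_static_key { atomic_int enabled; };

/* STATIC_KEY_INIT_FALSE -> enabled == 0 */
static struct fake_static_key done_key = { ATOMIC_VAR_INIT(0) };

static inline bool fake_static_key_true(struct fake_static_key *k)
{
	/* expands to likely(enabled > 0): false until the helper has
	 * incremented the key, true on every later call */
	return atomic_load_explicit(&k->enabled, memory_order_relaxed) > 0;
}

static bool need_init(void)
{
	return !fake_static_key_true(&done_key);	/* slow path taken only once */
}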
=== If HAVE_JUMP_LABELs are available this depends === on patching of jumps into the prepared NOPs, which is done in jump_label_init at boot-up time (from start_kernel). It is forbidden and dangerous to use net_get_random_once in functions which are called before that! At compilation time NOPs are generated at the call sites of net_get_random_once. E.g. net/ipv6/inet6_hashtable.c:inet6_ehashfn (we need to call net_get_random_once two times in inet6_ehashfn, so two NOPs): 71: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 76: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) Both will be patched to the actual jumps to the end of the function to call __net_get_random_once at boot time as explained above. arch_static_branch is optimized and inlined for false as return value and actually also returns false in case the NOP is placed in the instruction stream. So in the fast case we get a "return false". But because we initialize ___done_key with (enabled != (entries & 1)) this call-site will get patched up at boot, thus returning true. The final check looks like this: if (!static_key_true(&___done_key)) \ ___ret = __net_get_random_once(buf, \ expands to if (!!static_key_false(&___done_key)) \ ___ret = __net_get_random_once(buf, \ So we get true at boot time and as soon as static_key_slow_inc is called on the key it will invert the logic and return false for the fast path. static_key_slow_inc will change the branch because it got initialized with .enabled == 0. After static_key_slow_inc is called on the key the branch is replaced with a nop again. === Misc: === The helper defers the increment into a workqueue so we don't have problems calling this code from atomic sections. A separate boolean (___done) guards the case where we enter net_get_random_once again before the increment happened. Cc: Ingo Molnar Cc: Steven Rostedt Cc: Jason Baron Cc: Peter Zijlstra Cc: Eric Dumazet Cc: "David S. Miller" Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller
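As a rough userspace analogue of the whole mechanism (a pthread mutex standing in for the spinlock, an atomic flag standing in for the static key, rand() standing in for get_random_bytes(); a sketch under those assumptions, not the kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

static pthread_mutex_t once_lock = PTHREAD_MUTEX_INITIALIZER;

static bool get_random_once(void *buf, size_t nbytes, atomic_bool *done)
{
	unsigned char *p = buf;
	bool ret = false;
	size_t i;

	/* fast path: in the kernel this check is the patched jump */
	if (atomic_load_explicit(done, memory_order_acquire))
		return false;

	pthread_mutex_lock(&once_lock);
	if (!atomic_load_explicit(done, memory_order_relaxed)) {
		for (i = 0; i < nbytes; i++)
			p[i] = (unsigned char)rand();	/* stand-in for get_random_bytes() */
		atomic_store_explicit(done, true, memory_order_release);
		ret = true;	/* we performed the one-time initialization */
	}
	pthread_mutex_unlock(&once_lock);
	return ret;
}

A caller would keep a static atomic_bool next to the key buffer and invoke get_random_once(key, sizeof(key), &seeded) on every fast-path entry; only the very first call pays for the lock and the extraction.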
--- net/core/utils.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'net/core') diff --git a/net/core/utils.c b/net/core/utils.c index aa88e23..bf09371 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -338,3 +338,51 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, csum_unfold(*sum))); } EXPORT_SYMBOL(inet_proto_csum_replace16); + +struct __net_random_once_work { + struct work_struct work; + struct static_key *key; +}; + +static void __net_random_once_deferred(struct work_struct *w) { + struct __net_random_once_work *work = + container_of(w, struct __net_random_once_work, work); + if (!static_key_enabled(work->key)) + static_key_slow_inc(work->key); + kfree(work); +} + +static void __net_random_once_disable_jump(struct static_key *key) { + struct __net_random_once_work *w; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return; + + INIT_WORK(&w->work, __net_random_once_deferred); + w->key = key; + schedule_work(&w->work); +} + +bool __net_get_random_once(void *buf, int nbytes, bool *done, + struct static_key *done_key) { + static DEFINE_SPINLOCK(lock); + + spin_lock_bh(&lock); + if (*done) { + spin_unlock_bh(&lock); + return false; + } + + get_random_bytes(buf, nbytes); + *done = true; + spin_unlock_bh(&lock); + + __net_random_once_disable_jump(done_key); + + return true; +} +EXPORT_SYMBOL(__net_get_random_once); -- cgit v1.1 From e34c9a69970d8664a36b46e6445a7cc879111cfd Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 19 Oct 2013 21:48:59 +0200 Subject: net: switch net_secret key generation to net_get_random_once Cc: Eric Dumazet Cc: "David S. Miller" Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/secure_seq.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 3f1ec15..b02fd16 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -16,18 +17,7 @@ static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; static void net_secret_init(void) { - u32 tmp; - int i; - - if (likely(net_secret[0])) - return; - - for (i = NET_SECRET_SIZE; i > 0;) { - do { - get_random_bytes(&tmp, sizeof(tmp)); - } while (!tmp); - cmpxchg(&net_secret[--i], 0, tmp); - } + net_get_random_once(net_secret, sizeof(net_secret)); } #ifdef CONFIG_INET -- cgit v1.1 From 61c1db7fae21ed33c614356a43bf6580c5e53118 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 20 Oct 2013 20:47:30 -0700 Subject: ipv6: sit: add GSO/TSO support Now that ipv6_gso_segment() is stackable, it's relatively easy to implement GSO/TSO support for SIT tunnels. Performance results, when segmentation is done after the tunnel device (as no NIC is yet enabled for TSO SIT support): Before patch : lpq84:~# ./netperf -H 2002:af6:1153:: -Cc MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6 Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 3168.31 4.81 4.64 2.988 2.877 After patch : lpq84:~# ./netperf -H 2002:af6:1153:: -Cc MIGRATED TCP STREAM TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:1153:: () port 0 AF_INET6 Recv Send Send Utilization Service Demand Socket Socket Message Elapsed Send Recv Send Recv Size Size Size Time Throughput local remote local remote bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB 87380 16384 16384 10.00 5525.00 7.76 5.17 2.763 1.840 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 8cab774..8629898 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -82,6 +82,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", + [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", -- cgit v1.1 From 0a6957e7d47096bbeedda4e1d926359eb487dcfc Mon Sep 17 00:00:00 2001 From: ZHAO Gang Date: Tue, 22 Oct 2013 16:23:38 +0800 Subject: net: remove function sk_reset_txq() All sk_reset_txq() does is call sk_tx_queue_clear(), and sk_reset_txq() is used only in sock.h, by dst_negative_advice(). Let dst_negative_advice() call sk_tx_queue_clear() directly so we can remove the unneeded sk_reset_txq(). Signed-off-by: ZHAO Gang Signed-off-by: David S. Miller --- net/core/sock.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 440afdc..ab20ed9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -475,12 +475,6 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); -void sk_reset_txq(struct sock *sk) -{ - sk_tx_queue_clear(sk); -} -EXPORT_SYMBOL(sk_reset_txq); - struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); -- cgit v1.1 From 34d92d5315b64a3e5292b7e9511c1bb617227fb6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Oct 2013 08:44:50 +0200 Subject: net: always inline net_secret_init Currently net_secret_init does not get inlined, so we always have a call to net_secret_init even in the fast path. Let's specify net_secret_init as __always_inline so we have the nop in the fast path without the call to net_secret_init and the unlikely path at the epilogue of the function. jump_labels handle the inlining correctly. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/secure_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b02fd16..90e8a82 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -15,7 +15,7 @@ static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; -static void net_secret_init(void) +static __always_inline void net_secret_init(void) { net_get_random_once(net_secret, sizeof(net_secret)); } -- cgit v1.1 From 974daef7f8bb5d7be78fae3a240fcce43cae0135 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 23 Oct 2013 15:28:56 +0200 Subject: net: add missing dev_put() in __netdev_adjacent_dev_insert I think that a dev_put() is needed in the error path to preserve the proper dev refcount.
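The underlying rule, in a minimal sketch (hypothetical names; only the hold/put pairing on the error path matters, mirroring dev_hold()/dev_put()):

struct obj { int refcnt; };

static void obj_hold(struct obj *o) { o->refcnt++; }
static void obj_put(struct obj *o)  { o->refcnt--; }

/* stand-in for the sysfs symlink step that can fail in the real code */
static int create_links(struct obj *dev, struct obj *adj)
{
	(void)dev; (void)adj;
	return -1;	/* pretend failure to exercise the error path */
}

static int adjacent_insert(struct obj *dev, struct obj *adj)
{
	obj_hold(adj);			/* mirrors dev_hold(adj_dev) */

	if (create_links(dev, adj))
		goto free_adj;
	return 0;

free_adj:
	obj_put(adj);			/* the missing dev_put(): undo the hold on failure */
	return -1;
}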
CC: Veaceslav Falico Signed-off-by: Nikolay Aleksandrov Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0918aad..bdffd65 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4648,6 +4648,7 @@ remove_symlinks: free_adj: kfree(adj); + dev_put(adj_dev); return ret; } -- cgit v1.1 From f84be2bd96a108b09c8440263fa3adb3fb225fa3 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Oct 2013 20:05:27 +0200 Subject: net: make net_get_random_once irq safe I initially built a non-irq-safe version of net_get_random_once because I wanted the freedom to defer even the extraction process of get_random_bytes until the nonblocking pool is fully seeded. I don't think this is a good idea anymore, and thus this patch makes net_get_random_once irq safe. Now someone using net_get_random_once does not need to care where it is called from. Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/utils.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/utils.c b/net/core/utils.c index bf09371..2f737bf 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -370,16 +370,17 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, struct static_key *done_key) { static DEFINE_SPINLOCK(lock); + unsigned long flags; - spin_lock_bh(&lock); + spin_lock_irqsave(&lock, flags); if (*done) { - spin_unlock_bh(&lock); + spin_unlock_irqrestore(&lock, flags); return false; } get_random_bytes(buf, nbytes); *done = true; - spin_unlock_bh(&lock); + spin_unlock_irqrestore(&lock, flags); __net_random_once_disable_jump(done_key); -- cgit v1.1 From 66415cf8a1b99d101317f5aa08574b1ec8832672 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Oct 2013 20:06:00 +0200 Subject: net: initialize hashrnd in flow_dissector with net_get_random_once We can also defer the initialization of hashrnd in flow_dissector to its first use. Since net_get_random_once is irq safe now, we don't have to audit the call paths if one of these functions gets called by an interrupt handler. Cc: David S. Miller Cc: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- net/core/flow_dissector.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index f8e25ac..5cac36e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -184,6 +184,22 @@ ipv6: EXPORT_SYMBOL(skb_flow_dissect); static u32 hashrnd __read_mostly; +static __always_inline void __flow_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +{ + __flow_hash_secret_init(); + return jhash_3words(a, b, c, hashrnd); +} + +static __always_inline u32 __flow_hash_1word(u32 a) +{ + __flow_hash_secret_init(); + return jhash_1word(a, hashrnd); +} /* * __skb_get_rxhash: calculate a flow hash based on src/dst addresses @@ -210,9 +226,9 @@ void __skb_get_rxhash(struct sk_buff *skb) swap(keys.port16[0], keys.port16[1]); } - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, hashrnd); + hash = __flow_hash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports); if (!hash) hash = 1; @@ -248,7 +264,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol; - hash = jhash_1word(hash, hashrnd); + hash = __flow_hash_1word(hash); return (u16) (((u64) hash * qcount) >> 32) + qoffset; } @@ -340,7 +356,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) else hash = (__force u16) skb->protocol ^ skb->rxhash; - hash = jhash_1word(hash, hashrnd); + hash = __flow_hash_1word(hash); queue_index = map->queues[ ((u64)hash * map->len) >> 32]; } @@ -395,11 +411,3 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } - -static int __init initialize_hashrnd(void) -{ - get_random_bytes(&hashrnd, sizeof(hashrnd)); - return 0; -} - -late_initcall_sync(initialize_hashrnd); -- cgit v1.1 From 7f29405403d7c17f539c099987972b862e7e5255 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 23 Oct 2013 16:02:42 -0700 Subject: net: fix rtnl notification in atomic context commit 991fb3f74c "dev: always advertise rx_flags changes via netlink" introduced rtnl notification from __dev_set_promiscuity(), which can be called in atomic context. Steps to reproduce: ip tuntap add dev tap1 mode tap ifconfig tap1 up tcpdump -nei tap1 & ip tuntap del dev tap1 mode tap [ 271.627994] device tap1 left promiscuous mode [ 271.639897] BUG: sleeping function called from invalid context at mm/slub.c:940 [ 271.664491] in_atomic(): 1, irqs_disabled(): 0, pid: 3394, name: ip [ 271.677525] INFO: lockdep is turned off. [ 271.690503] CPU: 0 PID: 3394 Comm: ip Tainted: G W 3.12.0-rc3+ #73 [ 271.703996] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012 [ 271.731254] ffffffff81a58506 ffff8807f0d57a58 ffffffff817544e5 ffff88082fa0f428 [ 271.760261] ffff8808071f5f40 ffff8807f0d57a88 ffffffff8108bad1 ffffffff81110ff8 [ 271.790683] 0000000000000010 00000000000000d0 00000000000000d0 ffff8807f0d57af8 [ 271.822332] Call Trace: [ 271.838234] [] dump_stack+0x55/0x76 [ 271.854446] [] __might_sleep+0x181/0x240 [ 271.870836] [] ? rcu_irq_exit+0x68/0xb0 [ 271.887076] [] kmem_cache_alloc_node+0x4e/0x2a0 [ 271.903368] [] ? vprintk_emit+0x1dc/0x5a0 [ 271.919716] [] ? __alloc_skb+0x57/0x2a0 [ 271.936088] [] ? 
vprintk_emit+0x1e0/0x5a0 [ 271.952504] [] __alloc_skb+0x57/0x2a0 [ 271.968902] [] rtmsg_ifinfo+0x52/0x100 [ 271.985302] [] __dev_notify_flags+0xad/0xc0 [ 272.001642] [] __dev_set_promiscuity+0x8c/0x1c0 [ 272.017917] [] ? packet_notifier+0x5/0x380 [ 272.033961] [] dev_set_promiscuity+0x29/0x50 [ 272.049855] [] packet_dev_mc+0x87/0xc0 [ 272.065494] [] packet_notifier+0x1b2/0x380 [ 272.080915] [] ? packet_notifier+0x5/0x380 [ 272.096009] [] notifier_call_chain+0x66/0x150 [ 272.110803] [] __raw_notifier_call_chain+0xe/0x10 [ 272.125468] [] raw_notifier_call_chain+0x16/0x20 [ 272.139984] [] call_netdevice_notifiers_info+0x40/0x70 [ 272.154523] [] call_netdevice_notifiers+0x16/0x20 [ 272.168552] [] rollback_registered_many+0x145/0x240 [ 272.182263] [] rollback_registered+0x31/0x40 [ 272.195369] [] unregister_netdevice_queue+0x58/0x90 [ 272.208230] [] __tun_detach+0x140/0x340 [ 272.220686] [] tun_chr_close+0x36/0x60 Signed-off-by: Alexei Starovoitov Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++++-------- net/core/rtnetlink.c | 9 +++++---- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bdffd65..0054c8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1203,7 +1203,7 @@ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifiers(NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1293,7 +1293,7 @@ int dev_open(struct net_device *dev) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1371,7 +1371,7 @@ static int dev_close_many(struct list_head *head) __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); list_del_init(&dev->close_list); } @@ -5258,7 +5258,7 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int changes = dev->flags ^ old_flags; if (gchanges) - rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges); + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); if (changes & IFF_UP) { if (dev->flags & IFF_UP) @@ -5490,7 +5490,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* * Flush the unicast and multicast chains @@ -5889,7 +5889,7 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; @@ -6501,7 +6501,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* * Flush the unicast and multicast chains @@ -6540,7 +6540,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * Prevent userspace races by waiting until the network * device is fully setup 
before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); synchronize_net(); err = 0; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4aedf03..cf67144 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1984,14 +1984,15 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); if (skb == NULL) goto errout; @@ -2002,7 +2003,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change) kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); return; errout: if (err < 0) @@ -2716,7 +2717,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_JOIN: break; default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; } return NOTIFY_DONE; -- cgit v1.1 From c4e819d16c0f46fbdd3706adbd990b3b292d726c Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 28 Oct 2013 14:01:49 +0800 Subject: net, datagram: fix the incorrect comment in zerocopy_sg_from_iovec() Signed-off-by: Zhi Yong Wu Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index af814e7..a16ed7b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -577,7 +577,7 @@ EXPORT_SYMBOL(skb_copy_datagram_from_iovec); /** * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec * @skb: buffer to copy - * @from: io vector to copy to + * @from: io vector to copy from * @offset: offset in the io vector to start copying from * @count: amount of vectors to copy to buffer from * -- cgit v1.1 From ab1a2d7773b23dbbb863fd63fcf83b67cf361e34 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 28 Oct 2013 14:01:50 +0800 Subject: net, iovec: fix the incorrect comment in memcpy_fromiovecend() Signed-off-by: Zhi Yong Wu Signed-off-by: David S. Miller --- net/core/iovec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index b77eeec..4cdb7c4 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -100,7 +100,7 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, EXPORT_SYMBOL(memcpy_toiovecend); /* - * Copy iovec from kernel. Returns -EFAULT on error. + * Copy iovec to kernel. Returns -EFAULT on error. */ int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, -- cgit v1.1 From cdfb97bc010d9e9d994eb68f2cbac3a8ada26104 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Mon, 28 Oct 2013 16:15:50 +0800 Subject: net, mc: fix the incorrect comments in two mc-related functions Signed-off-by: Zhi Yong Wu Signed-off-by: David S. 
Miller --- net/core/dev_addr_lists.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6cda4e2..ec40a84 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -752,7 +752,7 @@ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) EXPORT_SYMBOL(dev_mc_del_global); /** - * dev_mc_sync - Synchronize device's unicast list to another device + * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * @@ -780,7 +780,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) EXPORT_SYMBOL(dev_mc_sync); /** - * dev_mc_sync_multiple - Synchronize device's unicast list to another + * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device -- cgit v1.1 From 2817a336d4d533fb8b68719723cd60ea7dd7c09e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Oct 2013 11:50:51 +0100 Subject: net: skb_checksum: allow custom update/combine for walking skb Currently, skb_checksum walks over 1) linearized, 2) frags[], and 3) frag_list data and calculates the one's complement, a 32 bit result suitable for feeding into itself or csum_tcpudp_magic(), but unsuitable for SCTP as we're calculating CRC32c there. Hence, in order to not re-implement the very same function in SCTP (and maybe other protocols) over and over again, use an update() + combine() callback internally to allow for walking over the skb with different algorithms. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller
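The idea reduces to a walker parameterized by two callbacks; a simplified sketch over a flat segment array rather than the real sk_buff API (hypothetical types, illustrative only):

#include <stddef.h>
#include <stdint.h>

struct seg { const uint8_t *data; size_t len; };

struct checksum_ops {
	uint32_t (*update)(const uint8_t *p, size_t len, uint32_t sum);
	uint32_t (*combine)(uint32_t sum, uint32_t sum2, size_t off, size_t len);
};

static uint32_t walk_checksum(const struct seg *segs, size_t nsegs,
			      uint32_t sum, const struct checksum_ops *ops)
{
	size_t pos = 0;
	size_t i;

	for (i = 0; i < nsegs; i++) {
		/* sum each segment on its own, then fold it into the running
		 * result; the Internet checksum and CRC32c only differ in
		 * what update() and combine() do */
		uint32_t part = ops->update(segs[i].data, segs[i].len, 0);

		sum = ops->combine(sum, part, pos, segs[i].len);
		pos += segs[i].len;
	}
	return sum;
}

The walker never needs to know which algorithm it is running; a caller supplies one checksum_ops for the one's-complement sum and another for CRC32c, which is exactly the design choice the commit makes for SCTP.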
--- net/core/skbuff.c | 31 +++++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0ab32fa..31aab53 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1928,9 +1928,8 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum, const struct skb_checksum_ops *ops) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -1941,7 +1940,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - csum = csum_partial(skb->data + offset, copy, csum); + csum = ops->update(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -1962,10 +1961,10 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); + csum2 = ops->update(vaddr + frag->page_offset + + offset - start, copy, 0); kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + csum = ops->combine(csum, csum2, pos, copy); if (!(len -= copy)) return csum; offset += copy; @@ -1984,9 +1983,9 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, __wsum csum2; if (copy > len) copy = len; - csum2 = skb_checksum(frag_iter, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); + csum2 = __skb_checksum(frag_iter, offset - start, + copy, 0, ops); + csum = ops->combine(csum, csum2, pos, copy); if ((len -= copy) == 0) return csum; offset += copy; @@ -1998,6 +1997,18 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, return csum; } +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + const struct skb_checksum_ops ops = { + .update = csum_partial, + .combine = csum_block_add_ext, + }; + + return __skb_checksum(skb, offset, len, csum, &ops); +} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ -- cgit v1.1 From 74d332c13b2148ae934ea94dac1745ae92efe8e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2013 13:10:44 -0700 Subject: net: extend net_device allocation to vmalloc() Joby Poriyath provided a xen-netback patch to reduce the size of the xenvif structure, as some netdev allocations could fail under memory pressure/fragmentation. This patch handles the problem at the core level, allowing any netdev structure to use vmalloc() if kmalloc() fails. As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT to the kzalloc() flags to do this fallback only when really needed. Signed-off-by: Eric Dumazet Reported-by: Joby Poriyath Cc: Ben Hutchings Signed-off-by: David S.
Miller --- net/core/dev.c | 22 +++++++++++++++++----- net/core/net-sysfs.c | 2 +- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0054c8c..0e61365 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6196,6 +6196,16 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +void netdev_freemem(struct net_device *dev) +{ + char *addr = (char *)dev - dev->padded; + + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -6239,7 +6249,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL); + p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!p) + p = vzalloc(alloc_size); if (!p) return NULL; @@ -6248,7 +6260,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_p; + goto free_dev; if (dev_addr_init(dev)) goto free_pcpu; @@ -6301,8 +6313,8 @@ free_pcpu: kfree(dev->_rx); #endif -free_p: - kfree(p); +free_dev: + netdev_freemem(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); @@ -6339,7 +6351,7 @@ void free_netdev(struct net_device *dev) /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { - kfree((char *)dev - dev->padded); + netdev_freemem(dev); return; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d954b56..d03f2c9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1263,7 +1263,7 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); kfree(dev->ifalias); - kfree((char *)dev - dev->padded); + netdev_freemem(dev); } static const void *net_namespace(struct device *d) -- cgit v1.1 From cea80ea8d2a4c646f240a8fd6ece5c8e7bc969d3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 4 Nov 2013 17:10:25 +0100 Subject: net: checksum: fix warning in skb_checksum This patch fixes a build warning in skb_checksum() by wrapping the csum_partial() usage in skb_checksum(). The problem is that on a few architectures, csum_partial is used with prefix asmlinkage whereas on most architectures it's not. So fix this up generically as we did with csum_block_add_ext() to match the signature. Introduced by 2817a336d4d ("net: skb_checksum: allow custom update/combine for walking skb"). Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 31aab53..e411559 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2003,7 +2003,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { const struct skb_checksum_ops ops = { - .update = csum_partial, + .update = csum_partial_ext, .combine = csum_block_add_ext, }; -- cgit v1.1 From f8e617e100d7369a0108f96abf4414e9fb82ced7 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 1 Nov 2013 14:07:47 +0800 Subject: net: introduce skb_coalesce_rx_frag() Sometimes we need to coalesce the rx frags to avoid frag list. One example is virtio-net driver which tries to use small frags for both MTU sized packet and GSO packet. 
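A rough model of such coalescing, using simplified buffer structures rather than the real skb frag array (illustrative assumptions throughout: the page pointer, offsets and the 17-slot limit are stand-ins):

#include <stdbool.h>
#include <stddef.h>

#define MAX_FRAGS 17

struct frag  { const char *page; size_t off, size; };
struct rxbuf { struct frag frags[MAX_FRAGS]; int nr_frags; size_t len; };

static bool rx_add_data(struct rxbuf *b, const char *page, size_t off, size_t size)
{
	if (b->nr_frags) {
		struct frag *last = &b->frags[b->nr_frags - 1];

		/* new data continues the previous frag within the same page:
		 * grow that frag in place instead of burning another slot */
		if (last->page == page && last->off + last->size == off) {
			last->size += size;
			b->len += size;
			return true;
		}
	}
	if (b->nr_frags == MAX_FRAGS)
		return false;		/* slots exhausted: would need a frag_list */
	b->frags[b->nr_frags++] = (struct frag){ page, off, size };
	b->len += size;
	return true;
}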
So this patch introduces skb_coalesce_rx_frag() to do this. Cc: Rusty Russell Cc: Michael S. Tsirkin Cc: Michael Dalton Cc: Eric Dumazet Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e411559..3735fad 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -476,6 +476,18 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, } EXPORT_SYMBOL(skb_add_rx_frag); +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize) +{ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + skb_frag_size_add(frag, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_coalesce_rx_frag); + static void skb_drop_list(struct sk_buff **listp) { kfree_skb_list(*listp); -- cgit v1.1 From a6cc0cfa72e0b6d9f2c8fd858aacc32313c4f272 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 6 Nov 2013 09:54:46 -0800 Subject: net: Add layer 2 hardware acceleration operations for macvlan devices Add an operations structure that allows a network interface to export the fact that it supports packet forwarding in hardware between physical interfaces and other mac layer devices assigned to it (such as macvlans). This operations structure can be used by virtual mac devices to bypass software switching so that forwarding can be done in hardware more efficiently. Signed-off-by: John Fastabend Signed-off-by: Neil Horman CC: Andy Gospodarek CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/dev.c | 18 +++++++++++++----- net/core/ethtool.c | 1 + 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0e61365..8ffc52e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2538,7 +2538,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb, } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) + struct netdev_queue *txq, void *accel_priv) { const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; @@ -2604,9 +2604,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); skb_len = skb->len; - rc = ops->ndo_start_xmit(skb, dev); + if (accel_priv) + rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv); + else + rc = ops->ndo_start_xmit(skb, dev); + trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) + if (rc == NETDEV_TX_OK && txq) txq_trans_update(txq); return rc; } @@ -2622,7 +2626,10 @@ gso: dev_queue_xmit_nit(nskb, dev); skb_len = nskb->len; - rc = ops->ndo_start_xmit(nskb, dev); + if (accel_priv) + rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv); + else + rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2647,6 +2654,7 @@ out_kfree_skb: out: return rc; } +EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2854,7 +2862,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + rc = dev_hard_start_xmit(skb, dev, txq, NULL); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index
8629898..30071de 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -96,6 +96,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LOOPBACK_BIT] = "loopback", [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", + [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) -- cgit v1.1 From 0c7ddf36c29c3ce12f2d2931a357ccaa0861035a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 7 Nov 2013 14:18:24 +0100 Subject: net: move pskb_put() to core code This function has usage beside IPsec so move it to the core skbuff code. While doing so, give it some documentation and change its return type to 'unsigned char *' to be in line with skb_put(). Signed-off-by: Mathias Krause Cc: Steffen Klassert Cc: "David S. Miller" Cc: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3735fad..2fbea08 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1264,6 +1264,29 @@ free_skb: EXPORT_SYMBOL(skb_pad); /** + * pskb_put - add data to the tail of a potentially fragmented buffer + * @skb: start of the buffer to use + * @tail: tail fragment of the buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the potentially + * fragmented buffer. @tail must be the last fragment of @skb -- or + * @skb itself. If this would exceed the total buffer size the kernel + * will panic. A pointer to the first byte of the extra data is + * returned. + */ + +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + +/** * skb_put - add data to a buffer * @skb: buffer to use * @len: amount of data to add -- cgit v1.1 From bc32383cd6496d595e6a25cdc7cff1da6b694462 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 7 Nov 2013 14:18:26 +0100 Subject: net: skbuff - kernel-doc fixes Use "@" to refer to parameters in the kernel-doc description. According to Documentation/kernel-doc-nano-HOWTO.txt "&" shall be used to refer to structures only. Signed-off-by: Mathias Krause Cc: "David S. Miller" Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2fbea08..8c5197f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1051,8 +1051,8 @@ EXPORT_SYMBOL(__pskb_copy); * @ntail: room to add at tail * @gfp_mask: allocation priority * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * Expands (or creates identical copy, if @nhead and @ntail are zero) + * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have * reference count of 1. Returns zero in the case of success or error, * if expansion failed. In the last case, &sk_buff is not changed. * @@ -2563,14 +2563,14 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * @data: destination pointer for data to be returned * @st: state variable * - * Reads a block of skb data at &consumed relative to the + * Reads a block of skb data at @consumed relative to the * lower offset specified to skb_prepare_seq_read(). 
Assigns - the head of the data block to &data and returns the length + the head of the data block to @data and returns the length * of the block or 0 if the end of the skb data or the upper * offset has been reached. * * The caller is not required to consume all of the data - returned, i.e. &consumed is typically set to the number + returned, i.e. @consumed is typically set to the number * of bytes already consumed and the next call to * skb_seq_read() will return the remaining part of the block. * -- cgit v1.1 From 3797d3e8462efdaadb64164ca540626b55fe8336 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2013 08:37:28 -0800 Subject: net: flow_dissector: small optimizations in IPv4 dissect By moving code around, we avoid: 1) A reload of iph->ihl (bit field, so needs a mask) 2) A conditional test (replaced by a conditional mov on x86) Fast path loads iph->protocol anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0242035..d6ef173 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -68,13 +68,13 @@ ip: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph || iph->ihl < 5) return false; + nhoff += iph->ihl * 4; + ip_proto = iph->protocol; if (ip_is_fragment(iph)) ip_proto = 0; - else - ip_proto = iph->protocol; + iph_to_flow_copy_addrs(flow, iph); - nhoff += iph->ihl * 4; break; } case __constant_htons(ETH_P_IPV6): { -- cgit v1.1 From 13eb2ab2d33c57ebddc57437a7d341995fc9138c Mon Sep 17 00:00:00 2001 From: Andreas Henriksson Date: Thu, 7 Nov 2013 18:26:38 +0100 Subject: net: Fix "ip rule delete table 256" When trying to delete a table >= 256 using iproute2, the local table will be deleted. The table id is specified as a netlink attribute when it needs more than 8 bits, and iproute2 then sets the table field to RT_TABLE_UNSPEC (0). The preconditions for matching the table id in the rule delete code don't seem to take the "table id in netlink attribute" case into account, so the frh_get_table helper function never gets to do its job when matching against the current rule. Use the helper function twice instead of peeking at the table value directly. Originally reported at: http://bugs.debian.org/724783 Reported-by: Nicolas HICHER Signed-off-by: Andreas Henriksson Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2e65413..f409e0b 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -460,7 +460,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (frh->action && (frh->action != rule->action)) continue; - if (frh->table && (frh_get_table(frh, tb) != rule->table)) + if (frh_get_table(frh, tb) && + (frh_get_table(frh, tb) != rule->table)) continue; if (tb[FRA_PRIORITY] && -- cgit v1.1 From 6aafeef03b9d9ecf255f3a80ed85ee070260e1ae Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Nov 2013 17:52:20 +0100 Subject: netfilter: push reasm skb through instead of original frag skbs Pushing original fragments through causes several problems. For example, when matching, frags may not be matched correctly.
Take the following example: On HOSTA do: ip6tables -I INPUT -p icmpv6 -j DROP ip6tables -I INPUT -p icmpv6 -m icmp6 --icmpv6-type 128 -j ACCEPT and on HOSTB you do: ping6 HOSTA -s2000 (MTU is 1500) Incoming echo requests will be filtered out on HOSTA. This issue does not occur with packets smaller than the MTU (where fragmentation does not happen). As was discussed previously, the only correct solution seems to be to use the reassembled skb instead of separate frags. Doing this has positive side effects in reducing sk_buff by one pointer (nfct_reasm) and also the reasm dances in ipvs and conntrack can be removed. The future plan is to remove net/ipv6/netfilter/nf_conntrack_reasm.c entirely and use the code in net/ipv6/reassembly.c instead. Signed-off-by: Jiri Pirko Acked-by: Julian Anastasov Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8c5197f..8cec1e6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -592,9 +592,6 @@ static void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED - nf_conntrack_put_reasm(skb->nfct_reasm); -#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif -- cgit v1.1
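To see why matching the reassembled skb is the only workable option in the ping6 example above, consider a toy model (hypothetical structures, for illustration only; only the first fragment carries the ICMPv6 header):

#include <stdbool.h>

struct pkt {
	bool is_fragment;
	bool has_icmp6_header;	/* true only for the first fragment */
	int  icmp6_type;
};

static bool rule_accept_echo_request(const struct pkt *p)
{
	/* models "-m icmp6 --icmpv6-type 128 -j ACCEPT" */
	return p->has_icmp6_header && p->icmp6_type == 128;
}

int main(void)
{
	struct pkt first = { true,  true,  128 };	/* frag with ICMPv6 header */
	struct pkt tail  = { true,  false, 0   };	/* frag without L4 header  */
	struct pkt reasm = { false, true,  128 };	/* reassembled packet      */

	/* Per-fragment matching: the tail fragment cannot satisfy the
	 * rule and falls through to the DROP rule, breaking the ping.
	 * Matching the reassembled skb sees the full header and works. */
	return (rule_accept_echo_request(&first) &&
		!rule_accept_echo_request(&tail) &&
		rule_accept_echo_request(&reasm)) ? 0 : 1;
}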