From d5a4502e9efa534212484fd691339f6469cf95ff Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 19 May 2008 13:49:52 -0700 Subject: netns: Register net/core/ sysctls at read-only root. Most of the net/core/xxx sysctls are read-only now, but this goal is achieved with excessive memory consumption in each namespace - the whole table is cloned and most of the entries in it are then made read-only anyway (mode &= ~0222). Split it into two parts and register the larger one at the read-only root. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5fc8010..a570e2a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -125,14 +125,6 @@ static struct ctl_table net_core_table[] = { #endif /* CONFIG_XFRM */ #endif /* CONFIG_NET */ { - .ctl_name = NET_CORE_SOMAXCONN, - .procname = "somaxconn", - .data = &init_net.core.sysctl_somaxconn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_CORE_BUDGET, .procname = "netdev_budget", .data = &netdev_budget, @@ -151,6 +143,18 @@ static struct ctl_table net_core_table[] = { { .ctl_name = 0 } }; +static struct ctl_table netns_core_table[] = { + { + .ctl_name = NET_CORE_SOMAXCONN, + .procname = "somaxconn", + .data = &init_net.core.sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + static __net_initdata struct ctl_path net_core_path[] = { { .procname = "net", .ctl_name = CTL_NET, }, { .procname = "core", .ctl_name = NET_CORE, }, @@ -159,23 +163,17 @@ static __net_initdata struct ctl_path net_core_path[] = { static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + struct ctl_table *tbl; net->core.sysctl_somaxconn = SOMAXCONN; - tbl = net_core_table; + tbl = netns_core_table; if (net != &init_net) { - tbl = kmemdup(tbl, sizeof(net_core_table), GFP_KERNEL); + tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) { - if (tmp->data >= (void *)&init_net && - tmp->data < (void *)(&init_net + 1)) - tmp->data += (char *)net - (char *)&init_net; - else - tmp->mode &= ~0222; - } + tbl[0].data = &net->core.sysctl_somaxconn; } net->core.sysctl_hdr = register_net_sysctl_table(net, @@ -186,7 +184,7 @@ static __net_init int sysctl_core_net_init(struct net *net) return 0; err_reg: - if (tbl != net_core_table) + if (tbl != netns_core_table) kfree(tbl); err_dup: return -ENOMEM; @@ -198,7 +196,7 @@ static __net_exit void sysctl_core_net_exit(struct net *net) tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); - BUG_ON(tbl == net_core_table); + BUG_ON(tbl == netns_core_table); kfree(tbl); } @@ -209,6 +207,7 @@ static __net_initdata struct pernet_operations sysctl_core_ops = { static __init int sysctl_core_init(void) { + register_net_sysctl_rotable(net_core_path, net_core_table); return register_pernet_subsys(&sysctl_core_ops); } -- cgit v1.1 From 96e74088f1da4d9a53735a4a57a4f984f86b75c6 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 May 2008 14:12:46 -0700 Subject: net: The dev->get_stats pointer is not NULL nowadays. Neither is the pointer it returns, but sysfs and rtnetlink still check for both cases. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 9 ++++----- net/core/rtnetlink.c | 20 ++++++++------------ 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 90e2177..dccd737 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -242,11 +242,11 @@ static ssize_t netstat_show(const struct device *d, offset % sizeof(unsigned long) != 0); read_lock(&dev_base_lock); - if (dev_isalive(dev) && dev->get_stats && - (stats = (*dev->get_stats)(dev))) + if (dev_isalive(dev)) { + stats = dev->get_stats(dev); ret = sprintf(buf, fmt_ulong, *(unsigned long *)(((u8 *) stats) + offset)); - + } read_unlock(&dev_base_lock); return ret; } @@ -457,8 +457,7 @@ int netdev_register_kobject(struct net_device *net) strlcpy(dev->bus_id, net->name, BUS_ID_SIZE); #ifdef CONFIG_SYSFS - if (net->get_stats) - *groups++ = &netstat_group; + *groups++ = &netstat_group; #ifdef CONFIG_WIRELESS_EXT if (net->wireless_handlers && net->wireless_handlers->get_wireless_stats) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cf857c4..ca32ddb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -606,6 +606,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct net_device_stats *stats; + struct nlattr *attr; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) @@ -652,19 +654,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); } - if (dev->get_stats) { - struct net_device_stats *stats = dev->get_stats(dev); - if (stats) { - struct nlattr *attr; + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (attr == NULL) + goto nla_put_failure; - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) - goto nla_put_failure; - - copy_rtnl_link_stats(nla_data(attr), stats); - } - } + stats = dev->get_stats(dev); + copy_rtnl_link_stats(nla_data(attr), stats); if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) -- cgit v1.1 From 0b040829952d84bf2a62526f0e24b624e0699447 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 10 Jun 2008 22:46:50 -0700 Subject: net: remove CVS keywords This patch removes CVS keywords that weren't updated for a long time from comments. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- net/core/sock.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e556d3..3e18f85 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,8 +4,6 @@ * Authors: Alan Cox * Florian La Roche * - * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ - * * Fixes: * Alan Cox : Fixed the worst of the load * balancer bugs. diff --git a/net/core/sock.c b/net/core/sock.c index 88094cb..3879bf6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,8 +7,6 @@ * handler for protocols to use and generic option handler. * * - * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. 
van Kempen, * Florian La Roche, -- cgit v1.1 From c1da4ac752b8b0411791d26c678fcf23d2eed242 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 13 Jun 2008 18:12:00 -0700 Subject: net/core: add NETDEV_BONDING_FAILOVER event Add NETDEV_BONDING_FAILOVER event to be used in a subsequent patch by bonding to announce fail-over for the active-backup mode through the netdev events notifier chain mechanism. Such an event can be of use for the RDMA CM (communication manager) to let native RDMA ULPs (eg NFS-RDMA, iSER) always be aligned with the IP stack, in the sense that they use the same ports/links as the stack does. It can also be used to make monitoring tools based on netlink events aware of bonding fail-over. Signed-off-by: Or Gerlitz Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 68d8df0..0e45742 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -961,6 +961,12 @@ void netdev_state_change(struct net_device *dev) } } +void netdev_bonding_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev); +} +EXPORT_SYMBOL(netdev_bonding_change); + /** * dev_load - load a network module * @net: the applicable net namespace -- cgit v1.1 From b8a9787eddb0e4665f31dd1d64584732b2b5d051 Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 13 Jun 2008 18:12:04 -0700 Subject: bonding: Allow setting max_bonds to zero Permit bonding to function rationally if max_bonds is set to zero. This will load the module, but create no master devices (which can be created via sysfs). Requires some change to bond_create_sysfs; currently, the netdev sysfs directory is determined from the first bonding device created, but this is no longer possible. Instead, an interface from net/core is created to create and destroy files in net_class. Based on a patch submitted by Phil Oester. Modified by Jay Vosburgh to fix the sysfs issue mentioned above and to update the documentation. Signed-off-by: Phil Oester Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik --- net/core/net-sysfs.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index dccd737..3f79413 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -468,6 +468,19 @@ int netdev_register_kobject(struct net_device *net) return device_add(dev); } +int netdev_class_create_file(struct class_attribute *class_attr) +{ + return class_create_file(&net_class, class_attr); +} + +void netdev_class_remove_file(struct class_attribute *class_attr) +{ + class_remove_file(&net_class, class_attr); +} + +EXPORT_SYMBOL(netdev_class_create_file); +EXPORT_SYMBOL(netdev_class_remove_file); + void netdev_initialize_kobject(struct net_device *net) { struct device *device = &(net->dev); -- cgit v1.1
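Note: a minimal usage sketch of the new class-file interface, in the spirit of how bonding uses it for max_bonds=0. The attribute name, its show handler, and the 2.6.26-era CLASS_ATTR() convention are illustrative assumptions, not taken from the patch itself:

/* Publishes /sys/class/net/example without requiring any net_device
 * to exist - exactly the capability the max_bonds=0 case needs. */
static ssize_t example_show(struct class *cls, char *buf)
{
	return sprintf(buf, "%d\n", 0);		/* placeholder value */
}
static CLASS_ATTR(example, S_IRUGO, example_show, NULL);

static int __init example_init(void)
{
	/* CLASS_ATTR(example, ...) generated class_attr_example */
	return netdev_class_create_file(&class_attr_example);
}

static void __exit example_exit(void)
{
	netdev_class_remove_file(&class_attr_example);
}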
From 972692e0db9b0a62329ca394062b58917ddbd03c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jun 2008 22:41:38 -0700 Subject: net: Add sk_set_socket() helper. In order to more easily grep for all things that set sk->sk_socket, add sk_set_socket() helper inline function. Suggested (although only half-seriously) by Evgeniy Polyakov. Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 3879bf6..2c0ba52 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1066,7 +1066,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) * to be taken into account in all callers. -acme */ sk_refcnt_debug_inc(newsk); - newsk->sk_socket = NULL; + sk_set_socket(newsk, NULL); newsk->sk_sleep = NULL; if (newsk->sk_prot->sockets_allocated) @@ -1702,7 +1702,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; - sk->sk_socket = sock; + sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); -- cgit v1.1 From dad9b335c6940de2746a9788eb456d09cf102f81 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Wed, 18 Jun 2008 01:48:28 -0700 Subject: netdevice: Fix promiscuity and allmulti overflow A maxed-out promiscuity or allmulti counter plus a positive @inc can overflow. For example: when allmulti=0xFFFFFFFF, any caller giving dev_set_allmulti() a positive @inc will turn allmulti off. This is not what we want, though it's a rare case. The fix is that only a negative @inc may turn allmulti or promiscuity off, and when a caller makes the counter touch the roof, we return an error. Changes in v2: Change the void functions dev_set_promiscuity/allmulti to return int, so callers can see the overflow error. Callers will be fixed later. Changes in v3: 1. Since we return the error to the caller, we don't need to print KERN_ERR; KERN_WARNING is enough. 2. In dev_set_promiscuity(), if __dev_set_promiscuity() failed, we return at once. Signed-off-by: Wang Chen Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0e45742..a495f71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2771,16 +2771,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) return 0; } -static void __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; ASSERT_RTNL(); - if ((dev->promiscuity += inc) == 0) - dev->flags &= ~IFF_PROMISC; - else - dev->flags |= IFF_PROMISC; + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + printk(KERN_WARNING "%s: promiscuity touches roof, " + "set promiscuity failed, promiscuity feature " + "of device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } if (dev->flags != old_flags) { printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : @@ -2798,6 +2811,7 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc) if (dev->change_rx_flags) dev->change_rx_flags(dev, IFF_PROMISC); } + return 0; } /** @@ -2809,14 +2823,19 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc) * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. 
+ * Return 0 if successful or a negative errno code on error. */ -void dev_set_promiscuity(struct net_device *dev, int inc) +int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; + int err; - __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc); + if (!err) + return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); + return err; } /** @@ -2829,22 +2848,38 @@ void dev_set_promiscuity(struct net_device *dev, int inc) * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. */ -void dev_set_allmulti(struct net_device *dev, int inc) +int dev_set_allmulti(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; ASSERT_RTNL(); dev->flags |= IFF_ALLMULTI; - if ((dev->allmulti += inc) == 0) - dev->flags &= ~IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + printk(KERN_WARNING "%s: allmulti touches roof, " + "set allmulti failed, allmulti feature of " + "device might be broken.\n", dev->name); + return -EOVERFLOW; + } + } if (dev->flags ^ old_flags) { if (dev->change_rx_flags) dev->change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); } + return 0; } /* -- cgit v1.1 From 0187bdfb05674147774ca79a79942537f3ad54bd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 19 Jun 2008 16:15:47 -0700 Subject: net: Disable LRO on devices that are forwarding Large Receive Offload (LRO) is only appropriate for packets that are destined for the host, and should be disabled if received packets may be forwarded. It can also confuse the GSO on output. Add dev_disable_lro() function which uses the appropriate ethtool ops to disable LRO if enabled. Add calls to dev_disable_lro() in br_add_if() and functions that enable IPv4 and IPv6 forwarding. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a495f71..f6944ec 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -1123,6 +1124,29 @@ int dev_close(struct net_device *dev) } +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + if (dev->ethtool_ops && dev->ethtool_ops->get_flags && + dev->ethtool_ops->set_flags) { + u32 flags = dev->ethtool_ops->get_flags(dev); + if (flags & ETH_FLAG_LRO) { + flags &= ~ETH_FLAG_LRO; + dev->ethtool_ops->set_flags(dev, flags); + } + } + WARN_ON(dev->features & NETIF_F_LRO); +} +EXPORT_SYMBOL(dev_disable_lro); + + static int dev_boot_phase = 1; /* -- cgit v1.1 From 4497b0763cb1afae463f5e144c28b5d806e28b60 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 19 Jun 2008 16:22:28 -0700 Subject: net: Discard and warn about LRO'd skbs received for forwarding Add skb_warn_if_lro() to test whether an skb was received with LRO and warn if so. 
Change br_forward(), ip_forward() and ip6_forward() to call it and discard the skb if it returns true. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3e18f85..2df012b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2583,6 +2583,13 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) return true; } +void __skb_warn_lro_forwarding(const struct sk_buff *skb) +{ + if (net_ratelimit()) + pr_warning("%s: received packets cannot be forwarded" + " while LRO is enabled\n", skb->dev->name); +} + EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(kfree_skb); @@ -2616,6 +2623,7 @@ EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); EXPORT_SYMBOL(skb_append_datato_frags); +EXPORT_SYMBOL(__skb_warn_lro_forwarding); EXPORT_SYMBOL_GPL(skb_to_sgvec); EXPORT_SYMBOL_GPL(skb_cow_data); -- cgit v1.1 From 0853ad66b14feb12acde7ac13b7c3b75770a0adc Mon Sep 17 00:00:00 2001 From: Santwona Behera Date: Wed, 2 Jul 2008 03:47:41 -0700 Subject: netdev: Add support for rx flow hash configuration, using ethtool. Added new interfaces to ethtool to configure receive network flow distribution across multiple rx rings using hashing. Signed-off-by: Santwona Behera Signed-off-by: David S. Miller --- net/core/ethtool.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 0133b5e..14ada53 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -209,6 +209,36 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) return 0; } +static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_rxnfc cmd; + + if (!dev->ethtool_ops->set_rxhash) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + return dev->ethtool_ops->set_rxhash(dev, &cmd); +} + +static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_rxnfc info; + + if (!dev->ethtool_ops->get_rxhash) + return -EOPNOTSUPP; + + if (copy_from_user(&info, useraddr, sizeof(info))) + return -EFAULT; + + dev->ethtool_ops->get_rxhash(dev, &info); + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; @@ -826,6 +856,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GGSO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: + case ETHTOOL_GRXFH: break; default: if (!capable(CAP_NET_ADMIN)) @@ -977,6 +1008,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_value(dev, useraddr, dev->ethtool_ops->set_priv_flags); break; + case ETHTOOL_GRXFH: + rc = ethtool_get_rxhash(dev, useraddr); + break; + case ETHTOOL_SRXFH: + rc = ethtool_set_rxhash(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.1
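Note: a hedged userspace sketch of driving the new ETHTOOL_GRXFH command through the SIOCETHTOOL ioctl. It assumes the era's minimal struct ethtool_rxnfc layout (cmd, flow_type, data) and the TCP_V4_FLOW constant from linux/ethtool.h; the device name is hypothetical:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_rxnfc nfc;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&nfc, 0, sizeof(nfc));
	nfc.cmd = ETHTOOL_GRXFH;	/* read back the RX flow hash config */
	nfc.flow_type = TCP_V4_FLOW;	/* ... for TCP over IPv4 flows */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* hypothetical NIC */
	ifr.ifr_data = (void *)&nfc;

	if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("ETHTOOL_GRXFH");
		return 1;
	}
	/* nfc.data comes back as a bitmask of the hashed header fields */
	printf("TCP/IPv4 hash fields: 0x%llx\n", (unsigned long long)nfc.data);
	return 0;
}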
From ae299fc051aa68ca6ef1807c37bb92d9b6ff817c Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sat, 5 Jul 2008 19:01:28 -0700 Subject: net: add fib_rules_ops to flush_cache method This is required to pass namespace context into rt_cache_flush called from ->flush_cache. Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index e3e9ab0..1c2943a 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -69,7 +69,7 @@ static void rules_ops_put(struct fib_rules_ops *ops) static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) - ops->flush_cache(); + ops->flush_cache(ops); } int fib_rules_register(struct fib_rules_ops *ops) -- cgit v1.1 From 4b5a698ef423eebc37cfacc6d3376d6dffd5bf83 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 6 Jul 2008 15:49:08 -0700 Subject: net: fix dev_set_promiscuity() breakage Commit dad9b335 (netdevice: Fix promiscuity and allmulti overflow) broke dev_set_promiscuity() by returning on success without reprogramming the device. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bfa9a6a..7593393 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2859,7 +2859,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) int err; err = __dev_set_promiscuity(dev, inc); - if (!err) + if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); -- cgit v1.1 From bb949fbd1878973c3539d9aecff52f284482a937 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 16:55:56 -0700 Subject: netdev: Create netdev_queue abstraction. A netdev_queue is an entity managed by a qdisc. Currently there is one RX and one TX queue, and a netdev_queue merely contains a backpointer to the net_device. The Qdisc struct is augmented with a netdev_queue pointer as well. Eventually the 'dev' Qdisc member will go away and we will have the resulting hierarchy: net_device --> netdev_queue --> Qdisc Also, qdisc_alloc() and qdisc_create_dflt() now take a netdev_queue pointer argument. Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7593393..9b281c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4072,6 +4072,12 @@ static struct net_device_stats *internal_stats(struct net_device *dev) return &dev->stats; } +static void netdev_init_queues(struct net_device *dev) +{ + dev->rx_queue.dev = dev; + dev->tx_queue.dev = dev; +} + /** * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -4124,6 +4130,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->egress_subqueue_count = queue_count; dev->gso_max_size = GSO_MAX_SIZE; + netdev_init_queues(dev); + dev->get_stats = internal_stats; netpoll_netdev_init(dev); setup(dev); -- cgit v1.1 From dc2b48475a0a36f8b3bbb2da60d3a006dc5c2c84 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 17:18:23 -0700 Subject: netdev: Move queue_lock into struct netdev_queue. The lock is now an attribute of the device queue. One thing to notice is that "suspicious" places emerge which will need specific training about multiple queue handling. They are so marked with explicit "netdev->rx_queue" and "netdev->tx_queue" references. Signed-off-by: David S. 
Miller --- net/core/dev.c | 33 ++++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9b281c9..0501104 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1667,6 +1667,7 @@ out_kfree_skb: int dev_queue_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; + struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; @@ -1699,14 +1700,15 @@ int dev_queue_xmit(struct sk_buff *skb) } gso: - spin_lock_prefetch(&dev->queue_lock); + txq = &dev->tx_queue; + spin_lock_prefetch(&txq->lock); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); - /* Updates of qdisc are serialized by queue_lock. + /* Updates of qdisc are serialized by queue->lock. * The struct Qdisc which is pointed to by qdisc is now a * rcu structure - it may be accessed without acquiring * a lock (but the structure may be stale.) The freeing of the @@ -1714,7 +1716,7 @@ gso: * more references to it. * * If the qdisc has an enqueue function, we still need to - * hold the queue_lock before calling it, since queue_lock + * hold the queue->lock before calling it, since queue->lock * also serializes access to the device queue. */ @@ -1724,19 +1726,19 @@ gso: #endif if (q->enqueue) { /* Grab device queue */ - spin_lock(&dev->queue_lock); + spin_lock(&txq->lock); q = dev->qdisc; if (q->enqueue) { /* reset queue_mapping to zero */ skb_set_queue_mapping(skb, 0); rc = q->enqueue(skb, q); qdisc_run(dev); - spin_unlock(&dev->queue_lock); + spin_unlock(&txq->lock); rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; goto out; } - spin_unlock(&dev->queue_lock); + spin_unlock(&txq->lock); } /* The device has no queue. Common case for software devices: @@ -1919,14 +1921,17 @@ static void net_tx_action(struct softirq_action *h) while (head) { struct net_device *dev = head; + struct netdev_queue *txq; head = head->next_sched; + txq = &dev->tx_queue; + smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_SCHED, &dev->state); - if (spin_trylock(&dev->queue_lock)) { + if (spin_trylock(&txq->lock)) { qdisc_run(dev); - spin_unlock(&dev->queue_lock); + spin_unlock(&txq->lock); } else { netif_schedule(dev); } } } @@ -3787,7 +3792,6 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); - spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); dev->xmit_lock_owner = -1; @@ -4072,10 +4076,17 @@ static struct net_device_stats *internal_stats(struct net_device *dev) return &dev->stats; } +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue) +{ + spin_lock_init(&queue->lock); + queue->dev = dev; +} + static void netdev_init_queues(struct net_device *dev) { - dev->rx_queue.dev = dev; - dev->tx_queue.dev = dev; + netdev_init_one_queue(dev, &dev->rx_queue); + netdev_init_one_queue(dev, &dev->tx_queue); } /** -- cgit v1.1 From 555353cfa1aee293de445bfa6de43276138ddd82 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 17:33:13 -0700 Subject: netdev: The ingress_lock member is no longer needed. Every qdisc is associated with a queue, and in the case of ingress qdiscs that will now be netdev->rx_queue so using that queue's lock is the thing to do. Signed-off-by: David S. 
Miller --- net/core/dev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0501104..2322fb6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2014,10 +2014,11 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, */ static int ing_filter(struct sk_buff *skb) { - struct Qdisc *q; struct net_device *dev = skb->dev; - int result = TC_ACT_OK; u32 ttl = G_TC_RTTL(skb->tc_verd); + struct netdev_queue *rxq; + int result = TC_ACT_OK; + struct Qdisc *q; if (MAX_RED_LOOP < ttl++) { printk(KERN_WARNING @@ -2029,10 +2030,12 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - spin_lock(&dev->ingress_lock); + rxq = &dev->rx_queue; + + spin_lock(&rxq->lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); - spin_unlock(&dev->ingress_lock); + spin_unlock(&rxq->lock); return result; } @@ -3795,7 +3798,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->_xmit_lock); netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); dev->xmit_lock_owner = -1; - spin_lock_init(&dev->ingress_lock); dev->iflink = -1; -- cgit v1.1 From b0e1e6462df3c5944010b3328a546d8fe5d932cd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 17:42:10 -0700 Subject: netdev: Move rest of qdisc state into struct netdev_queue Now qdisc, qdisc_sleeping, and qdisc_list also live there. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/link_watch.c | 8 ++++++-- net/core/rtnetlink.c | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2322fb6..ce79c28 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1720,14 +1720,14 @@ gso: * also serializes access to the device queue. 
*/ - q = rcu_dereference(dev->qdisc); + q = rcu_dereference(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); #endif if (q->enqueue) { /* Grab device queue */ spin_lock(&txq->lock); - q = dev->qdisc; + q = txq->qdisc; if (q->enqueue) { /* reset queue_mapping to zero */ skb_set_queue_mapping(skb, 0); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index a5e372b..5021821 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -79,8 +79,10 @@ static void rfc2863_policy(struct net_device *dev) static int linkwatch_urgent_event(struct net_device *dev) { + struct netdev_queue *txq = &dev->tx_queue; + return netif_running(dev) && netif_carrier_ok(dev) && - dev->qdisc != dev->qdisc_sleeping; + txq->qdisc != txq->qdisc_sleeping; } @@ -181,7 +183,9 @@ static void __linkwatch_run_queue(int urgent_only) rfc2863_policy(dev); if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) { - WARN_ON(dev->qdisc_sleeping == &noop_qdisc); + struct netdev_queue *txq = &dev->tx_queue; + + WARN_ON(txq->qdisc_sleeping == &noop_qdisc); dev_activate(dev); } else dev_deactivate(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6c8d7f0..8ef9f1d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -605,6 +605,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags) { + struct netdev_queue *txq; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct net_device_stats *stats; @@ -635,8 +636,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (dev->master) NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); - if (dev->qdisc_sleeping) - NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc_sleeping->ops->id); + txq = &dev->tx_queue; + if (txq->qdisc_sleeping) + NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id); if (1) { struct rtnl_link_ifmap map = { -- cgit v1.1 From 816f3258e70db38d6d92c8d871377179fd69160f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 22:49:00 -0700 Subject: netdev: Kill qdisc_ingress, use netdev->rx_queue.qdisc instead. Now that our qdisc management is bi-directional, per-queue, and fully orthogonal, there is no reason to have a special ingress qdisc pointer in struct net_device. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ce79c28..ab760a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2033,7 +2033,7 @@ static int ing_filter(struct sk_buff *skb) rxq = &dev->rx_queue; spin_lock(&rxq->lock); - if ((q = dev->qdisc_ingress) != NULL) + if ((q = rxq->qdisc) != NULL) result = q->enqueue(skb, q); spin_unlock(&rxq->lock); @@ -2044,7 +2044,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (!skb->dev->qdisc_ingress) + if (!skb->dev->rx_queue.qdisc) goto out; if (*pt_prev) { -- cgit v1.1 From ee609cb36220d18c0cf476b066a5ab7e6f6d3a69 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 22:58:37 -0700 Subject: netdev: Move next_sched into struct netdev_queue. We schedule queues, not the device, for output queue processing in BH. Signed-off-by: David S. 
Miller --- net/core/dev.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ab760a9..d6b8d3c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1323,13 +1323,14 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) void __netif_schedule(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { + struct netdev_queue *txq = &dev->tx_queue; unsigned long flags; struct softnet_data *sd; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - dev->next_sched = sd->output_queue; - sd->output_queue = dev; + txq->next_sched = sd->output_queue; + sd->output_queue = txq; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -1912,7 +1913,7 @@ static void net_tx_action(struct softirq_action *h) } if (sd->output_queue) { - struct net_device *head; + struct netdev_queue *head; local_irq_disable(); head = sd->output_queue; @@ -1920,12 +1921,10 @@ static void net_tx_action(struct softirq_action *h) local_irq_enable(); while (head) { - struct net_device *dev = head; - struct netdev_queue *txq; + struct netdev_queue *txq = head; + struct net_device *dev = txq->dev; head = head->next_sched; - txq = &dev->tx_queue; - smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_SCHED, &dev->state); @@ -4346,7 +4345,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, void *ocpu) { struct sk_buff **list_skb; - struct net_device **list_net; + struct netdev_queue **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; -- cgit v1.1 From 6fa9864b53f0680e432a2c431c2cf2055daa3a88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:01:06 -0700 Subject: net: Clean up explicit ->tx_queue references in link watch. First, we add a qdisc_tx_changing() helper which returns true if the qdisc attachment is in transition. Second, we remove an assertion warning which is of limited value and is hard to express precisely in a multiqueue environment. Signed-off-by: David S. Miller --- net/core/link_watch.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 5021821..bf8f7af 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -77,12 +77,10 @@ static void rfc2863_policy(struct net_device *dev) } -static int linkwatch_urgent_event(struct net_device *dev) +static bool linkwatch_urgent_event(struct net_device *dev) { - struct netdev_queue *txq = &dev->tx_queue; - return netif_running(dev) && netif_carrier_ok(dev) && - txq->qdisc != txq->qdisc_sleeping; + qdisc_tx_changing(dev); } @@ -182,12 +180,9 @@ static void __linkwatch_run_queue(int urgent_only) rfc2863_policy(dev); if (dev->flags & IFF_UP) { - if (netif_carrier_ok(dev)) { - struct netdev_queue *txq = &dev->tx_queue; - - WARN_ON(txq->qdisc_sleeping == &noop_qdisc); + if (netif_carrier_ok(dev)) dev_activate(dev); - } else + else dev_deactivate(dev); netdev_state_change(dev); @@ -218,7 +213,7 @@ static void linkwatch_event(struct work_struct *dummy) void linkwatch_fire_event(struct net_device *dev) { - int urgent = linkwatch_urgent_event(dev); + bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { dev_hold(dev); -- cgit v1.1
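Note: the qdisc_tx_changing() helper itself lands in include/net/sch_generic.h, outside this net/core-only view. Based on the condition it replaces in linkwatch_urgent_event(), a plausible single-queue-era sketch is:

static inline bool qdisc_tx_changing(struct net_device *dev)
{
	struct netdev_queue *txq = &dev->tx_queue;

	/* true while dev_activate()/dev_deactivate() are swapping
	 * the active qdisc with the dormant (sleeping) one */
	return txq->qdisc != txq->qdisc_sleeping;
}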
From 86d804e10a37cd86f16bf72386c37e843a98a74b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:11:25 -0700 Subject: netdev: Make netif_schedule() routines work with netdev_queue objects. Only plain netif_schedule() remains taking a net_device, mostly as a compatibility item while we transition the rest of these interfaces. Everything else calls netif_schedule_queue() or __netif_schedule(), both of which take a netdev_queue pointer. Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d6b8d3c..0dc888a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1320,12 +1320,13 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) } -void __netif_schedule(struct net_device *dev) +void __netif_schedule(struct netdev_queue *txq) { + struct net_device *dev = txq->dev; + if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { - struct netdev_queue *txq = &dev->tx_queue; - unsigned long flags; struct softnet_data *sd; + unsigned long flags; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); @@ -1932,7 +1933,7 @@ static void net_tx_action(struct softirq_action *h) qdisc_run(dev); spin_unlock(&txq->lock); } else { - netif_schedule(dev); + netif_schedule_queue(txq); } } } -- cgit v1.1 From eb6aafe3f843cb0e939546c03540a3b4911b6964 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:12:38 -0700 Subject: pkt_sched: Make qdisc_run take a netdev_queue. This allows us to use this calling convention all the way down into qdisc_restart(). Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0dc888a..0218b0b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1734,7 +1734,7 @@ gso: /* reset queue_mapping to zero */ skb_set_queue_mapping(skb, 0); rc = q->enqueue(skb, q); - qdisc_run(dev); + qdisc_run(txq); spin_unlock(&txq->lock); rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; @@ -1930,7 +1930,7 @@ static void net_tx_action(struct softirq_action *h) clear_bit(__LINK_STATE_SCHED, &dev->state); if (spin_trylock(&txq->lock)) { - qdisc_run(dev); + qdisc_run(txq); spin_unlock(&txq->lock); } else { netif_schedule_queue(txq); } -- cgit v1.1
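Note: the netif_schedule_queue()/netif_schedule() pair mentioned above lives in netdevice.h rather than in these hunks. A hedged sketch of what the shims plausibly reduce to in this era, assuming the __LINK_STATE_XOFF state bit still gates scheduling:

static inline void netif_schedule_queue(struct netdev_queue *txq)
{
	struct net_device *dev = txq->dev;

	/* only schedule softirq processing if TX isn't stopped */
	if (!test_bit(__LINK_STATE_XOFF, &dev->state))
		__netif_schedule(txq);
}

/* legacy entry point, kept only while callers are converted */
static inline void netif_schedule(struct net_device *dev)
{
	netif_schedule_queue(&dev->tx_queue);
}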
From c773e847ea8f6812804e40f52399c6921a00eab1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Jul 2008 23:13:53 -0700 Subject: netdev: Move _xmit_lock and xmit_lock_owner into netdev_queue. Accesses are mostly structured such that when there are multiple TX queues the code transformations will be a little bit simpler. Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0218b0b..a29a359 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -258,7 +258,7 @@ DEFINE_PER_CPU(struct softnet_data, softnet_data); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* - * register_netdevice() inits dev->_xmit_lock and sets lockdep class + * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = @@ -1758,19 +1758,19 @@ gso: if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (dev->xmit_lock_owner != cpu) { + if (txq->xmit_lock_owner != cpu) { - HARD_TX_LOCK(dev, cpu); + HARD_TX_LOCK(dev, txq, cpu); if (!netif_queue_stopped(dev) && !netif_subqueue_stopped(dev, skb)) { rc = 0; if (!dev_hard_start_xmit(skb, dev)) { - HARD_TX_UNLOCK(dev); + HARD_TX_UNLOCK(dev, txq); goto out; } } - HARD_TX_UNLOCK(dev); + HARD_TX_UNLOCK(dev, txq); if (net_ratelimit()) printk(KERN_CRIT "Virtual device %s asks to " "queue packet!\n", dev->name); @@ -3761,6 +3761,20 @@ static void rollback_registered(struct net_device *dev) dev_put(dev); } +static void __netdev_init_queue_locks_one(struct netdev_queue *dev_queue, + struct net_device *dev) +{ + spin_lock_init(&dev_queue->_xmit_lock); + netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type); + dev_queue->xmit_lock_owner = -1; +} + +static void netdev_init_queue_locks(struct net_device *dev) +{ + __netdev_init_queue_locks_one(&dev->tx_queue, dev); + __netdev_init_queue_locks_one(&dev->rx_queue, dev); +} + /** * register_netdevice - register a network device * @dev: device to register @@ -3795,9 +3809,7 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); - spin_lock_init(&dev->_xmit_lock); - netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); - dev->xmit_lock_owner = -1; + netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.1
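Note: HARD_TX_LOCK/HARD_TX_UNLOCK are private macros of net/core/dev.c that the hunk above only uses; after this patch they plausibly reduce to the per-queue form sketched below (drivers with NETIF_F_LLTX take their own TX lock, so the macros skip _xmit_lock for them):

#define HARD_TX_LOCK(dev, txq, cpu) {			\
	if ((dev->features & NETIF_F_LLTX) == 0) {	\
		spin_lock(&(txq)->_xmit_lock);		\
		(txq)->xmit_lock_owner = (cpu);		\
	}						\
}

#define HARD_TX_UNLOCK(dev, txq) {			\
	if ((dev->features & NETIF_F_LLTX) == 0) {	\
		(txq)->xmit_lock_owner = -1;		\
		spin_unlock(&(txq)->_xmit_lock);	\
	}						\
}

Recording xmit_lock_owner is what lets the 'txq->xmit_lock_owner != cpu' test in dev_queue_xmit() detect a recursive transmit on the same CPU instead of deadlocking on the queue's _xmit_lock.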
From 22bb1be4d271961846cd0889b0f8d671db773080 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Jul 2008 11:16:47 +0200 Subject: wext: make sysfs bits optional and deprecate them The /sys/class/net/*/wireless/ directory is, as far as I know, not used by anyone. Additionally, the same data is available via wext ioctls. Hence the sysfs files are pretty much useless. This patch makes them optional and schedules them for removal. Signed-off-by: Johannes Berg Cc: Jean Tourrilhes Signed-off-by: John W. Linville --- net/core/net-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 3f79413..c1f4e0d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -318,7 +318,7 @@ static struct attribute_group netstat_group = { .attrs = netstat_attrs, }; -#ifdef CONFIG_WIRELESS_EXT +#ifdef CONFIG_WIRELESS_EXT_SYSFS /* helper function that does all the locking etc for wireless stats */ static ssize_t wireless_show(struct device *d, char *buf, ssize_t (*format)(const struct iw_statistics *, @@ -459,7 +459,7 @@ int netdev_register_kobject(struct net_device *net) #ifdef CONFIG_SYSFS *groups++ = &netstat_group; -#ifdef CONFIG_WIRELESS_EXT +#ifdef CONFIG_WIRELESS_EXT_SYSFS if (net->wireless_handlers && net->wireless_handlers->get_wireless_stats) *groups++ = &wireless_group; #endif -- cgit v1.1 From 6aa895b047720f71ec4eb11452f7c3ce8426941f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 14 Jul 2008 22:49:06 -0700 Subject: vlan: Don't store VLAN tag in cb Use a real skb member to store the VLAN tag to avoid clashes with qdiscs, which are allowed to use the cb area themselves. As currently only real devices that consume the skb set the NETIF_F_HW_VLAN_TX flag, no explicit invalidation is necessary. The new member fills a hole on 64 bit, the skb layout changes from: __u32 mark; /* 172 4 */ sk_buff_data_t transport_header; /* 176 4 */ sk_buff_data_t network_header; /* 180 4 */ sk_buff_data_t mac_header; /* 184 4 */ sk_buff_data_t tail; /* 188 4 */ /* --- cacheline 3 boundary (192 bytes) --- */ sk_buff_data_t end; /* 192 4 */ /* XXX 4 bytes hole, try to pack */ to __u32 mark; /* 172 4 */ __u16 vlan_tci; /* 176 2 */ /* XXX 2 bytes hole, try to pack */ sk_buff_data_t transport_header; /* 180 4 */ sk_buff_data_t network_header; /* 184 4 */ Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7c57156..50a853f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -459,6 +459,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->tc_verd = old->tc_verd; #endif #endif + new->vlan_tci = old->vlan_tci; + skb_copy_secmark(new, old); } @@ -2286,6 +2288,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) skb_copy_queue_mapping(nskb, skb); nskb->priority = skb->priority; nskb->protocol = skb->protocol; + nskb->vlan_tci = skb->vlan_tci; nskb->dst = dst_clone(skb->dst); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); nskb->pkt_type = skb->pkt_type; -- cgit v1.1 From bc1d0411b804ad190cdadabac48a10067f17b9e6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 14 Jul 2008 22:49:30 -0700 Subject: vlan: deliver packets received with VLAN acceleration to network taps When VLAN header stripping is used, packets currently bypass packet sockets (and other network taps) completely. For locally existing VLANs, they appear directly on the VLAN device, for unknown VLANs they are silently dropped. Add a new function netif_nit_deliver() to deliver incoming packets to all network interface taps and use it in __vlan_hwaccel_rx() to make VLAN packets visible on the underlying device. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/core/dev.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a29a359..feaab489 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2068,6 +2068,33 @@ out: } #endif +/* + * netif_nit_deliver - deliver received packets to network taps + * @skb: buffer + * + * This function is used to deliver incoming packets to network + * taps. It should be used when the normal netif_receive_skb path + * is bypassed, for example because of VLAN acceleration. + */ +void netif_nit_deliver(struct sk_buff *skb) +{ + struct packet_type *ptype; + + if (list_empty(&ptype_all)) + return; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) + deliver_skb(skb, ptype, skb->dev); + } + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process -- cgit v1.1 From f1f28aa3510ddb84c966bac65611bb866c77a092 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 00:08:33 -0700 Subject: netdev: Add addr_list_lock to struct net_device. This will be used to protect the per-device unicast and multicast address lists, as well as the callbacks into the drivers which configure such state such as ->set_rx_mode() and ->set_multicast_list(). Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index feaab489..d933d1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3836,6 +3836,7 @@ int register_netdevice(struct net_device *dev) BUG_ON(!dev_net(dev)); net = dev_net(dev); + spin_lock_init(&dev->addr_list_lock); netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.1 From e308a5d806c852f56590ffdd3834d0df0cbed8d7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 00:13:44 -0700 Subject: netdev: Add netdev->addr_list_lock protection. Add netif_addr_{lock,unlock}{,_bh}() helpers. Use them to protect operations that operate on or read the network device unicast and multicast address lists. Also use them in cases where the code simply wants to block calls into the driver's ->set_rx_mode() and ->set_multicast_list() methods. Signed-off-by: David S. 
Miller --- net/core/dev.c | 14 ++++++++++++++ net/core/dev_mcast.c | 12 ++++++++++++ 2 files changed, 26 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d933d1b..ef1502d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2982,7 +2982,9 @@ void __dev_set_rx_mode(struct net_device *dev) void dev_set_rx_mode(struct net_device *dev) { netif_tx_lock_bh(dev); + netif_addr_lock(dev); __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); } @@ -3062,9 +3064,11 @@ int dev_unicast_delete(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); netif_tx_lock_bh(dev); + netif_addr_lock(dev); err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return err; } @@ -3088,9 +3092,11 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); netif_tx_lock_bh(dev); + netif_addr_lock(dev); err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return err; } @@ -3159,10 +3165,12 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) int err = 0; netif_tx_lock_bh(to); + netif_addr_lock(to); err = __dev_addr_sync(&to->uc_list, &to->uc_count, &from->uc_list, &from->uc_count); if (!err) __dev_set_rx_mode(to); + netif_addr_unlock(to); netif_tx_unlock_bh(to); return err; } @@ -3180,13 +3188,17 @@ EXPORT_SYMBOL(dev_unicast_sync); void dev_unicast_unsync(struct net_device *to, struct net_device *from) { netif_tx_lock_bh(from); + netif_addr_lock(from); netif_tx_lock_bh(to); + netif_addr_lock(to); __dev_addr_unsync(&to->uc_list, &to->uc_count, &from->uc_list, &from->uc_count); __dev_set_rx_mode(to); + netif_addr_unlock(to); netif_tx_unlock_bh(to); + netif_addr_unlock(from); netif_tx_unlock_bh(from); } EXPORT_SYMBOL(dev_unicast_unsync); @@ -3208,6 +3220,7 @@ static void __dev_addr_discard(struct dev_addr_list **list) static void dev_addr_discard(struct net_device *dev) { netif_tx_lock_bh(dev); + netif_addr_lock(dev); __dev_addr_discard(&dev->uc_list); dev->uc_count = 0; @@ -3215,6 +3228,7 @@ static void dev_addr_discard(struct net_device *dev) __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); } diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index f8a3455..b6b2a12 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -73,6 +73,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) int err; netif_tx_lock_bh(dev); + netif_addr_lock(dev); err = __dev_addr_delete(&dev->mc_list, &dev->mc_count, addr, alen, glbl); if (!err) { @@ -83,6 +84,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) __dev_set_rx_mode(dev); } + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return err; } @@ -96,9 +98,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) int err; netif_tx_lock_bh(dev); + netif_addr_lock(dev); err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); if (!err) __dev_set_rx_mode(dev); + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return err; } @@ -120,10 +124,12 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) int err = 0; netif_tx_lock_bh(to); + netif_addr_lock(to); err = __dev_addr_sync(&to->mc_list, &to->mc_count, &from->mc_list, &from->mc_count); if (!err) __dev_set_rx_mode(to); + netif_addr_unlock(to); netif_tx_unlock_bh(to); return 
err; @@ -144,13 +150,17 @@ EXPORT_SYMBOL(dev_mc_sync); void dev_mc_unsync(struct net_device *to, struct net_device *from) { netif_tx_lock_bh(from); + netif_addr_lock(from); netif_tx_lock_bh(to); + netif_addr_lock(to); __dev_addr_unsync(&to->mc_list, &to->mc_count, &from->mc_list, &from->mc_count); __dev_set_rx_mode(to); + netif_addr_unlock(to); netif_tx_unlock_bh(to); + netif_addr_unlock(from); netif_tx_unlock_bh(from); } EXPORT_SYMBOL(dev_mc_unsync); @@ -165,6 +175,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) return 0; netif_tx_lock_bh(dev); + netif_addr_lock(dev); for (m = dev->mc_list; m; m = m->next) { int i; @@ -176,6 +187,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) seq_putc(seq, '\n'); } + netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return 0; } -- cgit v1.1 From b9e40857682ecfc5bcd0356a23ff409883ffb982 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 00:15:08 -0700 Subject: netdev: Do not use TX lock to protect address lists. Now that we have a specific lock to protect the network device unicast and multicast lists, remove extraneous grabs of the TX lock in cases where the code only needs address list protection. Signed-off-by: David S. Miller --- net/core/dev.c | 38 ++++++++++++-------------------------- net/core/dev_mcast.c | 32 ++++++++++---------------------- 2 files changed, 22 insertions(+), 48 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ef1502d..9b49f74 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2981,11 +2981,9 @@ void __dev_set_rx_mode(struct net_device *dev) void dev_set_rx_mode(struct net_device *dev) { - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); } int __dev_addr_delete(struct dev_addr_list **list, int *count, @@ -3063,13 +3061,11 @@ int dev_unicast_delete(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_delete); @@ -3091,13 +3087,11 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen) ASSERT_RTNL(); - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_add); @@ -3164,14 +3158,12 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) { int err = 0; - netif_tx_lock_bh(to); - netif_addr_lock(to); + netif_addr_lock_bh(to); err = __dev_addr_sync(&to->uc_list, &to->uc_count, &from->uc_list, &from->uc_count); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock(to); - netif_tx_unlock_bh(to); + netif_addr_unlock_bh(to); return err; } EXPORT_SYMBOL(dev_unicast_sync); @@ -3187,9 +3179,7 @@ EXPORT_SYMBOL(dev_unicast_sync); */ void dev_unicast_unsync(struct net_device *to, struct net_device *from) { - netif_tx_lock_bh(from); - netif_addr_lock(from); - netif_tx_lock_bh(to); + netif_addr_lock_bh(from); netif_addr_lock(to); __dev_addr_unsync(&to->uc_list, &to->uc_count, @@ -3197,9 +3187,7 @@ void dev_unicast_unsync(struct net_device *to, struct 
net_device *from) { - netif_tx_lock_bh(from); - netif_addr_lock(from); - netif_tx_lock_bh(to); + netif_addr_lock_bh(from); netif_addr_lock(to); __dev_addr_unsync(&to->uc_list, &to->uc_count, @@ -3197,9 +3187,7 @@ void dev_unicast_unsync(struct net_device *to, struct net_device *from) __dev_set_rx_mode(to); netif_addr_unlock(to); - netif_tx_unlock_bh(to); - netif_addr_unlock(from); - netif_tx_unlock_bh(from); + netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_unicast_unsync); @@ -3219,8 +3207,7 @@ static void __dev_addr_discard(struct dev_addr_list **list) static void dev_addr_discard(struct net_device *dev) { - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); __dev_addr_discard(&dev->uc_list); dev->uc_count = 0; @@ -3228,8 +3215,7 @@ static void dev_addr_discard(struct net_device *dev) __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); } unsigned dev_get_flags(const struct net_device *dev) diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index b6b2a12..5402b3b 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -72,8 +72,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) { int err; - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); err = __dev_addr_delete(&dev->mc_list, &dev->mc_count, addr, alen, glbl); if (!err) { @@ -84,8 +83,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) __dev_set_rx_mode(dev); } - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return err; } @@ -97,13 +95,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) { int err; - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return err; } @@ -123,14 +119,12 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) { int err = 0; - netif_tx_lock_bh(to); - netif_addr_lock(to); + netif_addr_lock_bh(to); err = __dev_addr_sync(&to->mc_list, &to->mc_count, &from->mc_list, &from->mc_count); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock(to); - netif_tx_unlock_bh(to); + netif_addr_unlock_bh(to); return err; } @@ -149,9 +143,7 @@ EXPORT_SYMBOL(dev_mc_sync); */ void dev_mc_unsync(struct net_device *to, struct net_device *from) { - netif_tx_lock_bh(from); - netif_addr_lock(from); - netif_tx_lock_bh(to); + netif_addr_lock_bh(from); netif_addr_lock(to); __dev_addr_unsync(&to->mc_list, &to->mc_count, @@ -159,9 +151,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) __dev_set_rx_mode(to); netif_addr_unlock(to); - netif_tx_unlock_bh(to); - netif_addr_unlock(from); - netif_tx_unlock_bh(from); + netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_mc_unsync); @@ -174,8 +164,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) return 0; - netif_tx_lock_bh(dev); - netif_addr_lock(dev); + netif_addr_lock_bh(dev); for (m = dev->mc_list; m; m = m->next) { int i; @@ -187,8 +176,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) seq_putc(seq, '\n'); } - netif_addr_unlock(dev); - netif_tx_unlock_bh(dev); + netif_addr_unlock_bh(dev); return 0; } -- cgit v1.1 From 2870c43d1795bcb40b12bad6456f07ad8e64b3de Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Tue, 15 Jul 2008 00:49:11 -0700 Subject: net: refactor tcp splice receive path to improve readability - move all of the details on offsets, lengths and buffers into a single function instead of doing these operations from multiple places - use a bottom-up approach: try to avoid details in the high level 
functions, introduce them gradually as we go deeper in the function call stack With helpful feedback from Jarek Poplawski. Signed-off-by: Octavian Purdila Acked-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/skbuff.c | 153 ++++++++++++++++++++++-------------------------------- 1 file changed, 61 insertions(+), 92 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 50a853f..48a4372 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1282,114 +1282,83 @@ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, return 0; } -/* - * Map linear and fragment data from the skb to spd. Returns number of - * pages mapped. - */ -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *total_len, - struct splice_pipe_desc *spd) -{ - unsigned int nr_pages = spd->nr_pages; - unsigned int poff, plen, len, toff, tlen; - int headlen, seg, error = 0; - - toff = *offset; - tlen = *total_len; - if (!tlen) { - error = 1; - goto err; +static inline void __segment_seek(struct page **page, unsigned int *poff, + unsigned int *plen, unsigned int off) +{ + *poff += off; + *page += *poff / PAGE_SIZE; + *poff = *poff % PAGE_SIZE; + *plen -= off; +} + +static inline int __splice_segment(struct page *page, unsigned int poff, + unsigned int plen, unsigned int *off, + unsigned int *len, struct sk_buff *skb, + struct splice_pipe_desc *spd) +{ + if (!*len) + return 1; + + /* skip this segment if already processed */ + if (*off >= plen) { + *off -= plen; + return 0; } - /* - * if the offset is greater than the linear part, go directly to - * the fragments. - */ - headlen = skb_headlen(skb); - if (toff >= headlen) { - toff -= headlen; - goto map_frag; + /* ignore any bits we already processed */ + if (*off) { + __segment_seek(&page, &poff, &plen, *off); + *off = 0; } - /* - * first map the linear region into the pages/partial map, skipping - * any potential initial offset. - */ - len = 0; - while (len < headlen) { - void *p = skb->data + len; - - poff = (unsigned long) p & (PAGE_SIZE - 1); - plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff); - len += plen; - - if (toff) { - if (plen <= toff) { - toff -= plen; - continue; - } - plen -= toff; - poff += toff; - toff = 0; - } + do { + unsigned int flen = min(*len, plen); - plen = min(plen, tlen); - if (!plen) - break; + /* the linear region may spread across several pages */ + flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - /* - * just jump directly to update and return, no point - * in going over fragments when the output is full. - */ - error = spd_fill_page(spd, virt_to_page(p), plen, poff, skb); - if (error) - goto done; + if (spd_fill_page(spd, page, flen, poff, skb)) + return 1; - tlen -= plen; - } + __segment_seek(&page, &poff, &plen, flen); + *len -= flen; + + } while (*len && plen); + + return 0; +} + +/* + * Map linear and fragment data from the skb to spd. It reports failure if the + * pipe is full or if we already spliced the requested length. 
+ */ +int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, + unsigned int *len, + struct splice_pipe_desc *spd) +{ + int seg; + + /* + * map the linear part + */ + if (__splice_segment(virt_to_page(skb->data), + (unsigned long) skb->data & (PAGE_SIZE - 1), + skb_headlen(skb), + offset, len, skb, spd)) + return 1; /* * then map the fragments */ -map_frag: for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; - plen = f->size; - poff = f->page_offset; - - if (toff) { - if (plen <= toff) { - toff -= plen; - continue; - } - plen -= toff; - poff += toff; - toff = 0; - } - - plen = min(plen, tlen); - if (!plen) - break; - - error = spd_fill_page(spd, f->page, plen, poff, skb); - if (error) - break; - - tlen -= plen; + if (__splice_segment(f->page, f->page_offset, f->size, + offset, len, skb, spd)) + return 1; } -done: - if (spd->nr_pages - nr_pages) { - *offset = 0; - *total_len = tlen; - return 0; - } -err: - /* update the offset to reflect the linear part skip, if any */ - if (!error) - *offset = toff; - return error; + return 0; } /* -- cgit v1.1 From 7b1c65faa27f5ade3915e4bbc9186b6e64d2d6ec Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 16 Jul 2008 20:12:30 -0700 Subject: net: make __skb_splice_bits static net/core/skbuff.c:1335:5: warning: symbol '__skb_splice_bits' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 48a4372..e411567 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1332,7 +1332,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, * Map linear and fragment data from the skb to spd. It reports failure if the * pipe is full or if we already spliced the requested length. */ -int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, +static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd) { -- cgit v1.1 From 5c52ba170f8167511bdb65b981f4582100c40675 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 16 Jul 2008 20:28:10 -0700 Subject: sock: add net to prot->enter_memory_pressure callback The tcp_enter_memory_pressure callback calls NET_INC_STATS, but has no way to get at the net. I decided to add an sk argument, not the net itself, only to factor all the required sock_net(sk) calls inside the enter_memory_pressure callback itself. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 2c0ba52..10a64d5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1442,7 +1442,7 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) /* Under pressure. */ if (allocated > prot->sysctl_mem[1]) if (prot->enter_memory_pressure) - prot->enter_memory_pressure(); + prot->enter_memory_pressure(sk); /* Over hard limit.
*/ if (allocated > prot->sysctl_mem[2]) -- cgit v1.1 From 9a6d276e85aa3d8f308fc5e8de6892daeb60ae5f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 16 Jul 2008 20:50:49 -0700 Subject: core: add stat to track unresolved discards in neighbor cache In __neigh_event_send, if we have a neighbour entry which is in NUD_INCOMPLETE state, we enqueue any outbound frames to that neighbour on the neighbour's arp_queue, which is capped by default at a length of 3 skbs. If that queue exceeds its set length, it will drop an skb already on the queue to enqueue the newly arrived skb. This results in a drop for which no statistic is incremented. This patch adds an unresolved_discards stat to /proc/net/stat/ndisc_cache to track these lost frames. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/neighbour.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 65f01f7..f62c8af 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -930,6 +930,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) buff = neigh->arp_queue.next; __skb_unlink(buff, &neigh->arp_queue); kfree_skb(buff); + NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } __skb_queue_tail(&neigh->arp_queue, skb); } @@ -2462,12 +2463,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs\n"); + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx\n", + "%08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2483,7 +2484,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->rcv_probes_ucast, st->periodic_gc_runs, - st->forced_gc_runs + st->forced_gc_runs, + st->unres_discards ); return 0; -- cgit v1.1 From e8a0464cc950972824e2e128028ae3db666ec1ed Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 00:34:19 -0700 Subject: netdev: Allocate multiple queues for TX. alloc_netdev_mq() now allocates an array of netdev_queue structures for TX, based upon the queue_count argument. Furthermore, all accesses to the TX queues are now vectored through the netdev_get_tx_queue() and netdev_for_each_tx_queue() interfaces. This makes it easy to grep the tree for all things that want to get to a TX queue of a net device. Problem spots which are not really multiqueue aware yet, and only work with one queue, can easily be spotted by grepping for all netdev_get_tx_queue() calls that pass in a zero index. Signed-off-by: David S.
Miller --- net/core/dev.c | 40 +++++++++++++++++++++++++++++++--------- net/core/rtnetlink.c | 2 +- 2 files changed, 32 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9b49f74..69378f2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1666,6 +1666,12 @@ out_kfree_skb: * --BLG */ +static struct netdev_queue *dev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + return netdev_get_tx_queue(dev, 0); +} + int dev_queue_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -1702,7 +1708,7 @@ int dev_queue_xmit(struct sk_buff *skb) } gso: - txq = &dev->tx_queue; + txq = dev_pick_tx(dev, skb); spin_lock_prefetch(&txq->lock); /* Disable soft irqs for various locks below. Also @@ -3788,8 +3794,9 @@ static void rollback_registered(struct net_device *dev) dev_put(dev); } -static void __netdev_init_queue_locks_one(struct netdev_queue *dev_queue, - struct net_device *dev) +static void __netdev_init_queue_locks_one(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) { spin_lock_init(&dev_queue->_xmit_lock); netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type); @@ -3798,8 +3805,8 @@ static void __netdev_init_queue_locks_one(struct netdev_queue *dev_queue, static void netdev_init_queue_locks(struct net_device *dev) { - __netdev_init_queue_locks_one(&dev->tx_queue, dev); - __netdev_init_queue_locks_one(&dev->rx_queue, dev); + netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); + __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); } /** @@ -4119,7 +4126,8 @@ static struct net_device_stats *internal_stats(struct net_device *dev) } static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue) + struct netdev_queue *queue, + void *_unused) { spin_lock_init(&queue->lock); queue->dev = dev; @@ -4127,8 +4135,8 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->rx_queue); - netdev_init_one_queue(dev, &dev->tx_queue); + netdev_init_one_queue(dev, &dev->rx_queue, NULL); + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); } /** @@ -4145,9 +4153,10 @@ static void netdev_init_queues(struct net_device *dev) struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { - void *p; + struct netdev_queue *tx; struct net_device *dev; int alloc_size; + void *p; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -4167,11 +4176,22 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return NULL; } + tx = kzalloc(sizeof(struct netdev_queue) * queue_count, GFP_KERNEL); + if (!tx) { + printk(KERN_ERR "alloc_netdev: Unable to allocate " + "tx qdiscs.\n"); + kfree(p); + return NULL; + } + dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; dev_net_set(dev, &init_net); + dev->_tx = tx; + dev->num_tx_queues = queue_count; + if (sizeof_priv) { dev->priv = ((char *)dev + ((sizeof(struct net_device) + @@ -4205,6 +4225,8 @@ void free_netdev(struct net_device *dev) { release_net(dev_net(dev)); + kfree(dev->_tx); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8ef9f1d..71edb8b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -636,7 +636,7 @@ static int 
rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (dev->master) NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); - txq = &dev->tx_queue; + txq = netdev_get_tx_queue(dev, 0); if (txq->qdisc_sleeping) NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id); -- cgit v1.1 From fd2ea0a79faad824258af5dcec1927aa24d81c16 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 01:56:23 -0700 Subject: net: Use queue aware tests throughout. This effectively "flips the switch" by making the core networking and multiqueue-aware drivers use the new TX multiqueue structures. Non-multiqueue drivers need no changes. The interfaces they use such as netif_stop_queue() degenerate into an operation on TX queue zero. So everything "just works" for them. Code that really wants to do "X" to all TX queues now invokes a routine that does so, such as netif_tx_wake_all_queues(), netif_tx_stop_all_queues(), etc. pktgen and netpoll required a little bit more surgery than the others. In particular the pktgen changes, whilst functional, could be largely improved. The initial check in pktgen_xmit() will sometimes check the wrong queue, which is mostly harmless. The thing to do is probably to invoke fill_packet() earlier. The bulk of the netpoll changes is to make the code operate solely on the TX queue indicated by the SKB queue mapping. Setting of the SKB queue mapping is entirely confined inside net/core/dev.c:dev_pick_tx(). If we end up needing any kind of special semantics (drops, for example) it will be implemented here. Finally, we now have a "real_num_tx_queues" which is where the driver indicates how many TX queues are actually active. With IGB changes from Jeff Kirsher. Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++------------ net/core/netpoll.c | 24 +++++++++++-------- net/core/pktgen.c | 69 ++++++++++++++++++++++++++++++++++-------------------- 3 files changed, 70 insertions(+), 51 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 69378f2..f027a1a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1598,7 +1598,8 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) { if (likely(!skb->next)) { if (!list_empty(&ptype_all)) @@ -1627,9 +1628,7 @@ gso: skb->next = nskb; return rc; } - if (unlikely((netif_queue_stopped(dev) || - netif_subqueue_stopped(dev, skb)) && - skb->next)) + if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -1669,7 +1668,10 @@ out_kfree_skb: static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - return netdev_get_tx_queue(dev, 0); + u16 queue_index = 0; + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); } int dev_queue_xmit(struct sk_buff *skb) @@ -1737,8 +1739,6 @@ gso: spin_lock(&txq->lock); q = txq->qdisc; if (q->enqueue) { - /* reset queue_mapping to zero */ - skb_set_queue_mapping(skb, 0); rc = q->enqueue(skb, q); qdisc_run(txq); spin_unlock(&txq->lock); @@ -1768,10 +1768,9 @@ gso: HARD_TX_LOCK(dev, txq, cpu); - if (!netif_queue_stopped(dev) && - !netif_subqueue_stopped(dev, skb)) { + if (!netif_tx_queue_stopped(txq)) { rc = 0; - if (!dev_hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev, txq)) { HARD_TX_UNLOCK(dev, txq); goto out; } @@ -4160,8 +4159,7 @@ struct
net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); - alloc_size = sizeof(struct net_device) + - sizeof(struct net_device_subqueue) * (queue_count - 1); + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; @@ -4191,16 +4189,14 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->_tx = tx; dev->num_tx_queues = queue_count; + dev->real_num_tx_queues = queue_count; if (sizeof_priv) { dev->priv = ((char *)dev + - ((sizeof(struct net_device) + - (sizeof(struct net_device_subqueue) * - (queue_count - 1)) + NETDEV_ALIGN_CONST) + ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST)); } - dev->egress_subqueue_count = queue_count; dev->gso_max_size = GSO_MAX_SIZE; netdev_init_queues(dev); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 8fb134d..c127208 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -58,25 +58,27 @@ static void queue_process(struct work_struct *work) while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; + struct netdev_queue *txq; if (!netif_device_present(dev) || !netif_running(dev)) { __kfree_skb(skb); continue; } + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + local_irq_save(flags); - netif_tx_lock(dev); - if ((netif_queue_stopped(dev) || - netif_subqueue_stopped(dev, skb)) || - dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) { + __netif_tx_lock(txq, smp_processor_id()); + if (netif_tx_queue_stopped(txq) || + dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); - netif_tx_unlock(dev); + __netif_tx_unlock(txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } - netif_tx_unlock(dev); + __netif_tx_unlock(txq); local_irq_restore(flags); } } @@ -278,17 +280,19 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { + struct netdev_queue *txq; unsigned long flags; + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + local_irq_save(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { - if (netif_tx_trylock(dev)) { - if (!netif_queue_stopped(dev) && - !netif_subqueue_stopped(dev, skb)) + if (__netif_tx_trylock(txq)) { + if (!netif_tx_queue_stopped(txq)) status = dev->hard_start_xmit(skb, dev); - netif_tx_unlock(dev); + __netif_tx_unlock(txq); if (status == NETDEV_TX_OK) break; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fdf5377..906802d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2123,6 +2123,24 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) } } #endif +static void set_cur_queue_map(struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->queue_map_min < pkt_dev->queue_map_max) { + __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { + t = random32() % + (pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; + } else { + t = pkt_dev->cur_queue_map + 1; + if (t > pkt_dev->queue_map_max) + t = pkt_dev->queue_map_min; + } + pkt_dev->cur_queue_map = t; + } +} + /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2325,19 +2343,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) 
pkt_dev->cur_pkt_size = t; } - if (pkt_dev->queue_map_min < pkt_dev->queue_map_max) { - __u16 t; - if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = random32() % - (pkt_dev->queue_map_max - pkt_dev->queue_map_min + 1) - + pkt_dev->queue_map_min; - } else { - t = pkt_dev->cur_queue_map + 1; - if (t > pkt_dev->queue_map_max) - t = pkt_dev->queue_map_min; - } - pkt_dev->cur_queue_map = t; - } + set_cur_queue_map(pkt_dev); pkt_dev->flows[flow].count++; } @@ -2458,7 +2464,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ - + u16 queue_map; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2469,6 +2475,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ + queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); datalen = (odev->hard_header_len + 16) & ~0xf; @@ -2507,7 +2514,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->network_header = skb->tail; skb->transport_header = skb->network_header + sizeof(struct iphdr); skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); - skb_set_queue_mapping(skb, pkt_dev->cur_queue_map); + skb_set_queue_mapping(skb, queue_map); iph = ip_hdr(skb); udph = udp_hdr(skb); @@ -2797,6 +2804,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2807,6 +2815,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. 
*/ + queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16 + @@ -2844,7 +2853,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->network_header = skb->tail; skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); - skb_set_queue_mapping(skb, pkt_dev->cur_queue_map); + skb_set_queue_mapping(skb, queue_map); iph = ipv6_hdr(skb); udph = udp_hdr(skb); @@ -3263,7 +3272,9 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = NULL; + struct netdev_queue *txq; __u64 idle_start = 0; + u16 queue_map; int ret; odev = pkt_dev->odev; @@ -3285,9 +3296,15 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) } } - if ((netif_queue_stopped(odev) || - (pkt_dev->skb && - netif_subqueue_stopped(odev, pkt_dev->skb))) || + if (!pkt_dev->skb) { + set_cur_queue_map(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + } else { + queue_map = skb_get_queue_mapping(pkt_dev->skb); + } + + txq = netdev_get_tx_queue(odev, queue_map); + if (netif_tx_queue_stopped(txq) || need_resched()) { idle_start = getCurUs(); @@ -3303,8 +3320,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->idle_acc += getCurUs() - idle_start; - if (netif_queue_stopped(odev) || - netif_subqueue_stopped(odev, pkt_dev->skb)) { + if (netif_tx_queue_stopped(txq)) { pkt_dev->next_tx_us = getCurUs(); /* TODO */ pkt_dev->next_tx_ns = 0; goto out; /* Try the next interface */ @@ -3331,9 +3347,12 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) } } - netif_tx_lock_bh(odev); - if (!netif_queue_stopped(odev) && - !netif_subqueue_stopped(odev, pkt_dev->skb)) { + /* fill_packet() might have changed the queue */ + queue_map = skb_get_queue_mapping(pkt_dev->skb); + txq = netdev_get_tx_queue(odev, queue_map); + + __netif_tx_lock_bh(txq); + if (!netif_tx_queue_stopped(txq)) { atomic_inc(&(pkt_dev->skb->users)); retry_now: @@ -3377,7 +3396,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->next_tx_ns = 0; } - netif_tx_unlock_bh(odev); + __netif_tx_unlock_bh(txq); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { -- cgit v1.1 From eae792b722fef08dcf3aee88266ee7def9710757 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 03:03:33 -0700 Subject: netdev: Add netdev->select_queue() method. Devices or device layers can set this to control the queue selection performed by dev_pick_tx(). This function runs under RCU protection, which allows overriding functions to have some way of synchronizing with things like dynamic ->real_num_tx_queues adjustments. This makes the spinlock prefetch in dev_queue_xmit() a little bit less effective, but that's the price right now for correctness. Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f027a1a..7ca9564 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1670,6 +1670,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, { u16 queue_index = 0; + if (dev->select_queue) + queue_index = dev->select_queue(dev, skb); + skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } @@ -1710,14 +1713,14 @@ int dev_queue_xmit(struct sk_buff *skb) } gso: - txq = dev_pick_tx(dev, skb); - spin_lock_prefetch(&txq->lock); - /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); + txq = dev_pick_tx(dev, skb); + spin_lock_prefetch(&txq->lock); + /* Updates of qdisc are serialized by queue->lock. * The struct Qdisc which is pointed to by qdisc is now a * rcu structure - it may be accessed without acquiring -- cgit v1.1 From 8f0f2223cc08a5ae9a77f40edfe02e8a9f1abd77 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Jul 2008 03:47:03 -0700 Subject: net: Implement simple sw TX hashing. It just xor hashes over IPv4/IPv6 addresses and ports of transport. The only assumption it makes is that skb_network_header() is set correctly. With bug fixes from Eric Dumazet. Signed-off-by: David S. Miller --- net/core/dev.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7ca9564..467bfb3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -121,6 +121,9 @@ #include #include #include +#include +#include +#include #include "net-sysfs.h" @@ -1665,6 +1668,53 @@ out_kfree_skb: * --BLG */ +static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) +{ + u32 *addr, *ports, hash, ihl; + u8 ip_proto; + int alen; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + ip_proto = ip_hdr(skb)->protocol; + addr = &ip_hdr(skb)->saddr; + ihl = ip_hdr(skb)->ihl; + alen = 2; + break; + case __constant_htons(ETH_P_IPV6): + ip_proto = ipv6_hdr(skb)->nexthdr; + addr = &ipv6_hdr(skb)->saddr.s6_addr32[0]; + ihl = (40 >> 2); + alen = 8; + break; + default: + return 0; + } + + ports = (u32 *) (skb_network_header(skb) + (ihl * 4)); + + hash = 0; + while (alen--) + hash ^= *addr++; + + switch (ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_AH: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + hash ^= *ports; + break; + + default: + break; + } + + return hash % dev->real_num_tx_queues; +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { @@ -1672,6 +1722,8 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (dev->select_queue) queue_index = dev->select_queue(dev, skb); + else if (dev->real_num_tx_queues > 1) + queue_index = simple_tx_hash(dev, skb); skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); -- cgit v1.1 From 37437bb2e1ae8af470dfcd5b4ff454110894ccaf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 16 Jul 2008 02:15:04 -0700 Subject: pkt_sched: Schedule qdiscs instead of netdev_queue. When we have shared qdiscs, packets come out of the qdiscs for multiple transmit queues. Therefore it doesn't make any sense to schedule the transmit queue when logically we cannot know ahead of time the TX queue of the SKB that the qdisc->dequeue() will give us. 
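[Editorial sketch, not part of the original patch: the defer-and-run pattern this change applies, stripped of the kernel specifics, looks roughly like the following user-space C. All names here are simplified stand-ins for the kernel's atomic bitops, per-CPU softnet data and TX softirq.]

#include <stdbool.h>
#include <stdio.h>

struct qdisc {
	bool scheduled;            /* stands in for __QDISC_STATE_SCHED */
	struct qdisc *next_sched;  /* singly linked pending list */
};

static struct qdisc *output_queue; /* stands in for the per-CPU softnet list */

static void sketch_netif_schedule(struct qdisc *q)
{
	if (q->scheduled)
		return;            /* test_and_set_bit() in the real code */
	q->scheduled = true;
	q->next_sched = output_queue;
	output_queue = q;          /* the real code then raises NET_TX_SOFTIRQ */
}

static void sketch_tx_action(void)  /* stands in for net_tx_action() */
{
	struct qdisc *head = output_queue;

	output_queue = NULL;
	while (head) {
		struct qdisc *q = head;

		head = head->next_sched;
		q->scheduled = false;
		printf("run qdisc %p\n", (void *)q);
	}
}

int main(void)
{
	struct qdisc a = { 0 }, b = { 0 };

	sketch_netif_schedule(&a);
	sketch_netif_schedule(&b);
	sketch_netif_schedule(&a); /* already pending: linked only once */
	sketch_tx_action();
	return 0;
}

The point of scheduling the qdisc rather than the queue is visible in the sketch: the pending list carries the object that actually owns the packets, so nothing needs to guess which TX queue a later dequeue will target. In the real code the schedule side additionally runs with local IRQs disabled.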
Just for sanity I added a BUG check to make sure we never get into a state where the noop_qdisc is scheduled. Signed-off-by: David S. Miller --- net/core/dev.c | 68 ++++++++++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 467bfb3..0b909b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1323,18 +1323,18 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) } -void __netif_schedule(struct netdev_queue *txq) +void __netif_schedule(struct Qdisc *q) { - struct net_device *dev = txq->dev; + BUG_ON(q == &noop_qdisc); - if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); - txq->next_sched = sd->output_queue; - sd->output_queue = txq; + q->next_sched = sd->output_queue; + sd->output_queue = q; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } @@ -1771,37 +1771,23 @@ gso: rcu_read_lock_bh(); txq = dev_pick_tx(dev, skb); - spin_lock_prefetch(&txq->lock); - - /* Updates of qdisc are serialized by queue->lock. - * The struct Qdisc which is pointed to by qdisc is now a - * rcu structure - it may be accessed without acquiring - * a lock (but the structure may be stale.) The freeing of the - * qdisc will be deferred until it's known that there are no - * more references to it. - * - * If the qdisc has an enqueue function, we still need to - * hold the queue->lock before calling it, since queue->lock - * also serializes access to the device queue. - */ - q = rcu_dereference(txq->qdisc); + #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); #endif if (q->enqueue) { - /* Grab device queue */ - spin_lock(&txq->lock); - q = txq->qdisc; - if (q->enqueue) { - rc = q->enqueue(skb, q); - qdisc_run(txq); - spin_unlock(&txq->lock); - - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; - goto out; - } - spin_unlock(&txq->lock); + spinlock_t *root_lock = qdisc_root_lock(q); + + spin_lock(root_lock); + + rc = q->enqueue(skb, q); + qdisc_run(q); + + spin_unlock(root_lock); + + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; } /* The device has no queue. 
Common case for software devices: @@ -1974,7 +1960,7 @@ static void net_tx_action(struct softirq_action *h) } if (sd->output_queue) { - struct netdev_queue *head; + struct Qdisc *head; local_irq_disable(); head = sd->output_queue; @@ -1982,18 +1968,20 @@ static void net_tx_action(struct softirq_action *h) local_irq_enable(); while (head) { - struct netdev_queue *txq = head; - struct net_device *dev = txq->dev; + struct Qdisc *q = head; + spinlock_t *root_lock; + head = head->next_sched; smp_mb__before_clear_bit(); - clear_bit(__LINK_STATE_SCHED, &dev->state); + clear_bit(__QDISC_STATE_SCHED, &q->state); - if (spin_trylock(&txq->lock)) { - qdisc_run(txq); - spin_unlock(&txq->lock); + root_lock = qdisc_root_lock(q); + if (spin_trylock(root_lock)) { + qdisc_run(q); + spin_unlock(root_lock); } else { - netif_schedule_queue(txq); + __netif_schedule(q); } } } @@ -4459,7 +4447,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, void *ocpu) { struct sk_buff **list_skb; - struct netdev_queue **list_net; + struct Qdisc **list_net; struct sk_buff *skb; unsigned int cpu, oldcpu = (unsigned long)ocpu; struct softnet_data *sd, *oldsd; -- cgit v1.1 From ead81cc5fc6d996db6afb20f211241612610a07a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 00:50:32 -0700 Subject: netdevice: Move qdisc_list back into net_device proper. And give it its own lock. Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0b909b7..6741e34 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3886,6 +3886,8 @@ int register_netdevice(struct net_device *dev) net = dev_net(dev); spin_lock_init(&dev->addr_list_lock); + spin_lock_init(&dev->qdisc_list_lock); + INIT_LIST_HEAD(&dev->qdisc_list); netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.1 From 83874000929ed63aef30b44083a9f713135ff040 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 17 Jul 2008 00:53:03 -0700 Subject: pkt_sched: Kill netdev_queue lock. We can simply use the qdisc->q.lock for all of the qdisc tree synchronization. Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6741e34..32a1377 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2080,10 +2080,12 @@ static int ing_filter(struct sk_buff *skb) rxq = &dev->rx_queue; - spin_lock(&rxq->lock); - if ((q = rxq->qdisc) != NULL) + q = rxq->qdisc; + if (q) { + spin_lock(qdisc_lock(q)); result = q->enqueue(skb, q); - spin_unlock(&rxq->lock); + spin_unlock(qdisc_lock(q)); + } return result; } @@ -4173,7 +4175,6 @@ static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { - spin_lock_init(&queue->lock); queue->dev = dev; } -- cgit v1.1 From 3072367300aa8c779e3a14ee8e89de079e90f3ad Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 18 Jul 2008 22:50:15 -0700 Subject: pkt_sched: Manage qdisc list inside of root qdisc. The idea is from Patrick McHardy. Instead of managing the list of qdiscs on the device level, manage it in the root qdisc of a netdev_queue. This solves all kinds of visibility issues during qdisc destruction. The way to iterate over all qdiscs of a netdev_queue is to visit the netdev_queue->qdisc, and then traverse its list. The only special case is to ignore builtin qdiscs at the root when dumping or doing a qdisc_lookup().
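[Editorial sketch, not part of the original patch: in outline, and with simplified stand-in types rather than the kernel's list_head machinery, the per-queue traversal the message describes looks like this. Purely illustrative.]

#include <stddef.h>

struct qdisc_sketch {
	unsigned int handle;
	int builtin;                 /* e.g. a default root qdisc */
	struct qdisc_sketch *next;   /* toy stand-in for the qdisc list */
};

struct txq_sketch {
	struct qdisc_sketch *qdisc;  /* root qdisc of this netdev_queue */
};

/* Walk every qdisc hanging off one TX queue, starting at the root;
 * skipping builtin roots is the special case the message mentions. */
static struct qdisc_sketch *lookup_sketch(struct txq_sketch *txq,
					  unsigned int handle)
{
	struct qdisc_sketch *q;

	for (q = txq->qdisc; q; q = q->next) {
		if (q->builtin)
			continue;
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
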
That was not needed previously because builtin qdiscs were not added to the device's qdisc_list. Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e54acde..065b981 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3888,8 +3888,6 @@ int register_netdevice(struct net_device *dev) net = dev_net(dev); spin_lock_init(&dev->addr_list_lock); - spin_lock_init(&dev->qdisc_list_lock); - INIT_LIST_HEAD(&dev->qdisc_list); netdev_init_queue_locks(dev); dev->iflink = -1; -- cgit v1.1 From 721499e8931c5732202481ae24f2dfbf9910f129 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 19 Jul 2008 22:34:43 -0700 Subject: netns: Use net_eq() to compare net-namespaces for optimization. Without CONFIG_NET_NS, the namespace is always &init_net. With this patch, the compiler will be able to omit the namespace comparisons. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 906802d..c7d484f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1875,7 +1875,7 @@ static int pktgen_device_event(struct notifier_block *unused, { struct net_device *dev = ptr; - if (dev_net(dev) != &init_net) + if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* It is OK that we do not hold the group lock right now, -- cgit v1.1 From 230b183921ecbaa5fedc0d35ad6ba7bb64b6e06a Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 19 Jul 2008 22:35:47 -0700 Subject: net: Use standard structures for generic socket address structures. Use sockaddr_storage{} for generic socket address storage to ensure proper alignment. Use sockaddr{} for pointers to omit several casts. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/core/iovec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index 755c37f..4c9c012 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -36,7 +36,7 @@ * in any case. */ -int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { int size, err, ct; -- cgit v1.1 From 5f86173bdf15981ca49d0434f638b68f70a35644 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 20 Jul 2008 00:08:04 -0700 Subject: net_sched: Add qdisc_enqueue wrapper Signed-off-by: Jussi Kivilinna Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 065b981..2eed17b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1781,7 +1781,7 @@ gso: spin_lock(root_lock); - rc = q->enqueue(skb, q); + rc = qdisc_enqueue_root(skb, q); qdisc_run(q); spin_unlock(root_lock); @@ -2083,7 +2083,7 @@ static int ing_filter(struct sk_buff *skb) q = rxq->qdisc; if (q) { spin_lock(qdisc_lock(q)); - result = q->enqueue(skb, q); + result = qdisc_enqueue_root(skb, q); spin_unlock(qdisc_lock(q)); } -- cgit v1.1
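[Editorial sketch, not part of the original patch: the wrapper pattern the last change introduces is worth a note. Funnelling every enqueue through one helper gives later patches a single place to hang per-packet bookkeeping instead of touching each q->enqueue() call site. The following C is a minimal illustration with stand-in types, not the in-tree helper, which also primes per-packet state before dispatching.]

struct sk_buff_sketch {
	unsigned int len;
};

struct qdisc_sketch {
	int (*enqueue)(struct sk_buff_sketch *skb, struct qdisc_sketch *q);
};

/* One funnel for every enqueue: any stats or control-block setup
 * added here later reaches all call sites at once. */
static inline int qdisc_enqueue_sketch(struct sk_buff_sketch *skb,
				       struct qdisc_sketch *q)
{
	return q->enqueue(skb, q);
}

With the wrapper in place, both the egress path in dev_queue_xmit() and the ingress path in ing_filter() stay oblivious to whatever bookkeeping the scheduler core decides to add around enqueue.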