From 622e0ca1cd4d459f5af4f2c65f4dc0dd823cb4c3 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 20 May 2010 23:07:56 -0700
Subject: gro: Fix bogus gso_size on the first fraglist entry

When GRO produces fraglist entries, and the resulting skb hits an
interface that is incapable of TSO but capable of FRAGLIST, we end up
producing a bogus packet with gso_size non-zero.

This was reported in the field with older versions of KVM that did not
set the TSO bits on tuntap.

This patch fixes that.

Reported-by: Igor Zhang
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c543dd2..4c11000 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2718,6 +2718,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
 	skb_shinfo(nskb)->frag_list = p;
 	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
+	pinfo->gso_size = 0;
 	skb_header_release(p);
 	nskb->prev = p;
--
cgit v1.1
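For context, after this change the fraglist merge path in skb_gro_receive()
reads roughly as follows (a sketch of the lines from the hunk above with
explanatory comments added; only the pinfo->gso_size assignment is new):

        /* p becomes the first frag_list child of the new head skb nskb */
        *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
        skb_shinfo(nskb)->frag_list = p;
        /* only the head skb should advertise a GSO size ... */
        skb_shinfo(nskb)->gso_size = pinfo->gso_size;
        /* ... so clear it on p, which is no longer a GSO head; otherwise a
         * FRAGLIST-capable but TSO-incapable device (e.g. tuntap without
         * the TSO bits set) later sees a packet carrying a bogus non-zero
         * gso_size.
         */
        pinfo->gso_size = 0;
        skb_header_release(p);
        nskb->prev = p;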
From 76cc8b13a6e41b537fd262b600da1571314add62 Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Thu, 20 May 2010 18:37:59 +0000
Subject: net: fix problem in dequeuing from input_pkt_queue

Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
The primary issue is that the queue head must be incremented only after
a packet has been processed, that is, only after __netif_receive_skb()
has been called.  This is needed for the mechanism to prevent OOO
packets in RFS.

Also, when flushing the input_pkt_queue and process_queue, the process
queue should be done first to prevent OOO packets.

Because the input_pkt_queue has been effectively split into two queues,
the calculation of the tail ptr is no longer correct.  The correct
value would be head + input_pkt_queue->len + process_queue->len.  To
avoid this calculation we added an explicit input_queue_tail in
softnet_data.  The tail value is simply incremented when queuing to
input_pkt_queue.

Signed-off-by: Tom Herbert
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6c82065..0aab66d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2426,10 +2426,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 		__skb_queue_tail(&sd->input_pkt_queue, skb);
-#ifdef CONFIG_RPS
-		*qtail = sd->input_queue_head +
-				skb_queue_len(&sd->input_pkt_queue);
-#endif
+		input_queue_tail_incr_save(sd, qtail);
 		rps_unlock(sd);
 		local_irq_restore(flags);
 		return NET_RX_SUCCESS;
@@ -2964,7 +2961,7 @@ static void flush_backlog(void *arg)
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
-			input_queue_head_add(sd, 1);
+			input_queue_head_incr(sd);
 		}
 	}
 	rps_unlock(sd);
@@ -2973,6 +2970,7 @@
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
+			input_queue_head_incr(sd);
 		}
 	}
 }
@@ -3328,18 +3326,20 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	while ((skb = __skb_dequeue(&sd->process_queue))) {
 		local_irq_enable();
 		__netif_receive_skb(skb);
-		if (++work >= quota)
-			return work;
 		local_irq_disable();
+		input_queue_head_incr(sd);
+		if (++work >= quota) {
+			local_irq_enable();
+			return work;
+		}
 	}

 	rps_lock(sd);
 	qlen = skb_queue_len(&sd->input_pkt_queue);
-	if (qlen) {
-		input_queue_head_add(sd, qlen);
+	if (qlen)
 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
 					   &sd->process_queue);
-	}
+
 	if (qlen < quota - work) {
 		/*
 		 * Inline a custom version of __napi_complete().
@@ -5679,12 +5679,14 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();

 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_add(oldsd, 1);
+		input_queue_head_incr(oldsd);
 	}
-	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
+		input_queue_head_incr(oldsd);
+	}

 	return NOTIFY_OK;
 }
--
cgit v1.1
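The input_queue_head_incr() and input_queue_tail_incr_save() helpers used
above are defined in include/linux/netdevice.h, so they fall outside this
net/core-limited diff.  They look roughly like this (a sketch; bodies
approximate, and the counters only exist under CONFIG_RPS):

        static inline void input_queue_head_incr(struct softnet_data *sd)
        {
        #ifdef CONFIG_RPS
                sd->input_queue_head++;
        #endif
        }

        static inline void input_queue_tail_incr_save(struct softnet_data *sd,
                                                      unsigned int *qtail)
        {
        #ifdef CONFIG_RPS
                *qtail = ++sd->input_queue_tail;
        #endif
        }

With the head advanced only after __netif_receive_skb() and the tail
recorded at enqueue time, the RFS out-of-order check sees a consistent
head/tail pair even though the backlog is now split across
input_pkt_queue and process_queue.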
From 253683bbfb6bc5864417c8c35cb6ef13b5e259e6 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 21 May 2010 02:25:27 +0000
Subject: rtnetlink: Fix error handling in do_setlink()

Commit c02db8c6290bb992442fec1407643c94cc414375:

	Author: Chris Wright
	Date:   Sun May 16 01:05:45 2010 -0700
	Subject: rtnetlink: make SR-IOV VF interface symmetric

adds broken error handling to do_setlink() in net/core/rtnetlink.c.
The problem is the following chunk of code:

	if (tb[IFLA_VFINFO_LIST]) {
		struct nlattr *attr;
		int rem;
		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
			if (nla_type(attr) != IFLA_VF_INFO)
---->				goto errout;
			err = do_setvfinfo(dev, attr);
			if (err < 0)
				goto errout;
			modified = 1;
		}
	}

which can get to errout without setting err, resulting in the following
compiler warning:

	net/core/rtnetlink.c: In function 'do_setlink':
	net/core/rtnetlink.c:904: warning: 'err' may be used uninitialized in this function

Change the code to return -EINVAL in this case.  Note that this might
not be the appropriate error though.

Signed-off-by: David Howells
cc: Chris Wright
cc: David S. Miller
Acked-by: Chris Wright
Signed-off-by: David S. Miller
---
 net/core/rtnetlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e4b9870..7ab86f3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1199,8 +1199,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		struct nlattr *attr;
 		int rem;
 		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
-			if (nla_type(attr) != IFLA_VF_INFO)
+			if (nla_type(attr) != IFLA_VF_INFO) {
+				err = -EINVAL;
 				goto errout;
+			}
 			err = do_setvfinfo(dev, attr);
 			if (err < 0)
 				goto errout;
--
cgit v1.1

From 8ce6cebc2f126f3ecf2d80746ea54245adf18057 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano
Date: Wed, 19 May 2010 10:12:19 +0000
Subject: net-2.6 : V2 - fix dev_get_valid_name

The commit:

	commit d90310243fd750240755e217c5faa13e24f41536
	Author: Octavian Purdila
	Date:   Wed Nov 18 02:36:59 2009 +0000

	    net: device name allocation cleanups

introduced a bug when there is a hash collision, making it impossible
to rename a device to eth%d.  This bug is very hard to reproduce and
appears rarely.

The problem is that we don't pass a temporary buffer to
__dev_alloc_name but 'dev->name', which is modified by the function.

A detailed explanation is here:
http://marc.info/?l=linux-netdev&m=127417784011987&w=2

Changelog:
V2 : replaced string comparison by pointer comparison

Signed-off-by: Daniel Lezcano
Reviewed-by: Octavian Purdila
Signed-off-by: David S. Miller
---
 net/core/dev.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 0aab66d..07a48e2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -954,18 +954,22 @@ int dev_alloc_name(struct net_device *dev, const char *name)
 }
 EXPORT_SYMBOL(dev_alloc_name);

-static int dev_get_valid_name(struct net *net, const char *name, char *buf,
-			      bool fmt)
+static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 {
+	struct net *net;
+
+	BUG_ON(!dev_net(dev));
+	net = dev_net(dev);
+
 	if (!dev_valid_name(name))
 		return -EINVAL;

 	if (fmt && strchr(name, '%'))
-		return __dev_alloc_name(net, name, buf);
+		return dev_alloc_name(dev, name);
 	else if (__dev_get_by_name(net, name))
 		return -EEXIST;
-	else if (buf != name)
-		strlcpy(buf, name, IFNAMSIZ);
+	else if (dev->name != name)
+		strlcpy(dev->name, name, IFNAMSIZ);

 	return 0;
 }
@@ -997,7 +1001,7 @@ int dev_change_name(struct net_device *dev, const char *newname)

 	memcpy(oldname, dev->name, IFNAMSIZ);

-	err = dev_get_valid_name(net, newname, dev->name, 1);
+	err = dev_get_valid_name(dev, newname, 1);
 	if (err < 0)
 		return err;

@@ -4965,7 +4969,7 @@ int register_netdevice(struct net_device *dev)
 		}
 	}

-	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
+	ret = dev_get_valid_name(dev, dev->name, 0);
 	if (ret)
 		goto err_uninit;

@@ -5574,7 +5578,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 		/* We get here if we can't use the current device name */
 		if (!pat)
 			goto out;
-		if (dev_get_valid_name(net, pat, dev->name, 1))
+		if (dev_get_valid_name(dev, pat, 1))
 			goto out;
 	}

--
cgit v1.1
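Routing the '%d' case through dev_alloc_name() fixes the collision because
that helper expands the format into a temporary buffer and only copies the
result into dev->name on success, rather than letting __dev_alloc_name()
modify dev->name directly.  For reference, the pre-existing helper in
net/core/dev.c (shown as a rough sketch, not part of this patch):

        int dev_alloc_name(struct net_device *dev, const char *name)
        {
                char buf[IFNAMSIZ];
                struct net *net;
                int ret;

                BUG_ON(!dev_net(dev));
                net = dev_net(dev);

                /* the '%d' pattern is expanded into buf, not into dev->name */
                ret = __dev_alloc_name(net, name, buf);
                if (ret >= 0)
                        strlcpy(dev->name, buf, IFNAMSIZ);
                return ret;
        }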
From f845172531fb7410c7fb7780b1a6e51ee6df7d52 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Mon, 24 May 2010 00:12:34 -0700
Subject: cls_cgroup: Store classid in struct sock

Up until now cls_cgroup has relied on fetching the classid out of the
currently executing thread.  This runs into trouble when packet
processing is delayed, in which case it may execute out of another
thread's context.

Furthermore, even when a packet is not delayed we may fail to classify
it if soft IRQs have been disabled, because this scenario is
indistinguishable from one where a packet unrelated to the current
thread is processed by a real soft IRQ.

In fact, the current semantics are inherently broken, as a single skb
may be constructed out of the writes of two different tasks.  A
different manifestation of this problem is when the TCP stack transmits
in response to an incoming ACK.  This is currently unclassified.

As we already have a concept of packet ownership for accounting
purposes in the skb->sk pointer, this is a natural place to store the
classid in a persistent manner.

This patch adds the cls_cgroup classid in struct sock, filling up an
existing hole on 64-bit :)

The value is set at socket creation time.  So all sockets created via
socket(2) automatically gain the ID of the thread creating them.
Whenever another process touches the socket by either reading or
writing to it, we will change the socket classid to that of the process
if it has a valid (non-zero) classid.

For sockets created on inbound connections through accept(2), we
inherit the classid of the original listening socket through sk_clone,
possibly preceding the actual accept(2) call.

In order to minimise risks, I have not made this the authoritative
classid.  For now it is only used as a backup when we execute with soft
IRQs disabled.  Once we're completely happy with its semantics we can
use it as the sole classid.

Footnote: I have rearranged the error path on cls_cgroup module
creation.  If we didn't do this, then there would be a window where
someone could create a tc rule using cls_cgroup before the cgroup
subsystem has been registered.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/sock.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index bf88a16..a05ae7f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -123,6 +123,7 @@
 #include
 #include
 #include
+#include <net/cls_cgroup.h>

 #include
@@ -217,6 +218,11 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);

+#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+int net_cls_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_cls_subsys_id);
+#endif
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
@@ -1050,6 +1056,16 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 	module_put(owner);
 }

+#ifdef CONFIG_CGROUPS
+void sock_update_classid(struct sock *sk)
+{
+	u32 classid = task_cls_classid(current);
+
+	if (classid && classid != sk->sk_classid)
+		sk->sk_classid = classid;
+}
+#endif
+
 /**
  *	sk_alloc - All socket objects are allocated here
  *	@net: the applicable net namespace
@@ -1073,6 +1089,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_lock_init(sk);
 		sock_net_set(sk, get_net(net));
 		atomic_set(&sk->sk_wmem_alloc, 1);
+
+		sock_update_classid(sk);
 	}

 	return sk;
--
cgit v1.1
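The consumer of the new sk->sk_classid field is the classifier in
net/sched/cls_cgroup.c, which lies outside this net/core-limited diff.
The backup path described above looks roughly like this (a sketch only;
the exact guard used to detect soft-IRQ context in the real classifier
may differ):

        /* in cls_cgroup_classify(): `current' is only meaningful when we
         * are not running on behalf of an unrelated task in a soft IRQ
         */
        rcu_read_lock();
        classid = task_cls_state(current)->classid;
        rcu_read_unlock();

        if (softirq_count() != SOFTIRQ_OFFSET) {
                /* fall back to the classid recorded in the socket by
                 * sock_update_classid() at creation/read/write time
                 */
                if (!skb->sk)
                        return -1;
                classid = skb->sk->sk_classid;
        }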
From 8286274284e15b11b0f531b6ceeef21fbe00a8dd Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Mon, 24 May 2010 00:14:10 -0700
Subject: tun: Update classid on packet injection

This patch makes tun update its socket classid every time we inject a
packet into the network stack.  This is so that any updates made by the
admin to the process writing packets to tun take effect.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index a05ae7f..37fe9b6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1064,6 +1064,7 @@ void sock_update_classid(struct sock *sk)
 	if (classid && classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
+EXPORT_SYMBOL(sock_update_classid);
 #endif

 /**
--
cgit v1.1
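The driver-side half of this change lives in drivers/net/tun.c and is
therefore not visible in this net/core-limited diff; the export added
above is what makes sock_update_classid() callable from the tun module.
Conceptually, tun's write path refreshes the socket classid from the task
writing the packet just before handing the skb to the stack.  Roughly (a
sketch; field and function names approximate for the 2.6.35-era driver):

        /* in tun's write path, before the skb is injected: */
        sock_update_classid(tun->socket.sk);    /* pick up the writer's cgroup classid */

        /* ... skb is built from the user's buffer, then: */
        netif_rx_ni(skb);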