From 875ad3f8e7dff6bc1d053e5bfe73d8e8d2e6ae67 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 Jan 2012 12:49:36 -0500 Subject: SUNRPC: Fix machine creds in generic_create_cred and generic_match - generic_create_cred needs to copy the '.principal' field. - generic_match needs to ignore the groups and match on the '.principal' field. This fixes an Oops that was introduced by commit 68c9715 (SUNRPC: Clean up the RPCSEC_GSS service ticket requests) Reported-by: J. Bruce Fields Signed-off-by: Trond Myklebust Tested-by: J. Bruce Fields --- net/sunrpc/auth_generic.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 1426ec3..75762f3 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -92,6 +92,7 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) if (gcred->acred.group_info != NULL) get_group_info(gcred->acred.group_info); gcred->acred.machine_cred = acred->machine_cred; + gcred->acred.principal = acred->principal; dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", gcred->acred.machine_cred ? "machine" : "generic", @@ -123,6 +124,17 @@ generic_destroy_cred(struct rpc_cred *cred) call_rcu(&cred->cr_rcu, generic_free_cred_callback); } +static int +machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flags) +{ + if (!gcred->acred.machine_cred || + gcred->acred.principal != acred->principal || + gcred->acred.uid != acred->uid || + gcred->acred.gid != acred->gid) + return 0; + return 1; +} + /* * Match credentials against current process creds. */ @@ -132,9 +144,12 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); int i; + if (acred->machine_cred) + return machine_cred_match(acred, gcred, flags); + if (gcred->acred.uid != acred->uid || gcred->acred.gid != acred->gid || - gcred->acred.machine_cred != acred->machine_cred) + gcred->acred.machine_cred != 0) goto out_nomatch; /* Optimisation in the case where pointers are identical... */ -- cgit v1.1 From ba1960257c5980f9b58057995ce3394bd8e48ca3 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 10 Jan 2012 15:19:54 +0200 Subject: mac80211: update oper_channel on ibss join Commit 13c40c5 ("mac80211: Add HT operation modes for IBSS") broke ibss operation by mistakenly removing the local->oper_channel update (causing ibss to start on the wrong channel). fix it. Signed-off-by: Eliad Peller Acked-by: Simon Wunderlich Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index b3d76b7..a464396 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -106,6 +106,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; + local->oper_channel = chan; channel_type = ifibss->channel_type; if (channel_type > NL80211_CHAN_HT20 && !cfg80211_can_beacon_sec_chan(local->hw.wiphy, chan, channel_type)) -- cgit v1.1 From 405385f8ce7a2ed8f82e216d88b5282142e1288b Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 11 Jan 2012 13:11:50 +0200 Subject: mac80211: set bss_conf.idle when vif is connected __ieee80211_recalc_idle() iterates through the vifs, sets bss_conf.idle = true if they are disconnected, and increases "count" if they are not (which later gets evaluated in order to determine whether the device is idle). However, the loop doesn't set bss_conf.idle = false (along with increasing "count"), causing the device idle state and the vif idle state to get out of sync in some cases. Signed-off-by: Eliad Peller Signed-off-by: John W. Linville --- net/mac80211/iface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e47768c..01a21c2 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1314,6 +1314,7 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local) continue; } /* count everything else */ + sdata->vif.bss_conf.idle = false; count++; } -- cgit v1.1 From 68315801dbf3ab2001679fd2074c9dc5dcf87dfa Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 25 Jan 2012 02:39:05 +0000 Subject: l2tp: l2tp_ip - fix possible oops on packet receive When a packet is received on an L2TP IP socket (L2TPv3 IP link encapsulation), the l2tpip socket's backlog_rcv function calls xfrm4_policy_check(). This is not necessary, since it was called before the skb was added to the backlog. With CONFIG_NET_NS enabled, xfrm4_policy_check() will oops if skb->dev is null, so this trivial patch removes the call. This bug has always been present, but only when CONFIG_NET_NS is enabled does it cause problems. Most users are probably using UDP encapsulation for L2TP, hence the problem has only recently surfaced. EIP: 0060:[] EFLAGS: 00210246 CPU: 0 EIP is at l2tp_ip_recvmsg+0xd4/0x2a7 EAX: 00000001 EBX: d77b5180 ECX: 00000000 EDX: 00200246 ESI: 00000000 EDI: d63cbd30 EBP: d63cbd18 ESP: d63cbcf4 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Call Trace: [] sock_common_recvmsg+0x31/0x46 [] __sock_recvmsg_nosec+0x45/0x4d [] __sock_recvmsg+0x31/0x3b [] sock_recvmsg+0x96/0xab [] ? might_fault+0x47/0x81 [] ? might_fault+0x47/0x81 [] ? _copy_from_user+0x31/0x115 [] ? copy_from_user+0x8/0xa [] ? verify_iovec+0x3e/0x78 [] __sys_recvmsg+0x10a/0x1aa [] ? sock_recvmsg+0x0/0xab [] ? __lock_acquire+0xbdf/0xbee [] ? do_page_fault+0x193/0x375 [] ? fcheck_files+0x9b/0xca [] ? fget_light+0x2a/0x9c [] sys_recvmsg+0x2b/0x43 [] sys_socketcall+0x16d/0x1a5 [] ? trace_hardirqs_on_thunk+0xc/0x10 [] sysenter_do_call+0x12/0x38 Code: c6 05 8c ea a8 c1 01 e8 0c d4 d9 ff 85 f6 74 07 3e ff 86 80 00 00 00 b9 17 b6 2b c1 ba 01 00 00 00 b8 78 ed 48 c1 e8 23 f6 d9 ff 76 0c 68 28 e3 30 c1 68 2d 44 41 c1 e8 89 57 01 00 83 c4 0c Signed-off-by: James Chapman Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index d21e7eb..55670ec 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -393,11 +393,6 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto drop; - - nf_reset(skb); - /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) -- cgit v1.1 From 2b05ad33e1e624e7f08b8676d270dc7725403b7e Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 25 Jan 2012 08:34:51 +0000 Subject: tcp: bind() fix autoselection to share ports The current code checks for conflicts when the application requests a specific port. If there is no conflict, then the request is granted. On the other hand, the port autoselection done by the kernel fails when all ports are bound even when there is a port with no conflict available. The fix changes port autoselection to check if there is a conflict and use it if not. Signed-off-by: Flavio Leitner Signed-off-by: Marcelo Ricardo Leitner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2e4e244..ecd19b5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -128,6 +128,11 @@ again: goto have_snum; } } + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + spin_unlock(&head->lock); + snum = rover; + goto have_snum; + } goto next; } break; -- cgit v1.1 From fddb7b5761f104f034a0e708ece756d9b2eb2cac Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 25 Jan 2012 08:34:52 +0000 Subject: tcp: bind() optimize port allocation Port autoselection finds a port and then drop the lock, then right after that, gets the hash bucket again and lock it. Fix it to go direct. Signed-off-by: Flavio Leitner Signed-off-by: Marcelo Ricardo Leitner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ecd19b5..19d66ce 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -123,15 +123,13 @@ again: smallest_size = tb->num_owners; smallest_rover = rover; if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { - spin_unlock(&head->lock); snum = smallest_rover; - goto have_snum; + goto tb_found; } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { - spin_unlock(&head->lock); snum = rover; - goto have_snum; + goto tb_found; } goto next; } -- cgit v1.1 From 073862ba5d249c20bd5c49fc6d904ff0e1f6a672 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jan 2012 00:41:38 +0000 Subject: netns: fix net_alloc_generic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a new net namespace is created, we should attach to it a "struct net_generic" with enough slots (even empty), or we can hit the following BUG_ON() : [ 200.752016] kernel BUG at include/net/netns/generic.h:40! ... [ 200.752016] [] ? get_cfcnfg+0x3a/0x180 [ 200.752016] [] ? lockdep_rtnl_is_held+0x10/0x20 [ 200.752016] [] caif_device_notify+0x2e/0x530 [ 200.752016] [] notifier_call_chain+0x67/0x110 [ 200.752016] [] raw_notifier_call_chain+0x11/0x20 [ 200.752016] [] call_netdevice_notifiers+0x32/0x60 [ 200.752016] [] register_netdevice+0x196/0x300 [ 200.752016] [] register_netdev+0x19/0x30 [ 200.752016] [] loopback_net_init+0x4a/0xa0 [ 200.752016] [] ops_init+0x42/0x180 [ 200.752016] [] setup_net+0x6b/0x100 [ 200.752016] [] copy_net_ns+0x86/0x110 [ 200.752016] [] create_new_namespaces+0xd9/0x190 net_alloc_generic() should take into account the maximum index into the ptr array, as a subsystem might use net_generic() anytime. This also reduces number of reallocations in net_assign_generic() Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Eric Dumazet Cc: Sjur Brændeland Cc: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/core/net_namespace.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index aefcd7a..0e950fd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -30,6 +30,20 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = max_gen_ptrs; + + return ng; +} + static int net_assign_generic(struct net *net, int id, void *data) { struct net_generic *ng, *old_ng; @@ -43,8 +57,7 @@ static int net_assign_generic(struct net *net, int id, void *data) if (old_ng->len >= id) goto assign; - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); + ng = net_alloc_generic(); if (ng == NULL) return -ENOMEM; @@ -59,7 +72,6 @@ static int net_assign_generic(struct net *net, int id, void *data) * the old copy for kfree after a grace period. */ - ng->len = id; memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); rcu_assign_pointer(net->gen, ng); @@ -161,18 +173,6 @@ out_undo: goto out; } -static struct net_generic *net_alloc_generic(void) -{ - struct net_generic *ng; - size_t generic_size = sizeof(struct net_generic) + - INITIAL_NET_GEN_PTRS * sizeof(void *); - - ng = kzalloc(generic_size, GFP_KERNEL); - if (ng) - ng->len = INITIAL_NET_GEN_PTRS; - - return ng; -} #ifdef CONFIG_NET_NS static struct kmem_cache *net_cachep; @@ -483,6 +483,7 @@ again: } return error; } + max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { -- cgit v1.1 From f2b3ee9e4200b32d113b1bd3c93f9a836c97357c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Jan 2012 10:34:35 +0000 Subject: ipv6: Fix ip_gre lockless xmits. Tunnel devices set NETIF_F_LLTX to bypass HARD_TX_LOCK. Sit and ipip set this unconditionally in ops->setup, but gre enables it conditionally after parameter passing in ops->newlink. This is not called during tunnel setup as below, however, so GRE tunnels are still taking the lock. modprobe ip_gre ip tunnel add test0 mode gre remote 10.5.1.1 dev lo ip link set test0 up ip addr add 10.6.0.1 dev test0 # cat /sys/class/net/test0/features # $DIR/test_tunnel_xmit 10 10.5.2.1 ip route add 10.5.2.0/24 dev test0 ip tunnel del test0 The newlink callback is only called in rtnl_netlink, and only if the device is new, as it calls register_netdevice internally. Gre tunnels are created at 'ip tunnel add' with ioctl SIOCADDTUNNEL, which calls ipgre_tunnel_locate, which calls register_netdev. rtnl_newlink is called at 'ip link set', but skips ops->newlink and the device is up with locking still enabled. The equivalent ipip tunnel works fine, btw (just substitute 'method gre' for 'method ipip'). On kernels before /sys/class/net/*/features was removed [1], the first commented out line returns 0x6000 with method gre, which indicates that NETIF_F_LLTX (0x1000) is not set. With ipip, it reports 0x7000. This test cannot be used on recent kernels where the sysfs file is removed (and ETHTOOL_GFEATURES does not currently work for tunnel devices, because they lack dev->ethtool_ops). The second commented out line calls a simple transmission test [2] that sends on 24 cores at maximum rate. Results of a single run: ipip: 19,372,306 gre before patch: 4,839,753 gre after patch: 19,133,873 This patch replicates the condition check in ipgre_newlink to ipgre_tunnel_locate. It works for me, both with oseq on and off. This is the first time I looked at rtnetlink and iproute2 code, though, so someone more knowledgeable should probably check the patch. Thanks. The tail of both functions is now identical, by the way. To avoid code duplication, I'll be happy to rework this and merge the two. [1] http://patchwork.ozlabs.org/patch/104610/ [2] http://kernel.googlecode.com/files/xmit_udp_parallel.c Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2b53a1f..6b3ca5b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -422,6 +422,10 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, if (register_netdevice(dev) < 0) goto failed_free; + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + dev_hold(dev); ipgre_tunnel_link(ign, nt); return nt; -- cgit v1.1 From f18da14565819ba43b8321237e2426a2914cc2ef Mon Sep 17 00:00:00 2001 From: Stefan Gula Date: Thu, 26 Jan 2012 11:01:06 +0000 Subject: net: RTNETLINK adjusting values of min_ifinfo_dump_size Setting link parameters on a netdevice changes the value of if_nlmsg_size(), therefore it is necessary to recalculate min_ifinfo_dump_size. Signed-off-by: Stefan Gula Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f16444b..65aebd4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1509,6 +1509,9 @@ errout: if (send_addr_notify) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + min_ifinfo_dump_size = max_t(u16, if_nlmsg_size(dev), + min_ifinfo_dump_size); + return err; } -- cgit v1.1 From 8a8ee9aff6c3077dd9c2c7a77478e8ed362b96c6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jan 2012 14:04:53 +0000 Subject: net caif: Register properly as a pernet subsystem. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit caif is a subsystem and as such it needs to register with register_pernet_subsys instead of register_pernet_device. Among other problems using register_pernet_device was resulting in net_generic being called before the caif_net structure was allocated. Which has been causing net_generic to fail with either BUG_ON's or by return NULL pointers. A more ugly problem that could be caused is packets in flight why the subsystem is shutting down. To remove confusion also remove the cruft cause by inappropriately trying to fix this bug. With the aid of the previous patch I have tested this patch and confirmed that using register_pernet_subsys makes the failure go away as it should. Signed-off-by: Eric W. Biederman Acked-by: Sjur Brændeland Tested-by: Sasha Levin Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 22 ++-------------------- net/caif/cfcnfg.c | 1 - 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 673728a..82c5706 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -59,8 +59,6 @@ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); - if (!caifn) - return NULL; return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); @@ -69,8 +67,6 @@ static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); - if (!caifn) - return NULL; return &caifn->caifdevs; } @@ -99,8 +95,6 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) struct caif_device_entry *caifd; caifdevs = caif_device_list(dev_net(dev)); - if (!caifdevs) - return NULL; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) @@ -120,8 +114,6 @@ static struct caif_device_entry *caif_get(struct net_device *dev) struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - if (!caifdevs) - return NULL; list_for_each_entry_rcu(caifd, &caifdevs->list, list) { if (caifd->netdev == dev) @@ -321,8 +313,6 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct caif_device_entry_list *caifdevs; caifdevs = caif_device_list(dev_net(dev)); - if (!cfg || !caifdevs) - return; caifd = caif_device_alloc(dev); if (!caifd) return; @@ -374,8 +364,6 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); - if (!cfg || !caifdevs) - return 0; caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) @@ -507,9 +495,6 @@ static struct notifier_block caif_device_notifier = { static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); - if (WARN_ON(!caifn)) - return -EINVAL; - INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); @@ -527,9 +512,6 @@ static void caif_exit_net(struct net *net) caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); - if (!cfg || !caifdevs) - return; - rtnl_lock(); mutex_lock(&caifdevs->lock); @@ -569,7 +551,7 @@ static int __init caif_device_init(void) { int result; - result = register_pernet_device(&caif_net_ops); + result = register_pernet_subsys(&caif_net_ops); if (result) return result; @@ -582,7 +564,7 @@ static int __init caif_device_init(void) static void __exit caif_device_exit(void) { - unregister_pernet_device(&caif_net_ops); + unregister_pernet_subsys(&caif_net_ops); unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 598aafb..ba9cfd4 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -309,7 +309,6 @@ int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); - caif_assert(cfg != NULL); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, ¶m); -- cgit v1.1 From 4acb41903b2f99f3dffd4c3df9acc84ca5942cb2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 30 Jan 2012 01:20:17 +0000 Subject: net/tcp: Fix tcp memory limits initialization when !CONFIG_SYSCTL sysctl_tcp_mem() initialization was moved to sysctl_tcp_ipv4.c in commit 3dc43e3e4d0b52197d3205214fe8f162f9e0c334, since it became a per-ns value. That code, however, will never run when CONFIG_SYSCTL is disabled, leading to bogus values on those fields - causing hung TCP sockets. This patch fixes it by keeping an initialization code in tcp_init(). It will be overwritten by the first net namespace init if CONFIG_SYSCTL is compiled in, and do the right thing if it is compiled out. It is also named properly as tcp_init_mem(), to properly signal its non-sysctl side effect on TCP limits. Reported-by: Ingo Molnar Signed-off-by: Glauber Costa Cc: David S. Miller Link: http://lkml.kernel.org/r/4F22D05A.8030604@parallels.com [ renamed the function, tidied up the changelog a bit ] Signed-off-by: Ingo Molnar Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 1 + net/ipv4/tcp.c | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4aa7e9d..4cb9cd2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -814,6 +814,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; + tcp_init_mem(net); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bcdec3..06373b4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3216,6 +3216,16 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); +void tcp_init_mem(struct net *net) +{ + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + unsigned long limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; + net->ipv4.sysctl_tcp_mem[1] = limit; + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; +} + void __init tcp_init(void) { struct sk_buff *skb = NULL; @@ -3276,9 +3286,9 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1]) - << (PAGE_SHIFT - 7); + tcp_init_mem(&init_net); + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; -- cgit v1.1 From 5b35e1e6e9ca651e6b291c96d1106043c9af314a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 28 Jan 2012 17:29:46 +0000 Subject: tcp: fix tcp_trim_head() to adjust segment count with skb MSS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes tcp_trim_head() to recalculate the number of segments in the skb with the skb's existing MSS, so trimming the head causes the skb segment count to be monotonically non-increasing - it should stay the same or go down, but not increase. Previously tcp_trim_head() used the current MSS of the connection. But if there was a decrease in MSS between original transmission and ACK (e.g. due to PMTUD), this could cause tcp_trim_head() to counter-intuitively increase the segment count when trimming bytes off the head of an skb. This violated assumptions in tcp_tso_acked() that tcp_trim_head() only decreases the packet count, so that packets_acked in tcp_tso_acked() could underflow, leading tcp_clean_rtx_queue() to pass u32 pkts_acked values as large as 0xffffffff to ca_ops->pkts_acked(). As an aside, if tcp_trim_head() had really wanted the skb to reflect the current MSS, it should have called tcp_set_skb_tso_segs() unconditionally, since a decrease in MSS would mean that a single-packet skb should now be sliced into multiple segments. Signed-off-by: Neal Cardwell Acked-by: Nandita Dukkipati Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8de27..4ff3b6d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1141,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - /* Any change of skb->len requires recalculation of tso - * factor and mss. - */ + /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); return 0; } -- cgit v1.1 From 6f01fd6e6f6809061b56e78f1e8d143099716d70 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 28 Jan 2012 16:11:03 +0000 Subject: af_unix: fix EPOLLET regression for stream sockets Commit 0884d7aa24 (AF_UNIX: Fix poll blocking problem when reading from a stream socket) added a regression for epoll() in Edge Triggered mode (EPOLLET) Appropriate fix is to use skb_peek()/skb_unlink() instead of skb_dequeue(), and only call skb_unlink() when skb is fully consumed. This remove the need to requeue a partial skb into sk_receive_queue head and the extra sk->sk_data_ready() calls that added the regression. This is safe because once skb is given to sk_receive_queue, it is not modified by a writer, and readers are serialized by u->readlock mutex. This also reduce number of spinlock acquisition for small reads or MSG_PEEK users so should improve overall performance. Reported-by: Nick Mathewson Signed-off-by: Eric Dumazet Cc: Alexey Moiseytsev Signed-off-by: David S. Miller --- net/unix/af_unix.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aad8fb6..85d3bb7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1918,7 +1918,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; unix_state_lock(sk); - skb = skb_dequeue(&sk->sk_receive_queue); + skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) { unix_sk(sk)->recursion_level = 0; if (copied >= target) @@ -1958,11 +1958,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (check_creds) { /* Never glue messages from different writers */ if ((UNIXCB(skb).pid != siocb->scm->pid) || - (UNIXCB(skb).cred != siocb->scm->cred)) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + (UNIXCB(skb).cred != siocb->scm->cred)) break; - } } else { /* Copy credentials */ scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); @@ -1977,8 +1974,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, chunk = min_t(unsigned int, skb->len, size); if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); if (copied == 0) copied = -EFAULT; break; @@ -1993,13 +1988,10 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); - /* put the skb back if we didn't use it up.. */ - if (skb->len) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + if (skb->len) break; - } + skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (siocb->scm->fp) @@ -2010,9 +2002,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); - /* put message back and return */ - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); break; } } while (size); -- cgit v1.1 From efcdbf24fd5daa88060869e51ed49f68b7ac8708 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Mon, 30 Jan 2012 14:16:06 -0800 Subject: net: Disambiguate kernel message Some of our machines were reporting: TCP: too many of orphaned sockets even when the number of orphaned sockets was well below the limit. We print a different message depending on whether we're out of TCP memory or there are too many orphaned sockets. Also move the check out of line and cleanup the messages that were printed. Signed-off-by: Arun Sharma Suggested-by: Mohan Srinivasan Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: David Miller Cc: Glauber Costa Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 19 +++++++++++++++---- net/ipv4/tcp_timer.c | 5 +---- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 06373b4..a34f5cf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1876,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how) } EXPORT_SYMBOL(tcp_shutdown); +bool tcp_check_oom(struct sock *sk, int shift) +{ + bool too_many_orphans, out_of_socket_memory; + + too_many_orphans = tcp_too_many_orphans(sk, shift); + out_of_socket_memory = tcp_out_of_memory(sk); + + if (too_many_orphans && net_ratelimit()) + pr_info("TCP: too many orphaned sockets\n"); + if (out_of_socket_memory && net_ratelimit()) + pr_info("TCP: out of memory -- consider tuning tcp_mem\n"); + return too_many_orphans || out_of_socket_memory; +} + void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -2015,10 +2029,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, 0)) { - if (net_ratelimit()) - printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a516d1e..cd2e072 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (sk->sk_err_soft) shift++; - if (tcp_too_many_orphans(sk, shift)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - + if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || -- cgit v1.1 From 786f528119722f564a22ad953411374e06116333 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 1 Feb 2012 09:32:25 +0000 Subject: ethtool: Null-terminate filename passed to ethtool_ops::flash_device The parameters for ETHTOOL_FLASHDEV include a filename, which ought to be null-terminated. Currently the only driver that implements ethtool_ops::flash_device attempts to add a null terminator if necessary, but does it wrongly. Do it in the ethtool core instead. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 369b418..3f79db1 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1190,6 +1190,8 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev, if (!dev->ethtool_ops->flash_device) return -EOPNOTSUPP; + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + return dev->ethtool_ops->flash_device(dev, &efl); } -- cgit v1.1 From 07ae2dfcf4f7143ce191c6436da1c33f179af0d6 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 1 Feb 2012 18:48:09 +0200 Subject: mac80211: timeout a single frame in the rx reorder buffer The current code checks for stored_mpdu_num > 1, causing the reorder_timer to be triggered indefinitely, but the frame is never timed-out (until the next packet is received) Signed-off-by: Eliad Peller Cc: Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7514091..5a5e504 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -611,7 +611,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { + tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any * frames in the reorder buffer have timed out. -- cgit v1.1 From c43b874d5d714f271b80d4c3f49e05d0cbf51ed2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 2 Feb 2012 00:07:00 +0000 Subject: tcp: properly initialize tcp memory limits Commit 4acb4190 tries to fix the using uninitialized value introduced by commit 3dc43e3, but it would make the per-socket memory limits too small. This patch fixes this and also remove the redundant codes introduced in 4acb4190. Signed-off-by: Jason Wang Acked-by: Glauber Costa Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 6 ------ net/ipv4/tcp.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4cb9cd2..7a7724d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; - unsigned long limit; table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -815,11 +814,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; tcp_init_mem(net); - limit = nr_free_buffer_pages() / 8; - limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a34f5cf..37755cc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3229,7 +3229,6 @@ __setup("thash_entries=", set_thash_entries); void tcp_init_mem(struct net *net) { - /* Set per-socket limits to no more than 1/128 the pressure threshold */ unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; @@ -3298,7 +3297,8 @@ void __init tcp_init(void) sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(&init_net); - limit = nr_free_buffer_pages() / 8; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); -- cgit v1.1 From b01377a4200d0dfc7b04a8daabb4739727353703 Mon Sep 17 00:00:00 2001 From: "sjur.brandeland@stericsson.com" Date: Thu, 2 Feb 2012 01:21:02 +0000 Subject: caif: Bugfix list_del_rcu race in cfmuxl_ctrlcmd. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always use cfmuxl_remove_uplayer when removing a up-layer. cfmuxl_ctrlcmd() can be called independently and in parallel with cfmuxl_remove_uplayer(). The race between them could cause list_del_rcu to be called on a node which has been already taken out from the list. That lead to a (rare) crash on accessing poisoned node->prev inside list_del_rcu. This fix ensures that deletion are done holding the same lock. Reported-by: Dmitry Tarnyagin Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/cfmuxl.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index b36f24a..94b0861 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -248,7 +248,6 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; - int idx; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { @@ -257,14 +256,9 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && - layer->id != 0) { - - idx = layer->id % UP_CACHE_SIZE; - spin_lock_bh(&muxl->receive_lock); - RCU_INIT_POINTER(muxl->up_cache[idx], NULL); - list_del_rcu(&layer->node); - spin_unlock_bh(&muxl->receive_lock); - } + layer->id != 0) + cfmuxl_remove_uplayer(layr, layer->id); + /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } -- cgit v1.1 From ba7605745d5c99f0e71b3ec6c7cb5ed6afe540ad Mon Sep 17 00:00:00 2001 From: Dmitry Tarnyagin Date: Thu, 2 Feb 2012 01:21:03 +0000 Subject: caif: Bugfix double kfree_skb upon xmit failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKB is freed twice upon send error. The Network stack consumes SKB even when it returns error code. Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a986280..a97d97a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -539,8 +539,10 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); - if (cf_sk->layer.dn == NULL) + if (cf_sk->layer.dn == NULL) { + kfree_skb(skb); return -EINVAL; + } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } @@ -683,10 +685,10 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); - if (err < 0) { - kfree_skb(skb); + if (err < 0) + /* skb is already freed */ goto pipe_err; - } + sent += size; } -- cgit v1.1 From 5962b35c1de3254a2f03b95efd3b7854b874d7b7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 3 Feb 2012 05:18:43 +0000 Subject: netprio_cgroup: Fix obo in get_prioidx It was recently pointed out to me that the get_prioidx function sets a bit in the prioidx map prior to checking to see if the index being set is out of bounds. This patch corrects that, avoiding the possiblity of us writing beyond the end of the array Signed-off-by: Neil Horman Reported-by: Stanislaw Gruszka CC: Stanislaw Gruszka CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd48..9ae183a 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -58,11 +58,12 @@ static int get_prioidx(u32 *prio) spin_lock_irqsave(&prioidx_map_lock, flags); prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { + spin_unlock_irqrestore(&prioidx_map_lock, flags); + return -ENOSPC; + } set_bit(prioidx, prioidx_map); spin_unlock_irqrestore(&prioidx_map_lock, flags); - if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) - return -ENOSPC; - atomic_set(&max_prioidx, prioidx); *prio = prioidx; return 0; -- cgit v1.1