From b2e4f544fddc812d6fe802bab5f600b4b783f45d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 May 2012 16:39:45 -0600 Subject: userns: Convert net/core/scm.c to use kuids and kgids With the existence of kuid_t and kgid_t we can take this further and remove the usage of struct cred altogether, ensuring we don't get cache line misses from reference counts. For now however start simply and do a straight forward conversion I can be certain is correct. In cred_to_ucred use from_kuid_munged and from_kgid_munged as these values are going directly to userspace and we want to use the userspace safe values not -1 when reporting a value that does not map. The earlier conversion that used from_kuid was buggy in that respect. Oops. Cc: Eric Dumazet Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- net/core/scm.c | 31 +++++++++++++++++++++++-------- net/core/sock.c | 4 ++-- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index 8f6ccfd..5472ae7 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -45,12 +45,17 @@ static __inline__ int scm_check_creds(struct ucred *creds) { const struct cred *cred = current_cred(); + kuid_t uid = make_kuid(cred->user_ns, creds->uid); + kgid_t gid = make_kgid(cred->user_ns, creds->gid); + + if (!uid_valid(uid) || !gid_valid(gid)) + return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && - ((creds->uid == cred->uid || creds->uid == cred->euid || - creds->uid == cred->suid) || capable(CAP_SETUID)) && - ((creds->gid == cred->gid || creds->gid == cred->egid || - creds->gid == cred->sgid) || capable(CAP_SETGID))) { + ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || + uid_eq(uid, cred->suid)) || capable(CAP_SETUID)) && + ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || + gid_eq(gid, cred->sgid)) || capable(CAP_SETGID))) { return 0; } return -EPERM; @@ -149,6 +154,9 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) goto error; break; case SCM_CREDENTIALS: + { + kuid_t uid; + kgid_t gid; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) goto error; memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); @@ -166,22 +174,29 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) p->pid = pid; } + err = -EINVAL; + uid = make_kuid(current_user_ns(), p->creds.uid); + gid = make_kgid(current_user_ns(), p->creds.gid); + if (!uid_valid(uid) || !gid_valid(gid)) + goto error; + if (!p->cred || - (p->cred->euid != p->creds.uid) || - (p->cred->egid != p->creds.gid)) { + !uid_eq(p->cred->euid, uid) || + !gid_eq(p->cred->egid, gid)) { struct cred *cred; err = -ENOMEM; cred = prepare_creds(); if (!cred) goto error; - cred->uid = cred->euid = p->creds.uid; - cred->gid = cred->egid = p->creds.gid; + cred->uid = cred->euid = uid; + cred->gid = cred->egid = gid; if (p->cred) put_cred(p->cred); p->cred = cred; } break; + } default: goto error; } diff --git a/net/core/sock.c b/net/core/sock.c index 6b654b3..9c7fe4f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -868,8 +868,8 @@ void cred_to_ucred(struct pid *pid, const struct cred *cred, if (cred) { struct user_namespace *current_ns = current_user_ns(); - ucred->uid = from_kuid(current_ns, cred->euid); - ucred->gid = from_kgid(current_ns, cred->egid); + ucred->uid = from_kuid_munged(current_ns, cred->euid); + ucred->gid = from_kgid_munged(current_ns, cred->egid); } } EXPORT_SYMBOL_GPL(cred_to_ucred); -- cgit v1.1 From d04a48b06d63b6d6e9289ca8a5e6e84ebfe39bfd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 May 2012 17:01:57 -0600 Subject: userns: Convert __dev_set_promiscuity to use kuids in audit logs Cc: Klaus Heinrich Kiwi Cc: Eric Paris Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- net/core/dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0cb3fe8..026bb4a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4492,8 +4492,8 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; ASSERT_RTNL(); @@ -4525,7 +4525,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), audit_get_loginuid(current), - uid, gid, + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } -- cgit v1.1 From 976d020150456fccbd34103fd117fab910eed09c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 May 2012 17:16:53 -0600 Subject: userns: Convert sock_i_uid to return a kuid_t Acked-by: David S. Miller Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 9c7fe4f..5c6a435 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1526,12 +1526,12 @@ void sock_edemux(struct sk_buff *skb) } EXPORT_SYMBOL(sock_edemux); -int sock_i_uid(struct sock *sk) +kuid_t sock_i_uid(struct sock *sk) { - int uid; + kuid_t uid; read_lock_bh(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; read_unlock_bh(&sk->sk_callback_lock); return uid; } -- cgit v1.1 From 748e2d9396a18c3fd3d07d47c1b41320acf1fbf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Aug 2012 21:50:59 +0000 Subject: net: reinstate rtnl in call_netdevice_notifiers() Eric Biederman pointed out that not holding RTNL while calling call_netdevice_notifiers() was racy. This patch is a direct transcription his feedback against commit 0115e8e30d6fc (net: remove delay at device dismantle) Thanks Eric ! Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Cc: "Eric W. Biederman" Cc: Gao feng Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++-- net/core/fib_rules.c | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0640d2a..bc857fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1466,8 +1466,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - if (val != NETDEV_UNREGISTER_FINAL) - ASSERT_RTNL(); + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -5782,7 +5781,11 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + __rtnl_unlock(); rcu_barrier(); + rtnl_lock(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -5855,7 +5858,9 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); + rtnl_lock(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + __rtnl_unlock(); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 5850937..ab7db83 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -711,16 +711,15 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct net *net = dev_net(dev); struct fib_rules_ops *ops; + ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: - ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: - ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; -- cgit v1.1 From 3afa6d00fb4f9712fbb44b63ba31f88b6f9239fe Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 20 Aug 2012 07:59:10 +0000 Subject: cls_cgroup: Allow classifier cgroups to have their classid reset to 0 The network classifier cgroup initalizes each cgroups instance classid value to 0. However, the sock_update_classid function only updates classid's in sockets if the tasks cgroup classid is not zero, and if it differs from the current classid. The later check is to prevent cache line dirtying, but the former is detrimental, as it prevents resetting a classid for a cgroup to 0. While this is not a common action, it has administrative usefulness (if the admin wants to disable classification of a certain group temporarily for instance). Easy fix, just remove the zero check. Tested successfully by myself Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 8f67ced..116786c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1230,7 +1230,7 @@ void sock_update_classid(struct sock *sk) rcu_read_lock(); /* doing current task, which cannot vanish. */ classid = task_cls_classid(current); rcu_read_unlock(); - if (classid && classid != sk->sk_classid) + if (classid != sk->sk_classid) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); -- cgit v1.1 From 8f4cccbbd92f2ad0ddbbc498ef7cee2a1c3defe9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 20 Aug 2012 22:16:51 +0100 Subject: net: Set device operstate at registration time The operstate of a device is initially IF_OPER_UNKNOWN and is updated asynchronously by linkwatch after each change of carrier state reported by the driver. The default carrier state of a net device is on, and this will never be changed on drivers that do not support carrier detection, thus the operstate remains IF_OPER_UNKNOWN. For devices that do support carrier detection, the driver must set the carrier state to off initially, then poll the hardware state when the device is opened. However, we must not activate linkwatch for a unregistered device, and commit b473001 ('net: Do not fire linkwatch events until the device is registered.') ensured that we don't. But this means that the operstate for many devices that support carrier detection remains IF_OPER_UNKNOWN when it should be IF_OPER_DOWN. The same issue exists with the dormant state. The proper initialisation sequence, avoiding a race with opening of the device, is: rtnl_lock(); rc = register_netdevice(dev); if (rc) goto out_unlock; netif_carrier_off(dev); /* or netif_dormant_on(dev) */ rtnl_unlock(); but it seems silly that this should have to be repeated in so many drivers. Further, the operstate seen immediately after opening the device may still be IF_OPER_UNKNOWN due to the asynchronous nature of linkwatch. Commit 22604c8 ('net: Fix for initial link state in 2.6.28') attempted to fix this by setting the operstate synchronously, but it was reverted as it could lead to deadlock. This initialises the operstate synchronously at registration time only. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ net/core/link_watch.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bc857fe..2f25d0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5648,6 +5648,8 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); + linkwatch_init_dev(dev); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c3519c6..a019222 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -76,6 +76,14 @@ static void rfc2863_policy(struct net_device *dev) } +void linkwatch_init_dev(struct net_device *dev) +{ + /* Handle pre-registration link state changes */ + if (!netif_carrier_ok(dev) || netif_dormant(dev)) + rfc2863_policy(dev); +} + + static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) -- cgit v1.1 From 072a9c48600409d72aeb0d5b29fbb75861a06631 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 24 Aug 2012 21:41:11 +0000 Subject: netpoll: revert 6bdb7fe3104 and fix be_poll() instead Against -net. In the patch "netpoll: re-enable irq in poll_napi()", I tried to fix the following warning: [100718.051041] ------------[ cut here ]------------ [100718.051048] WARNING: at kernel/softirq.c:159 local_bh_enable_ip+0x7d/0xb0() (Not tainted) [100718.051049] Hardware name: ProLiant BL460c G7 ... [100718.051068] Call Trace: [100718.051073] [] ? warn_slowpath_common+0x87/0xc0 [100718.051075] [] ? warn_slowpath_null+0x1a/0x20 [100718.051077] [] ? local_bh_enable_ip+0x7d/0xb0 [100718.051080] [] ? _spin_unlock_bh+0x1b/0x20 [100718.051085] [] ? be_process_mcc+0x74/0x230 [be2net] [100718.051088] [] ? be_poll_tx_mcc+0x16c/0x290 [be2net] [100718.051090] [] ? netpoll_poll_dev+0xd6/0x490 [100718.051095] [] ? bond_poll_controller+0x75/0x80 [bonding] [100718.051097] [] ? netpoll_poll_dev+0x45/0x490 [100718.051100] [] ? ksize+0x19/0x80 [100718.051102] [] ? netpoll_send_skb_on_dev+0x157/0x240 by reenabling IRQ before calling ->poll, but it seems more problems are introduced after that patch: http://ozlabs.org/~akpm/stuff/IMG_20120824_122054.jpg http://marc.info/?l=linux-netdev&m=134563282530588&w=2 So it is safe to fix be2net driver code directly. This patch reverts the offending commit and fixes be_poll() by avoid disabling BH there, this is okay because be_poll() can be called either by poll_napi() which already disables IRQ, or by net_rx_action() which already disables BH. Reported-by: Andrew Morton Reported-by: Sylvain Munaut Cc: Sylvain Munaut Cc: Andrew Morton Cc: David Miller Cc: Sathya Perla Cc: Subbu Seetharaman Cc: Ajit Khaparde Signed-off-by: Cong Wang Tested-by: Sylvain Munaut Signed-off-by: David S. Miller --- net/core/netpoll.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 346b1eb..e4ba3e7 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -168,24 +168,16 @@ static void poll_napi(struct net_device *dev) struct napi_struct *napi; int budget = 16; - WARN_ON_ONCE(!irqs_disabled()); - list_for_each_entry(napi, &dev->napi_list, dev_list) { - local_irq_enable(); if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - rcu_read_lock_bh(); budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), napi, budget); - rcu_read_unlock_bh(); spin_unlock(&napi->poll_lock); - if (!budget) { - local_irq_disable(); + if (!budget) break; - } } - local_irq_disable(); } } -- cgit v1.1 From 6549dd43c04327071571edf7aea4465b16539422 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 23 Aug 2012 15:36:55 +0000 Subject: net: dev: fix the incorrect hold of net namespace's lo device When moving a net device from one net namespace to another net namespace,dev_change_net_namespace calls NETDEV_DOWN event,so the original net namespace's dst entries which beloned to this net device will be put into dst_garbage list. then dev_change_net_namespace will set this net device's net to the new net namespace. If we unregister this net device's driver, this will trigger the NETDEV_UNREGISTER_FINAL event, dst_ifdown will be called, and get this net device's dst entries from dst_garbage list, put these entries' dev to the new net namespace's lo device. It's not what we want,actually we need these dst entries hold the original net namespace's lo device,this incorrect device holding will trigger emg message like below. unregister_netdevice: waiting for lo to become free. Usage count = 1 so we should call NETDEV_UNREGISTER_FINAL event in dev_change_net_namespace too,in order to make sure dst entries already in the dst_garbage list, we need rcu_barrier before we call NETDEV_UNREGISTER_FINAL event. With help form Eric Dumazet. Signed-off-by: Gao feng Cc: Eric Dumazet Cc: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3401e2d..a5fc3e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6259,6 +6259,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* -- cgit v1.1 From ee13040901c11b016d996ab5a946acaaa3461737 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Aug 2012 01:47:26 +0000 Subject: netpoll: provide an IP ident in UDP frames Let's fill IP header ident field with a meaningful value, it might help some setups. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 346b1eb..5af9c26 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -388,6 +388,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; + static atomic_t ip_ident; udp_len = len + sizeof(*udph); ip_len = udp_len + sizeof(*iph); @@ -423,7 +424,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) put_unaligned(0x45, (unsigned char *)iph); iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = 0; + iph->id = htons(atomic_inc_return(&ip_ident)); iph->frag_off = 0; iph->ttl = 64; iph->protocol = IPPROTO_UDP; -- cgit v1.1 From d1a53dfd114a6c53464de116cdab88d934bfe92f Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Mon, 27 Aug 2012 23:39:24 +0000 Subject: net: fix documentation of skb_needs_linearize(). skb_needs_linearize() does not check highmem DMA as it does not call illegal_highdma() anymore, so there is no need to mention highmem DMA here. (Indeed, ~NETIF_F_SG flag, which is checked in skb_needs_linearize(), can be set when illegal_highdma() returns true, and we are assured that illegal_highdma() is invoked prior to skb_needs_linearize() as skb_needs_linearize() is a static method called only once. But ~NETIF_F_SG can be set not only there in this same invocation path. It can also be set when can_checksum_protocol() returns false). see commit 02932ce9e2c136e6fab2571c8e0dd69ae8ec9853, Convert skb_need_linearize() to use precomputed features. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a5fc3e3..b1e6d63 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2184,9 +2184,7 @@ EXPORT_SYMBOL(netif_skb_features); /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG, or if - * at least one of fragments is in highmem and device does not - * support DMA from it. + * 2. skb is fragmented and the device does not support SG. */ static inline int skb_needs_linearize(struct sk_buff *skb, int features) -- cgit v1.1 From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:12 +0000 Subject: tcp: TCP Fast Open Server - support TFO listeners This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/core/request_sock.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'net/core') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 9b570a6..c31d9e8 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) kfree(lopt); } +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. A corner + * case might also exist in tcp_v4_hnd_req() that will trigger this locking + * order. + * + * When a TFO req is created, it needs to sock_hold its listener to prevent + * the latter data structure from going away. + * + * This function also sets "treq->listener" to NULL and unreference listener + * socket. treq->listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = tcp_rsk(req)->listener; + struct fastopen_queue *fastopenq = + inet_csk(lsk)->icsk_accept_queue.fastopenq; + + BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk)); + + tcp_sk(sk)->fastopen_rsk = NULL; + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->listener = NULL; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + reqsk_free(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. + */ + req->expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + return; +} -- cgit v1.1