diff options
author | David S. Miller <davem@davemloft.net> | 2015-10-03 04:32:52 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-10-03 04:32:52 -0700 |
commit | c3fc7ac9a0b978ee8538058743d21feef25f7b33 (patch) | |
tree | 0caf05649d27830ba0f9548704abbb1ec4b5bb91 /net/ipv6/tcp_ipv6.c | |
parent | f6d3125fa3c2f55ddf7cf69365c41089de6cfae6 (diff) | |
parent | e994b2f0fb9229aeff5eea9541320bd7b2ca8714 (diff) | |
download | op-kernel-dev-c3fc7ac9a0b978ee8538058743d21feef25f7b33.zip op-kernel-dev-c3fc7ac9a0b978ee8538058743d21feef25f7b33.tar.gz |
Merge branch 'tcp-lockless-listener'
Eric Dumazet says:
====================
tcp/dccp: lockless listener
TCP listener refactoring : this is becoming interesting !
This patch series takes the steps to use normal TCP/DCCP ehash
table to store SYN_RECV requests, instead of the private per-listener
hash table we had until now.
SYNACK skb are now attached to their syn_recv request socket,
so that we no longer heavily modify listener sk_wmem_alloc.
listener lock is no longer held in fast path, including
SYNCOOKIE mode.
During my tests, my server was able to process 3,500,000
SYN packets per second on one listener and still had available
cpu cycles.
That is about 2 to 3 order of magnitude what we had with older kernels.
This effort started two years ago and I am pleased to reach expectations.
We'll probably extend SO_REUSEPORT to add proper cpu/numa affinities,
so that heavy duty TCP servers can get proper siloing thanks to multi-queues
NIC.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6/tcp_ipv6.c')
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 117 |
1 files changed, 59 insertions, 58 deletions
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2ae95e1..33334f0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, @@ -622,8 +623,12 @@ clear_hash_noput: return 1; } -static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +#endif + +static bool tcp_v6_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -660,9 +665,9 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) &ip6h->daddr, ntohs(th->dest)); return true; } +#endif return false; } -#endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, @@ -723,7 +728,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_sequence, .send_synack = tcp_v6_send_synack, - .queue_hash_add = inet6_csk_reqsk_queue_hash_add, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, @@ -934,37 +938,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, } -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - /* Find possible connection requests. */ - req = inet6_csk_search_req(sk, th->source, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - tcp_v6_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v6_check(sk, skb); #endif @@ -1183,7 +1161,7 @@ out: } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1254,18 +1232,14 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v6_hnd_req(sk, skb); + struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) @@ -1398,6 +1372,33 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + tcp_v6_fill_cb(skb, hdr, th); + if (tcp_v6_inbound_md5_hash(sk, skb)) { + reqsk_put(req); + goto discard_it; + } + if (sk->sk_state == TCP_LISTEN) + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + tcp_v6_restore_cb(skb); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v6_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1408,17 +1409,21 @@ process: tcp_v6_fill_cb(skb, hdr, th); -#ifdef CONFIG_TCP_MD5SIG if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v6_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1433,6 +1438,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret ? -1 : 0; @@ -1633,7 +1639,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - struct request_sock *req, int i, kuid_t uid) + const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; @@ -1657,7 +1663,8 @@ static void get_openreq6(struct seq_file *seq, 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, - from_kuid_munged(seq_user_ns(seq), uid), + from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1762,18 +1769,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait6_sock(seq, v, st->num); - else - get_tcp6_sock(seq, v, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq6(seq, v, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq6(seq, v, st->num); + else + get_tcp6_sock(seq, v, st->num); out: return 0; } |