From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Fri, 31 Aug 2012 12:29:12 +0000
Subject: tcp: TCP Fast Open Server - support TFO listeners

This patch builds on top of the previous patch to add the support
for TFO listeners. This includes -

1. allocating, properly initializing, and managing the per listener
fastopen_queue structure when TFO is enabled

2. changes to the inet_csk_accept code to support TFO. E.g., the
request_sock can no longer be freed upon accept(), not until 3WHS
finishes

3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg()
if it's a TFO socket

4. properly closing a TFO listener, and a TFO socket before 3WHS
finishes

5. supporting TCP_FASTOPEN socket option

6. modifying tcp_check_req() to use to check a TFO socket as well
as request_sock

7. supporting TCP's TFO cookie option

8. adding a new SYN-ACK retransmit handler to use the timer directly
off the TFO socket rather than the listener socket. Note that TFO
server side will not retransmit anything other than SYN-ACK until
the 3WHS is completed.

The patch also contains an important function
"reqsk_fastopen_remove()" to manage the somewhat complex relation
between a listener, its request_sock, and the corresponding child
socket. See the comment above the function for the detail.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 49 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4..df83d74 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-	/* Connected? */
-	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+	/* Connected or passive Fast Open socket? */
+	if (sk->sk_state != TCP_SYN_SENT &&
+	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 		if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
+	}
 
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto do_error;
+	}
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
 		 * they look as CLOSING or LAST_ACK for Linux)
 		 * Probably, I missed some more holelets.
 		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
 		 */
 		tcp_send_fin(sk);
 	}
@@ -2215,8 +2230,16 @@ adjudge_to_death:
 		}
 	}
 
-	if (sk->sk_state == TCP_CLOSE)
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req != NULL)
+			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
+	}
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
+
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN)))
+			err = fastopen_init_queue(sk, val);
+		else
+			err = -EINVAL;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
-- 
cgit v1.1


From bb68b64724a4fd6b93d83b39aeffa4aadb2562fc Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Tue, 18 Sep 2012 14:19:23 +0000
Subject: ipv4: Don't add TCP-code in inet_sock_destruct

Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Acked-by: H.K. Jerry Chu <hkchu@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index df83d74..7b1e940 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2325,6 +2325,13 @@ int tcp_disconnect(struct sock *sk, int flags)
 }
 EXPORT_SYMBOL(tcp_disconnect);
 
+void tcp_sock_destruct(struct sock *sk)
+{
+	inet_sock_destruct(sk);
+
+	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
+}
+
 static inline bool tcp_can_repair_sock(const struct sock *sk)
 {
 	return capable(CAP_NET_ADMIN) &&
-- 
cgit v1.1


From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 23 Sep 2012 23:04:42 +0000
Subject: net: use a per task frag allocator

We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.

This page is used to build fragments for skbs.

Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)

But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page

Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.

This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.

(up to 32768 bytes per frag, thats order-3 pages on x86)

This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.

Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536

Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 79 ++++++++++++++++------------------------------------------
 1 file changed, 22 insertions(+), 57 deletions(-)

(limited to 'net/ipv4/tcp.c')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7b1e940..72ea475 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1150,78 +1150,43 @@ new_segment:
 				if (err)
 					goto do_fault;
 			} else {
-				bool merge = false;
+				bool merge = true;
 				int i = skb_shinfo(skb)->nr_frags;
-				struct page *page = sk->sk_sndmsg_page;
-				int off;
-
-				if (page && page_count(page) == 1)
-					sk->sk_sndmsg_off = 0;
-
-				off = sk->sk_sndmsg_off;
-
-				if (skb_can_coalesce(skb, i, page, off) &&
-				    off != PAGE_SIZE) {
-					/* We can extend the last page
-					 * fragment. */
-					merge = true;
-				} else if (i == MAX_SKB_FRAGS || !sg) {
-					/* Need to add new fragment and cannot
-					 * do this because interface is non-SG,
-					 * or because all the page slots are
-					 * busy. */
-					tcp_mark_push(tp, skb);
-					goto new_segment;
-				} else if (page) {
-					if (off == PAGE_SIZE) {
-						put_page(page);
-						sk->sk_sndmsg_page = page = NULL;
-						off = 0;
+				struct page_frag *pfrag = sk_page_frag(sk);
+
+				if (!sk_page_frag_refill(sk, pfrag))
+					goto wait_for_memory;
+
+				if (!skb_can_coalesce(skb, i, pfrag->page,
+						      pfrag->offset)) {
+					if (i == MAX_SKB_FRAGS || !sg) {
+						tcp_mark_push(tp, skb);
+						goto new_segment;
 					}
-				} else
-					off = 0;
+					merge = false;
+				}
 
-				if (copy > PAGE_SIZE - off)
-					copy = PAGE_SIZE - off;
+				copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
 				if (!sk_wmem_schedule(sk, copy))
 					goto wait_for_memory;
 
-				if (!page) {
-					/* Allocate new cache page. */
-					if (!(page = sk_stream_alloc_page(sk)))
-						goto wait_for_memory;
-				}
-
-				/* Time to copy data. We are close to
-				 * the end! */
 				err = skb_copy_to_page_nocache(sk, from, skb,
-							       page, off, copy);
-				if (err) {
-					/* If this page was new, give it to the
-					 * socket so it does not get leaked.
-					 */
-					if (!sk->sk_sndmsg_page) {
-						sk->sk_sndmsg_page = page;
-						sk->sk_sndmsg_off = 0;
-					}
+							       pfrag->page,
+							       pfrag->offset,
+							       copy);
+				if (err)
 					goto do_error;
-				}
 
 				/* Update the skb. */
 				if (merge) {
 					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 				} else {
-					skb_fill_page_desc(skb, i, page, off, copy);
-					if (sk->sk_sndmsg_page) {
-						get_page(page);
-					} else if (off + copy < PAGE_SIZE) {
-						get_page(page);
-						sk->sk_sndmsg_page = page;
-					}
+					skb_fill_page_desc(skb, i, pfrag->page,
+							   pfrag->offset, copy);
+					get_page(pfrag->page);
 				}
-
-				sk->sk_sndmsg_off = off + copy;
+				pfrag->offset += copy;
 			}
 
 			if (!copied)
-- 
cgit v1.1