author    rrs <rrs@FreeBSD.org>    2015-12-16 00:56:45 +0000
committer rrs <rrs@FreeBSD.org>    2015-12-16 00:56:45 +0000
commit    50f477e18282c69de187b4f12997286e5dacd64c (patch)
tree      783c0558e06b050ee57c6af56f4c51a9f65359e7
parent    7541e09c63c2e2c2fcea0611970bc84fe7b87372 (diff)
First cut of the modularization of our TCP stack. Still to do is to clean up
the timer handling using the async-drain. Other optimizations may be coming
to go with this. What's here will allow different TCP implementations (one
included).

Reviewed by:    jtl, hiren, transports
Sponsored by:   Netflix Inc.
Differential Revision:  D4055
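Behind this change, every connection carries a pointer to a function block (tp->t_fb), and the base code calls through it — tfb_tcp_do_segment() and tfb_tcp_output() — instead of calling tcp_do_segment() and tcp_output() directly, as the tcp_input.c hunks below show. A rough sketch of the shape such a block might take follows; everything beyond the two function pointers that are actually referenced in this diff (the struct name, the name field, and how a stack registers itself) is an assumption, since the tcp_var.h and tcp_subr.c hunks are not part of this excerpt.

/*
 * Sketch only: tfb_tcp_output and tfb_tcp_do_segment are the two
 * members actually referenced in the hunks below (via tp->t_fb);
 * the struct name, the name field, and the registration step are
 * assumed, not taken from this diff.
 */
struct tcp_function_block {
	char	tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
	int	(*tfb_tcp_output)(struct tcpcb *);
	void	(*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
		    struct socket *, struct tcpcb *, int, int, uint8_t, int);
};

/* An alternate stack such as fastpath.c would fill one in at load time. */
static struct tcp_function_block fastpath_block = {
	.tfb_tcp_block_name = "fastpath",
	.tfb_tcp_output = tcp_output,		/* reuse the base output path */
	.tfb_tcp_do_segment = tcp_do_segment_fastslow,
};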
-rw-r--r--  sys/modules/Makefile               |    1
-rw-r--r--  sys/modules/tcp/fastpath/Makefile  |   15
-rw-r--r--  sys/netinet/tcp.h                  |    8
-rw-r--r--  sys/netinet/tcp_input.c            |   62
-rw-r--r--  sys/netinet/tcp_sack.c             |    2
-rw-r--r--  sys/netinet/tcp_stacks/fastpath.c  | 2486
-rw-r--r--  sys/netinet/tcp_subr.c             |  312
-rw-r--r--  sys/netinet/tcp_syncache.c         |   22
-rw-r--r--  sys/netinet/tcp_timer.c            |   21
-rw-r--r--  sys/netinet/tcp_usrreq.c           |  105
-rw-r--r--  sys/netinet/tcp_var.h              |   80
-rw-r--r--  sys/netinet/toecore.c              |    2
12 files changed, 3046 insertions, 70 deletions
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 85c3553..dc95750 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -346,6 +346,7 @@ SUBDIR= \
${_syscons} \
sysvipc \
${_ti} \
+ tcp/fastpath \
tests/framework \
tests/callout_test \
tl \
diff --git a/sys/modules/tcp/fastpath/Makefile b/sys/modules/tcp/fastpath/Makefile
new file mode 100644
index 0000000..526b39a
--- /dev/null
+++ b/sys/modules/tcp/fastpath/Makefile
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
+
+KMOD= fastpath
+SRCS= fastpath.c
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+.include <bsd.kmod.mk>
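This Makefile builds the new stack as a loadable module, fastpath.ko, from sys/netinet/tcp_stacks/fastpath.c. A loadable stack also needs the usual kld boilerplate so it can hook itself in at kldload time; a minimal sketch is below. The event-handler bodies are assumptions — whatever registration/deregistration routines tcp_subr.c gained in this commit are not shown in the hunks here.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/module.h>

static int
fastpath_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		/* Register this stack's tcp_function_block here (assumed). */
		return (0);
	case MOD_UNLOAD:
		/* Deregister; refuse if connections still use the stack. */
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t fastpath_moddata = {
	"fastpath",
	fastpath_modevent,
	NULL
};

DECLARE_MODULE(fastpath, fastpath_moddata, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_VERSION(fastpath, 1);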
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 3760860..294e994 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -167,7 +167,7 @@ struct tcphdr {
#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */
#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
-
+#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -245,5 +245,11 @@ struct tcp_info {
u_int32_t __tcpi_pad[26]; /* Padding. */
};
#endif
+#define TCP_FUNCTION_NAME_LEN_MAX 32
+
+struct tcp_function_set {
+ char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
+ uint32_t pcbcnt;
+};
#endif /* !_NETINET_TCP_H_ */
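With the TCP_FUNCTION_BLK option and struct tcp_function_set added above, an application can ask that a socket use a particular registered stack (a getsockopt with the same option presumably reports the current stack's name and, in pcbcnt, how many PCBs use it). A minimal userland sketch, assuming the module registers its block under the name "fastpath" — the actual registered name is set in fastpath.c/tcp_subr.c code not shown in this excerpt:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <err.h>

/* Attach an already-created TCP socket to the "fastpath" stack. */
static void
select_fastpath_stack(int s)
{
	struct tcp_function_set fs;

	memset(&fs, 0, sizeof(fs));
	strlcpy(fs.function_set_name, "fastpath", TCP_FUNCTION_NAME_LEN_MAX);
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs)) == -1)
		err(1, "setsockopt(TCP_FUNCTION_BLK)");
}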
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index f73c397..140cee7 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -230,23 +230,6 @@ VNET_DEFINE(struct inpcbhead, tcb);
#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void tcp_do_segment(struct mbuf *, struct tcphdr *,
- struct socket *, struct tcpcb *, int, int, uint8_t,
- int);
-static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
- struct tcpcb *, int, int);
-static void tcp_pulloutofband(struct socket *,
- struct tcphdr *, struct mbuf *, int);
-static void tcp_xmit_timer(struct tcpcb *, int);
-static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
- uint16_t type);
-static void inline cc_conn_init(struct tcpcb *tp);
-static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline hhook_run_tcp_est_in(struct tcpcb *tp,
- struct tcphdr *th, struct tcpopt *to);
-
/*
* TCP statistics are stored in an "array" of counter(9)s.
*/
@@ -272,7 +255,7 @@ kmod_tcpstat_inc(int statnum)
/*
* Wrapper for the TCP established input helper hook.
*/
-static void inline
+void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
struct tcp_hhook_data hhook_data;
@@ -290,7 +273,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
/*
* CC wrapper hook functions
*/
-static void inline
+void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -322,7 +305,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
}
}
-static void inline
+void
cc_conn_init(struct tcpcb *tp)
{
struct hc_metrics_lite metrics;
@@ -446,7 +429,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
}
}
-static void inline
+void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -601,9 +584,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
int ti_locked;
-#define TI_UNLOCKED 1
-#define TI_RLOCKED 2
-
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1175,7 +1155,7 @@ relocked:
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return (IPPROTO_DONE);
@@ -1421,7 +1401,7 @@ relocked:
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return (IPPROTO_DONE);
@@ -1476,7 +1456,7 @@ drop:
return (IPPROTO_DONE);
}
-static void
+void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
int ti_locked)
@@ -1788,7 +1768,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_rxtcur);
sowwakeup(so);
if (sbavail(&so->so_snd))
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
}
} else if (th->th_ack == tp->snd_una &&
@@ -1907,7 +1887,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_flags |= TF_DELACK;
} else {
tp->t_flags |= TF_ACKNOW;
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
goto check_delack;
}
@@ -2522,7 +2502,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
} else
tp->snd_cwnd += tp->t_maxseg;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
@@ -2556,12 +2536,12 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcps_sack_recovery_episode);
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_maxseg;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
KASSERT(tp->snd_limited <= 2,
("%s: tp->snd_limited too big",
__func__));
@@ -2608,7 +2588,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
(tp->snd_nxt - tp->snd_una);
SOCKBUF_UNLOCK(&so->so_snd);
if (avail > 0)
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
sent = tp->snd_max - oldsndmax;
if (sent > tp->t_maxseg) {
KASSERT((tp->t_dupacks == 2 &&
@@ -3074,7 +3054,7 @@ dodata: /* XXX */
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
check_delack:
KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
@@ -3122,7 +3102,7 @@ dropafterack:
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
@@ -3168,7 +3148,7 @@ drop:
* The mbuf must still include the original packet header.
* tp may be NULL.
*/
-static void
+void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
int tlen, int rstreason)
{
@@ -3231,7 +3211,7 @@ drop:
/*
* Parse TCP options and place in tcpopt.
*/
-static void
+void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
int opt, optlen;
@@ -3325,7 +3305,7 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
* It is still reflected in the segment length for
* sequencing purposes.
*/
-static void
+void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
int off)
{
@@ -3358,7 +3338,7 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
* Collect new round-trip time estimate
* and update averages and current timeout.
*/
-static void
+void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
int delta;
@@ -3738,7 +3718,7 @@ tcp_mssopt(struct in_conninfo *inc)
* By setting snd_nxt to ti_ack, this forces retransmission timer to
* be started again.
*/
-static void
+void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
tcp_seq onxt = tp->snd_nxt;
@@ -3755,7 +3735,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
*/
tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 82e2251..1e31871 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -599,7 +599,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
}
#if 0
diff --git a/sys/netinet/tcp_stacks/fastpath.c b/sys/netinet/tcp_stacks/fastpath.c
new file mode 100644
index 0000000..597efe6
--- /dev/null
+++ b/sys/netinet/tcp_stacks/fastpath.c
@@ -0,0 +1,2486 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2007-2008,2010
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * Copyright (c) 2015 Netflix Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart,
+ * James Healy and David Hayes, made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
+ * Portions of this software were developed by Randall R. Stewart while
+ * working for Netflix Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h" /* for ipfw_fwd */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_kdtrace.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_syncache.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#include <security/mac/mac_framework.h>
+
+static const int tcprexmtthresh = 3;
+
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
+VNET_DECLARE(int, tcp_do_rfc3042);
+#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_insecure_rst);
+#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
+VNET_DECLARE(int, tcp_insecure_syn);
+#define V_tcp_insecure_syn VNET(tcp_insecure_syn)
+
+
+
+
+extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+extern void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+ struct tcpcb *, int, int);
+extern void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+extern void tcp_xmit_timer(struct tcpcb *, int);
+extern void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+extern void tcp_mss(struct tcpcb *tp, int offer);
+extern void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+ uint16_t type);
+extern void cc_conn_init(struct tcpcb *tp);
+extern void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+extern void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+extern void hhook_run_tcp_est_in(struct tcpcb *tp,
+ struct tcphdr *th, struct tcpopt *to);
+
+extern void kmod_tcpstat_inc(int statnum);
+#ifdef TCP_SIGNATURE
+extern int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
+ struct tcpopt *to, struct tcphdr *th, u_int tcpbflag);
+#endif
+
+static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * the following conditions are met:
+ * - There is no delayed ack timer in progress.
+ * - Our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ * - LRO wasn't used for this segment. We make sure by checking that the
+ * segment size is not larger than the MSS.
+ * - Delayed acks are enabled or this is a half-synchronized T/TCP
+ * connection.
+ */
+#define DELAY_ACK(tp, tlen) \
+ ((!tcp_timer_active(tp, TT_DELACK) && \
+ (tp->t_flags & TF_RXWIN0SENT) == 0) && \
+ (tlen <= tp->t_maxopd) && \
+ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+
+/*
+ * So how is this faster than the normal fast ack?
+ * It basically allows us to also stay in the fastpath
+ * when a window-update ack also arrives. In testing
+ * we saw only 25-30% of connections taking the fastpath,
+ * because the window was usually updated along with
+ * the advance in sequence space.
+ */
+static void
+tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin)
+{
+ int acked;
+ int winup_only=0;
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+ /*
+ * The following if statement will be true if
+ * we are doing the win_up_in_fp <and>
+ * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
+ * - No more new data, but we have an ack for new data
+ * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
+ * - No more new data, the same ack point but the window grew
+ * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ */
+ if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+ winup_only = 1;
+ TCPSTAT_INC(tcps_rcvwinupd);
+ }
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_rxtshift == 1 &&
+ tp->t_flags & TF_PREVVALID &&
+ (int)(ticks - tp->t_badrxtwin) < 0) {
+ cc_cong_signal(tp, th, CC_RTO_ERR);
+ }
+
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ to->to_tsecr) {
+ u_int t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_xmit_timer(tp,
+ TCP_TS_TO_TICKS(t) + 1);
+ } else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow ||
+ tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp,
+ ticks - tp->t_rtttime);
+ }
+ if (winup_only == 0) {
+ acked = BYTES_THIS_ACK(tp, th);
+
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+ if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+
+ /*
+ * Let the congestion control algorithm update
+ * congestion control related information. This
+ * typically means increasing the congestion
+ * window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
+ tp->snd_una = th->th_ack;
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative
+ * to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ tp->t_dupacks = 0;
+ m_freem(m);
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp->snd_una == tp->snd_max)
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
+ } else {
+ /*
+ * Window update only, just free the mbufs and
+ * send out whatever we can.
+ */
+ m_freem(m);
+ }
+ sowwakeup(so);
+ if (sbavail(&so->so_snd))
+ (void) tcp_output(tp);
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * Nothing here is really faster; we have simply broken
+ * out the fast-data path as well, just like we broke
+ * out the fast-ack path.
+ */
+static void
+tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin)
+{
+ int newsize = 0; /* automatic sockbuf scaling */
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+
+ /*
+ * This is a pure, in-sequence data packet with
+ * nothing on the reassembly queue and we have enough
+ * buffer space to take it.
+ */
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ /* Clean receiver SACK report if present */
+ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+ TCPSTAT_INC(tcps_preddat);
+ tp->rcv_nxt += tlen;
+ /*
+ * Pull snd_wl1 up to prevent seq wrap relative to
+ * th_seq.
+ */
+ tp->snd_wl1 = th->th_seq;
+ /*
+ * Pull rcv_up up to prevent seq wrap relative to
+ * rcv_nxt.
+ */
+ tp->rcv_up = tp->rcv_nxt;
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+ /*
+ * Automatic sizing of receive socket buffer. Often the send
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product). Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent. This allows us to be much
+ * more aggressive in scaling the receive socket buffer. For
+ * the case that the buffer space is actually used to a large
+ * extent and we run out of kernel memory we can simply drop
+ * the new segments; TCP on the sender will just retransmit it
+ * later. Setting the buffer size too big may only consume too
+ * much kernel memory if the application doesn't read() from
+ * the socket or packet loss or reordering makes use of the
+ * reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ * 1. Application has not set receive buffer size with
+ * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ * 2. the number of bytes received during the time it takes
+ * one timestamp to be reflected back to us (the RTT);
+ * 3. received bytes per RTT is within seven eighth of the
+ * current socket buffer size;
+ * 4. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+ if (V_tcp_do_autorcvbuf &&
+ (to->to_flags & TOF_TS) &&
+ to->to_tsecr &&
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
+ to->to_tsecr - tp->rfbuf_ts < hz) {
+ if (tp->rfbuf_cnt >
+ (so->so_rcv.sb_hiwat / 8 * 7) &&
+ so->so_rcv.sb_hiwat <
+ V_tcp_autorcvbuf_max) {
+ newsize =
+ min(so->so_rcv.sb_hiwat +
+ V_tcp_autorcvbuf_inc,
+ V_tcp_autorcvbuf_max);
+ }
+ /* Start over with next RTT. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+ } else
+ tp->rfbuf_cnt += tlen; /* add up */
+ }
+
+ /* Add data to socket buffer. */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ m_freem(m);
+ } else {
+ /*
+ * Set new socket buffer size.
+ * Give up when limit is reached.
+ */
+ if (newsize)
+ if (!sbreserve_locked(&so->so_rcv,
+ newsize, so, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ }
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ if (DELAY_ACK(tp, tlen)) {
+ tp->t_flags |= TF_DELACK;
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
+ }
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * The slow path is a clone of the long part of
+ * tcp_do_segment past all the fast-path checks. It is
+ * shared by two callers: the fast/slow variant and the
+ * fastack-only variant.
+ */
+static void
+tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin, int thflags)
+{
+ int acked, ourfinisacked, needoutput = 0;
+ int rstreason, todrop, win;
+ char *s;
+ struct in_conninfo *inc;
+ struct mbuf *mfree = NULL;
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+ /*
+ * Calculate amount of space in receive window,
+ * and then do TCP input processing.
+ * Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ inc = &tp->t_inpcb->inp_inc;
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+
+ /* Reset receive buffer auto scaling when not in bulk receive mode. */
+ tp->rfbuf_ts = 0;
+ tp->rfbuf_cnt = 0;
+
+ switch (tp->t_state) {
+
+ /*
+ * If the state is SYN_RECEIVED:
+ * if seg contains an ACK, but not for our SYN/ACK, send a RST.
+ */
+ case TCPS_SYN_RECEIVED:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ break;
+
+ /*
+ * If the state is SYN_SENT:
+ * if seg contains an ACK, but not for our SYN, drop the input.
+ * if seg contains a RST, then drop the connection.
+ * if seg does not contain SYN, then drop it.
+ * Otherwise this is an acceptable SYN segment
+ * initialize tp->rcv_nxt and tp->irs
+ * if seg contains ack then advance tp->snd_una
+ * if seg contains an ECE and ECN support is enabled, the stream
+ * is ECN capable.
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+ * arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ case TCPS_SYN_SENT:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
+ TCP_PROBE5(connect__refused, NULL, tp,
+ mtod(m, const char *), tp, th);
+ tp = tcp_drop(tp, ECONNREFUSED);
+ }
+ if (thflags & TH_RST)
+ goto drop;
+ if (!(thflags & TH_SYN))
+ goto drop;
+
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+#ifdef MAC
+ mac_socketpeer_set_from_mbuf(m, so);
+#endif
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ tp->rcv_adv += imin(tp->rcv_wnd,
+ TCP_MAXWIN << tp->rcv_scale);
+ tp->snd_una++; /* SYN is acked */
+ /*
+ * If there's data, delay ACK; if there's also a FIN
+ * ACKNOW will be turned on later.
+ */
+ if (DELAY_ACK(tp, tlen) && tlen != 0)
+ tcp_timer_activate(tp, TT_DELACK,
+ tcp_delacktime);
+ else
+ tp->t_flags |= TF_ACKNOW;
+
+ if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ TCPSTAT_INC(tcps_ecn_shs);
+ }
+
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state.
+ * Transitions:
+ * SYN_SENT --> ESTABLISHED
+ * SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(connect__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ cc_conn_init(tp);
+ tcp_timer_activate(tp, TT_KEEP,
+ TP_KEEPIDLE(tp));
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state =>
+ * simultaneous open.
+ * If it succeeds, connection is half-synchronized.
+ * Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED
+ * SYN-SENT* -> SYN-RECEIVED*
+ */
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
+ }
+
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
+ "ti_locked %d", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Advance th->th_seq to correspond to first data byte.
+ * If data, trim to stay within window,
+ * dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data.
+ * If the remote host used T/TCP to validate the SYN,
+ * our data will be ACK'd; if so, enter normal data segment
+ * processing in the middle of step 5, ack processing.
+ * Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK)
+ goto process_ACK;
+
+ goto step6;
+
+ /*
+ * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+ * do normal processing.
+ *
+ * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
+ */
+ case TCPS_LAST_ACK:
+ case TCPS_CLOSING:
+ break; /* continue normal processing */
+ }
+
+ /*
+ * States other than LISTEN or SYN_SENT.
+ * First check the RST flag and sequence number since reset segments
+ * are exempt from the timestamp and connection count tests. This
+ * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
+ * below which allowed reset segments in half the sequence space
+ * to fall though and be processed (which gives forged reset
+ * segments with a random sequence number a 50 percent chance of
+ * killing a connection).
+ * Then check timestamp, if present.
+ * Then check the connection count, if present.
+ * Then check that at least some bytes of segment are within
+ * receive window. If segment begins before rcv_nxt,
+ * drop leading data (and SYN); if nothing left, just ack.
+ */
+ if (thflags & TH_RST) {
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT.
+ * - If RST is in window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should
+ * test against last_ack_sent instead of rcv_nxt.
+ * Note 2: we handle special case of closed window, not
+ * covered by the RFC.
+ */
+ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("%s: TH_RST ti_locked %d, th %p tp %p",
+ __func__, ti_locked, th, tp));
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ tp->last_ack_sent == th->th_seq) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ }
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC5961 Section 4.2
+ * Send challenge ACK for any SYN in synchronized state.
+ */
+ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment
+ * and it's less than ts_recent, drop it.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to->to_tsval, tp->ts_recent)) {
+
+ /* Check to see if ts_recent is over 24 days old. */
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates
+ * ts_recent, the age will be reset later and ts_recent
+ * will get a valid value. If it does not, setting
+ * ts_recent to zero will at least satisfy the
+ * requirement that zero be placed in the timestamp
+ * echo reply when ts_recent isn't valid. The
+ * age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be
+ * dropped when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ if (tlen)
+ goto dropafterack;
+ goto drop;
+ }
+ }
+
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know
+ * the sequence numbers haven't wrapped. This is a partial fix
+ * for the "LAND" DoS attack.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+
+ /*
+ * If new data are received on a connection after the
+ * user processes are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tp->t_state > TCPS_CLOSE_WAIT && tlen) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
+ "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
+ "after socket was closed, "
+ "sending RST and removing tcpcb\n",
+ s, __func__, tcpstates[tp->t_state], tlen);
+ free(s, M_TCPLOG);
+ }
+ tp = tcp_close(tp);
+ TCPSTAT_INC(tcps_rcvafterclose);
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If segment ends after window, drop trailing data
+ * (and PUSH and FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment
+ * and ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else
+ goto dropafterack;
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH|TH_FIN);
+ }
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record its timestamp.
+ * NOTE:
+ * 1) That the test incorporates suggestions from the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ * 2) That updating only on newer timestamps interferes with
+ * our earlier PAWS tests, so this check should be solely
+ * predicated on the sequence space of this segment.
+ * 3) That we modify the segment boundary check to be
+ * Last.ACK.Sent <= SEG.SEQ + SEG.Len
+ * instead of RFC1323's
+ * Last.ACK.Sent < SEG.SEQ + SEG.Len,
+ * This modified check allows us to overcome RFC1323's
+ * limitations as described in Stevens TCP/IP Illustrated
+ * Vol. 2 p.869. In such cases, we can still calculate the
+ * RTT correctly when RCV.NXT == Last.ACK.Sent.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
+ ((thflags & (TH_SYN|TH_FIN)) != 0))) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
+ * flag is on (half-synchronized state), then queue data for
+ * later processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_state == TCPS_SYN_RECEIVED ||
+ (tp->t_flags & TF_NEEDSYN))
+ goto step6;
+ else if (tp->t_flags & TF_ACKNOW)
+ goto dropafterack;
+ else
+ goto drop;
+ }
+
+ /*
+ * Ack processing.
+ */
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
+ * ESTABLISHED state and continue processing.
+ * The ACK was checked above.
+ */
+ case TCPS_SYN_RECEIVED:
+
+ TCPSTAT_INC(tcps_connects);
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ tp->snd_wnd = tiwin;
+ }
+ /*
+ * Make transitions:
+ * SYN-RECEIVED -> ESTABLISHED
+ * SYN-RECEIVED* -> FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(accept__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+ cc_conn_init(tp);
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass()
+ * later; if not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void) tcp_reass(tp, (struct tcphdr *)0, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ /* FALLTHROUGH */
+
+ /*
+ * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
+ * ACKs. If the ack is in the range
+ * tp->snd_una < th->th_ack <= tp->snd_max
+ * then advance tp->snd_una to th->th_ack and drop
+ * data from the retransmission queue. If this ACK reflects
+ * more up to date window information we update our window information.
+ */
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ TCPSTAT_INC(tcps_rcvacktoomuch);
+ goto dropafterack;
+ }
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ ((to->to_flags & TOF_SACK) ||
+ !TAILQ_EMPTY(&tp->snd_holes)))
+ tcp_sack_doack(tp, to, th->th_ack);
+ else
+ /*
+ * Reset the value so that previous (valid) value
+ * from the last ack with SACK doesn't get used.
+ */
+ tp->sackhint.sacked_bytes = 0;
+
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+
+ if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
+ if (tlen == 0 && tiwin == tp->snd_wnd) {
+ /*
+ * If this is the first time we've seen a
+ * FIN from the remote, this is not a
+ * duplicate and it needs to be processed
+ * normally. This happens during a
+ * simultaneous close.
+ */
+ if ((thflags & TH_FIN) &&
+ (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ TCPSTAT_INC(tcps_rcvdupack);
+ /*
+ * If we have outstanding data (other than
+ * a window probe), this is a completely
+ * duplicate ack (ie, window info didn't
+ * change and FIN isn't set),
+ * the ack is the biggest we've
+ * seen and we've seen exactly our rexmt
+ * threshold of them, assume a packet
+ * has been dropped and retransmit it.
+ * Kludge snd_nxt & the congestion
+ * window so we send only this one
+ * packet.
+ *
+ * We know we're losing at the current
+ * window size so do congestion avoidance
+ * (set ssthresh to half the current window
+ * and pull our congestion window back to
+ * the new ssthresh).
+ *
+ * Dup acks mean that packets have left the
+ * network (they're now cached at the receiver)
+ * so bump cwnd by the amount in the receiver
+ * to keep a constant cwnd packets in the
+ * network.
+ *
+ * When using TCP ECN, notify the peer that
+ * we reduced the cwnd.
+ */
+ if (!tcp_timer_active(tp, TT_REXMT) ||
+ th->th_ack != tp->snd_una)
+ tp->t_dupacks = 0;
+ else if (++tp->t_dupacks > tcprexmtthresh ||
+ IN_FASTRECOVERY(tp->t_flags)) {
+ cc_ack_received(tp, th, CC_DUPACK);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ IN_FASTRECOVERY(tp->t_flags)) {
+ int awnd;
+
+ /*
+ * Compute the amount of data in flight first.
+ * We can inject new data into the pipe iff
+ * we have less than 1/2 the original window's
+ * worth of data in flight.
+ */
+ if (V_tcp_do_rfc6675_pipe)
+ awnd = tcp_compute_pipe(tp);
+ else
+ awnd = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+
+ if (awnd < tp->snd_ssthresh) {
+ tp->snd_cwnd += tp->t_maxseg;
+ if (tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ }
+ } else
+ tp->snd_cwnd += tp->t_maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
+ goto drop;
+ } else if (tp->t_dupacks == tcprexmtthresh) {
+ tcp_seq onxt = tp->snd_nxt;
+
+ /*
+ * If we're doing sack, check to
+ * see if we're already in sack
+ * recovery. If we're not doing sack,
+ * check to see if we're in newreno
+ * recovery.
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ } else {
+ if (SEQ_LEQ(th->th_ack,
+ tp->snd_recover)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ }
+ /* Congestion signal before ack. */
+ cc_cong_signal(tp, th, CC_NDUPACK);
+ cc_ack_received(tp, th, CC_DUPACK);
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rtttime = 0;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ TCPSTAT_INC(
+ tcps_sack_recovery_episode);
+ tp->sack_newdata = tp->snd_nxt;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
+ goto drop;
+ }
+ tp->snd_nxt = th->th_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
+ KASSERT(tp->snd_limited <= 2,
+ ("%s: tp->snd_limited too big",
+ __func__));
+ tp->snd_cwnd = tp->snd_ssthresh +
+ tp->t_maxseg *
+ (tp->t_dupacks - tp->snd_limited);
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ goto drop;
+ } else if (V_tcp_do_rfc3042) {
+ /*
+ * Process first and second duplicate
+ * ACKs. Each indicates a segment
+ * leaving the network, creating room
+ * for more. Make sure we can send a
+ * packet on reception of each duplicate
+ * ACK by increasing snd_cwnd by one
+ * segment. Restore the original
+ * snd_cwnd after packet transmission.
+ */
+ cc_ack_received(tp, th, CC_DUPACK);
+ u_long oldcwnd = tp->snd_cwnd;
+ tcp_seq oldsndmax = tp->snd_max;
+ u_int sent;
+ int avail;
+
+ KASSERT(tp->t_dupacks == 1 ||
+ tp->t_dupacks == 2,
+ ("%s: dupacks not 1 or 2",
+ __func__));
+ if (tp->t_dupacks == 1)
+ tp->snd_limited = 0;
+ tp->snd_cwnd =
+ (tp->snd_nxt - tp->snd_una) +
+ (tp->t_dupacks - tp->snd_limited) *
+ tp->t_maxseg;
+ /*
+ * Only call tcp_output when there
+ * is new data available to be sent.
+ * Otherwise we would send pure ACKs.
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ avail = sbavail(&so->so_snd) -
+ (tp->snd_nxt - tp->snd_una);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (avail > 0)
+ (void) tp->t_fb->tfb_tcp_output(tp);
+ sent = tp->snd_max - oldsndmax;
+ if (sent > tp->t_maxseg) {
+ KASSERT((tp->t_dupacks == 2 &&
+ tp->snd_limited == 0) ||
+ (sent == tp->t_maxseg + 1 &&
+ tp->t_flags & TF_SENTFIN),
+ ("%s: sent too much",
+ __func__));
+ tp->snd_limited = 2;
+ } else if (sent > 0)
+ ++tp->snd_limited;
+ tp->snd_cwnd = oldcwnd;
+ goto drop;
+ }
+ } else
+ tp->t_dupacks = 0;
+ break;
+ }
+
+ KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
+ ("%s: th_ack <= snd_una", __func__));
+
+ /*
+ * If the congestion window was inflated to account
+ * for the other side's cached packets, retract it.
+ */
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (tp->t_flags & TF_SACK_PERMIT)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
+ } else
+ cc_post_recovery(tp, th);
+ }
+ tp->t_dupacks = 0;
+ /*
+ * If we reach this point, ACK is not a duplicate,
+ * i.e., it ACKs something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our
+ * SYN has been ACK'd (so connection is now fully
+ * synchronized). Go to non-starred state,
+ * increment snd_una for ACK of SYN, and check if
+ * we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ /* Send window already scaled. */
+ }
+ }
+
+process_ACK:
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ acked = BYTES_THIS_ACK(tp, th);
+ TCPSTAT_INC(tcps_rcvackpack);
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+
+ /*
+ * If we just performed our first retransmit, and the ACK
+ * arrives within our recovery window, then it was a mistake
+ * to do the retransmit in the first place. Recover our
+ * original cwnd and ssthresh, and proceed to transmit where
+ * we left off.
+ */
+ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ cc_cong_signal(tp, th, CC_RTO_ERR);
+
+ /*
+ * If we have a timestamp reply, update smoothed
+ * round trip time. If no timestamp is present but
+ * transmit timer is running and timed sequence
+ * number was acked, update smoothed round trip time.
+ * Since we now have an rtt measurement, cancel the
+ * timer backoff (cf., Phil Karn's retransmit alg.).
+ * Recompute the initial retransmit timer.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
+ u_int t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
+ } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ }
+
+ /*
+ * If all outstanding data is acked, stop retransmit
+ * timer and remember to restart (more output or persist).
+ * If there is more data to be acked, restart retransmit
+ * timer, using current (possibly backed-off) value.
+ */
+ if (th->th_ack == tp->snd_max) {
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ needoutput = 1;
+ } else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+
+ /*
+ * If no data (only SYN) was ACK'd,
+ * skip rest of ACK processing.
+ */
+ if (acked == 0)
+ goto step6;
+
+ /*
+ * Let the congestion control algorithm update congestion
+ * control related information. This typically means increasing
+ * the congestion window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if (acked > sbavail(&so->so_snd)) {
+ tp->snd_wnd -= sbavail(&so->so_snd);
+ mfree = sbcut_locked(&so->so_snd,
+ (int)sbavail(&so->so_snd));
+ ourfinisacked = 1;
+ } else {
+ mfree = sbcut_locked(&so->so_snd, acked);
+ tp->snd_wnd -= acked;
+ ourfinisacked = 0;
+ }
+ /* NB: sowwakeup_locked() does an implicit unlock. */
+ sowwakeup_locked(so);
+ m_freem(mfree);
+ /* Detect una wraparound. */
+ if (!IN_RECOVERY(tp->t_flags) &&
+ SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+ /* XXXLAS: Can this be moved up into cc_post_recovery? */
+ if (IN_RECOVERY(tp->t_flags) &&
+ SEQ_GEQ(th->th_ack, tp->snd_recover)) {
+ EXIT_RECOVERY(tp->t_flags);
+ }
+ tp->snd_una = th->th_ack;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ }
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una))
+ tp->snd_nxt = tp->snd_una;
+
+ switch (tp->t_state) {
+
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now acknowledged
+ * then enter FIN_WAIT_2.
+ */
+ case TCPS_FIN_WAIT_1:
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more
+ * data, then closing user can proceed.
+ * Starting the timer is contrary to the
+ * specification, but if we don't get a FIN
+ * we'll hang forever.
+ *
+ * XXXjl:
+ * we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
+ break;
+
+ /*
+ * In CLOSING STATE in addition to the processing for
+ * the ESTABLISHED state if the ACK acknowledges our FIN
+ * then enter the TIME-WAIT state, otherwise ignore
+ * the segment.
+ */
+ case TCPS_CLOSING:
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tcp_twstart(tp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ m_freem(m);
+ return;
+ }
+ break;
+
+ /*
+ * In LAST_ACK, we may still be waiting for data to drain
+ * and/or to be acked, as well as for the ack of our FIN.
+ * If our FIN is now acknowledged, delete the TCB,
+ * enter the closed state and return.
+ */
+ case TCPS_LAST_ACK:
+ if (ourfinisacked) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ tp = tcp_close(tp);
+ goto drop;
+ }
+ break;
+ }
+ }
+
+step6:
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Update window information.
+ * Don't look at window if no ACK: TAC's send garbage on first SYN.
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ TCPSTAT_INC(tcps_rcvwinupd);
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ needoutput = 1;
+ }
+
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept
+ * random urgent pointers, we'll crash in
+ * soreceive. It's hard to imagine someone
+ * actually wanting to send this much urgent data.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer,
+ * then mark the data stream. This should not happen
+ * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
+ * a FIN has been received from the remote side.
+ * In these states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = sbavail(&so->so_rcv) +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * Remove out of band data so doesn't get presented to user.
+ * This can happen independent of advancing the URG pointer,
+ * but if two URG's are pending at once, some out-of-band
+ * data may creep in... ick.
+ */
+ if (th->th_urp <= (u_long)tlen &&
+ !(so->so_options & SO_OOBINLINE)) {
+ /* hdr drop is delayed */
+ tcp_pulloutofband(so, th, m, drop_hdrlen);
+ }
+ } else {
+ /*
+ * If no out of band data is expected,
+ * pull receive urgent pointer along
+ * with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+ }
+dodata: /* XXX */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
+ * Process the segment text, merging it into the TCP sequencing queue,
+ * and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data
+ * is presented to the user (this happens in tcp_usrreq.c,
+ * case PRU_RCVD). If a FIN has already been received on this
+ * connection then we just ignore the text.
+ */
+ if ((tlen || (thflags & TH_FIN)) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ tcp_seq save_start = th->th_seq;
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+ * Insert segment which includes th into TCP reassembly queue
+ * with control block tp. Set thflags to whether reassembly now
+ * includes a segment with FIN. This handles the common case
+ * inline (segment is the next to be received on an established
+ * connection, and the queue is empty), avoiding linkage into
+ * and removal from the queue and repetition of various
+ * conversions.
+ * Set DELACK for segments received in order, but ack
+ * immediately when segments are out of order (so
+ * fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (DELAY_ACK(tp, tlen))
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ TCPSTAT_INC(tcps_rcvpack);
+ TCPSTAT_ADD(tcps_rcvbyte, tlen);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ m_freem(m);
+ else
+ sbappendstream_locked(&so->so_rcv, m, 0);
+ /* NB: sorwakeup_locked() does an implicit unlock. */
+ sorwakeup_locked(so);
+ } else {
+ /*
+ * XXX: Due to the header drop above "th" is
+ * theoretically invalid by now. Fortunately
+			 * m_adj() doesn't actually free any mbufs
+ * when trimming from the head.
+ */
+ thflags = tcp_reass(tp, th, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+ if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
+ tcp_update_sack_list(tp, save_start, save_start + tlen);
+#if 0
+ /*
+ * Note the amount of data that peer has sent into
+ * our window, in order to estimate the sender's
+ * buffer size.
+ * XXX: Unused.
+ */
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
+ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+ else
+ len = so->so_rcv.sb_hiwat;
+#endif
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know
+ * that the connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized
+ * (ie NEEDSYN flag on) then delay ACK,
+ * so it may be piggybacked when SYN is sent.
+ * Otherwise, since we received a FIN then no
+ * more input can be expected, send ACK now.
+ */
+ if (tp->t_flags & TF_NEEDSYN)
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES
+ * enter the CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ tcp_state_change(tp, TCPS_CLOSE_WAIT);
+ break;
+
+ /*
+ * If still in FIN_WAIT_1 STATE FIN has not been acked so
+ * enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ tcp_state_change(tp, TCPS_CLOSING);
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the other
+ * standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
+ "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
+ ti_locked));
+
+ tcp_twstart(tp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return;
+ }
+ }
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+
+ /*
+ * Return any desired output.
+ */
+ if (needoutput || (tp->t_flags & TF_ACKNOW))
+ (void) tp->t_fb->tfb_tcp_output(tp);
+
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+ return;
+
+dropafterack:
+ /*
+ * Generate an ACK dropping incoming segment if it occupies
+ * sequence space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all
+ * paths to this code happen after packets containing
+ * RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the
+ * segment we received passes the SYN-RECEIVED ACK test.
+ * If it fails send a RST. This breaks the loop in the
+ * "LAND" DoS attack, and also prevents an ACK storm
+ * between two listening ports that have been sent forged
+ * SYN segments, each with the source address of the other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max)) ) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ tp->t_flags |= TF_ACKNOW;
+ (void) tp->t_fb->tfb_tcp_output(tp);
+ INP_WUNLOCK(tp->t_inpcb);
+ m_freem(m);
+ return;
+
+dropwithreset:
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+ return;
+
+drop:
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
+#ifdef INVARIANTS
+ else
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
+
+ /*
+ * Drop space held by incoming segment and return.
+ */
+#ifdef TCPDEBUG
+ if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ m_freem(m);
+}
+
+
+/*
+ * "Do fast-slow" is a combination of the original tcp_do_segment()
+ * and a split fast path: one function handles the fast-ack case,
+ * and also keeps window updates that arrive in sequence on the
+ * fast path, while a separate sub-function handles the in-sequence
+ * data case.
+ */
+void
+tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+ int ti_locked)
+{
+ int thflags;
+ u_long tiwin;
+ char *s;
+ int can_enter;
+ struct in_conninfo *inc;
+ struct tcpopt to;
+
+ thflags = th->th_flags;
+ tp->sackhint.last_sack_ack = 0;
+ inc = &tp->t_inpcb->inp_inc;
+ /*
+ * If this is either a state-changing packet or current state isn't
+ * established, we require a write lock on tcbinfo. Otherwise, we
+	 * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a write lock due to a race.
+ */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+ "SYN/FIN/RST/!EST", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ * XXX: This should be done after segment
+ * validation to ignore broken/spoofed segs.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+
+ /*
+ * Unscale the window into a 32-bit value.
+ * For the SYN_SENT state the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+
+ /*
+ * TCP ECN processing.
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ cc_cong_signal(tp, th, CC_ECN);
+ }
+ }
+
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ /*
+ * If echoed timestamp is later than the current time,
+ * fall back to non RFC1323 RTT calculation. Normalize
+ * timestamp if syncookies were used when this connection
+ * was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+ to.to_tsecr = 0;
+ }
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session and vice versa.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+
+ /*
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
+ * XXX this is traditional behavior, may need to be cleaned up.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with
+ * the next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = tcp_ts_getticks();
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ }
+ can_enter = 0;
+ if (__predict_true((tlen == 0))) {
+ /*
+ * The ack moved forward and we have a window (non-zero)
+ * <or>
+ * The ack did not move forward, but the window increased.
+ */
+ if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
+ ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
+ can_enter = 1;
+ }
+ } else {
+ /*
+ * Data incoming, use the old entry criteria
+ * for fast-path with data.
+ */
+ if ((tiwin && tiwin == tp->snd_wnd)) {
+ can_enter = 1;
+ }
+ }
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED first, it can only
+ * be TH_NEEDSYN.
+ */
+ if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
+ th->th_seq == tp->rcv_nxt &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+ tp->snd_nxt == tp->snd_max &&
+ can_enter &&
+ ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
+ LIST_EMPTY(&tp->t_segq) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
+ if (__predict_true((tlen == 0) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_max) &&
+ !IN_RECOVERY(tp->t_flags) &&
+ (to.to_flags & TOF_SACK) == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes)))) {
+ /* We are done */
+ tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin);
+ return;
+ } else if ((tlen) &&
+ (th->th_ack == tp->snd_una &&
+ tlen <= sbspace(&so->so_rcv))) {
+ tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin);
+ /* We are done */
+ return;
+ }
+ }
+ tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags);
+}
+
+
+/*
+ * This subfunction tries to optimize the fast path as much as
+ * possible.  We again allow window updates that arrive in
+ * sequence to remain on the fast path.  We also add __predict
+ * hints to help the compiler.
+ * Note that a return value of 0 means we could *not* process
+ * the segment here, and the caller should push the packet onto
+ * the slow path instead.
+ */
+static int
+tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
+ int ti_locked, u_long tiwin)
+{
+ int acked;
+ int winup_only=0;
+#ifdef TCPDEBUG
+ /*
+ * The size of tcp_saveipgen must be the size of the max ip header,
+ * now IPv6.
+ */
+ u_char tcp_saveipgen[IP6_HDR_LEN];
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+
+
+ if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
+ /* Old ack, behind (or duplicate to) the last one rcv'd */
+ return (0);
+ }
+ if (__predict_false(th->th_ack == tp->snd_una) &&
+ __predict_false(tiwin <= tp->snd_wnd)) {
+		/* Duplicate ack with an unchanged or shrinking window */
+ return (0);
+ }
+ if (__predict_false(tiwin == 0)) {
+ /* zero window */
+ return (0);
+ }
+ if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
+ /* Above what we have sent? */
+ return (0);
+ }
+ if (__predict_false(tp->snd_nxt != tp->snd_max)) {
+ /* We are retransmitting */
+ return (0);
+ }
+ if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
+ /* We need a SYN or a FIN, unlikely.. */
+ return (0);
+ }
+ if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
+ /* Timestamp is behind .. old ack with seq wrap? */
+ return (0);
+ }
+ if (__predict_false(IN_RECOVERY(tp->t_flags))) {
+ /* Still recovering */
+ return (0);
+ }
+ if (__predict_false(to->to_flags & TOF_SACK)) {
+ /* Sack included in the ack.. */
+ return (0);
+ }
+ if (!TAILQ_EMPTY(&tp->snd_holes)) {
+ /* We have sack holes on our scoreboard */
+ return (0);
+ }
+ /* Ok if we reach here, we can process a fast-ack */
+
+ /* Did the window get updated? */
+ if (tiwin != tp->snd_wnd) {
+ /* keep track of pure window updates */
+ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+ winup_only = 1;
+ TCPSTAT_INC(tcps_rcvwinupd);
+ }
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ }
+ /*
+ * Pull snd_wl2 up to prevent seq wrap relative
+ * to th_ack.
+ */
+ tp->snd_wl2 = th->th_ack;
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = to->to_tsval;
+ }
+ /*
+ * This is a pure ack for outstanding data.
+ */
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ ti_locked = TI_UNLOCKED;
+
+ TCPSTAT_INC(tcps_predack);
+
+ /*
+ * "bad retransmit" recovery.
+ */
+ if (tp->t_rxtshift == 1 &&
+ tp->t_flags & TF_PREVVALID &&
+ (int)(ticks - tp->t_badrxtwin) < 0) {
+ cc_cong_signal(tp, th, CC_RTO_ERR);
+ }
+
+ /*
+ * Recalculate the transmit timer / rtt.
+ *
+ * Some boxes send broken timestamp replies
+ * during the SYN+ACK phase, ignore
+ * timestamps of 0 or we could calculate a
+ * huge RTT and blow up the retransmit timer.
+ */
+ if ((to->to_flags & TOF_TS) != 0 &&
+ to->to_tsecr) {
+ u_int t;
+
+ t = tcp_ts_getticks() - to->to_tsecr;
+ if (!tp->t_rttlow || tp->t_rttlow > t)
+ tp->t_rttlow = t;
+ tcp_xmit_timer(tp,
+ TCP_TS_TO_TICKS(t) + 1);
+ } else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq)) {
+ if (!tp->t_rttlow ||
+ tp->t_rttlow > ticks - tp->t_rtttime)
+ tp->t_rttlow = ticks - tp->t_rtttime;
+ tcp_xmit_timer(tp,
+ ticks - tp->t_rtttime);
+ }
+ if (winup_only == 0) {
+ acked = BYTES_THIS_ACK(tp, th);
+
+ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+ hhook_run_tcp_est_in(tp, th, to);
+
+ TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ sbdrop(&so->so_snd, acked);
+ if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+ SEQ_LEQ(th->th_ack, tp->snd_recover))
+ tp->snd_recover = th->th_ack - 1;
+
+ /*
+ * Let the congestion control algorithm update
+ * congestion control related information. This
+ * typically means increasing the congestion
+ * window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
+ tp->snd_una = th->th_ack;
+ tp->t_dupacks = 0;
+ m_freem(m);
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp,
+ (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp->snd_una == tp->snd_max)
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ else if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_timer_activate(tp, TT_REXMT,
+ tp->t_rxtcur);
+ /* Wake up the socket if we have room to write more */
+ sowwakeup(so);
+ } else {
+ /*
+ * Window update only, just free the mbufs and
+ * send out whatever we can.
+ */
+ m_freem(m);
+ }
+ if (sbavail(&so->so_snd))
+ (void) tcp_output(tp);
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+ __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (tp->t_flags & TF_DELACK) {
+ tp->t_flags &= ~TF_DELACK;
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+ INP_WUNLOCK(tp->t_inpcb);
+ return (1);
+}
+
+/*
+ * This tcp_do_segment variant concentrates on making the ACK
+ * processing path as fast as possible.  It has no fast path for
+ * data (it possibly could, which would then eliminate the need
+ * for the fast-slow variant above).  For a content distributor
+ * with large outgoing elephant flows and very little incoming
+ * traffic, a data fast path does not really help, since little
+ * data comes in.  The most important thing is processing ACKs
+ * quickly and getting the rest of the data out to the peer as
+ * quickly as possible.  This routine measures roughly 3% faster
+ * overall than the old tcp_do_segment and keeps us on the fast
+ * path for many more packets, by allowing window updates to stay
+ * on the fast path as well.
+ */
+void
+tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+ int ti_locked)
+{
+ int thflags;
+ u_long tiwin;
+ char *s;
+ struct in_conninfo *inc;
+ struct tcpopt to;
+
+ thflags = th->th_flags;
+ tp->sackhint.last_sack_ack = 0;
+ inc = &tp->t_inpcb->inp_inc;
+ /*
+ * If this is either a state-changing packet or current state isn't
+ * established, we require a write lock on tcbinfo. Otherwise, we
+	 * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a write lock due to a race.
+ */
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+ tp->t_state != TCPS_ESTABLISHED) {
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
+ "SYN/FIN/RST/!EST", __func__, ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
+ }
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ * XXX: This should be done after segment
+ * validation to ignore broken/spoofed segs.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
+
+ /*
+ * Unscale the window into a 32-bit value.
+ * For the SYN_SENT state the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+
+ /*
+ * TCP ECN processing.
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ cc_cong_signal(tp, th, CC_ECN);
+ }
+ }
+
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+
+ /*
+ * If echoed timestamp is later than the current time,
+ * fall back to non RFC1323 RTT calculation. Normalize
+ * timestamp if syncookies were used when this connection
+ * was established.
+ */
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+ to.to_tsecr = 0;
+ }
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session and vice versa.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+
+ /*
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
+ * XXX this is traditional behavior, may need to be cleaned up.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
+ }
+ /*
+ * Initial send window. It will be updated with
+ * the next incoming segment to the scaled value.
+ */
+ tp->snd_wnd = th->th_win;
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = tcp_ts_getticks();
+ }
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ }
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED first, it can only
+ * be TH_NEEDSYN.
+ */
+ if (__predict_true(tp->t_state == TCPS_ESTABLISHED) &&
+ __predict_true(((to.to_flags & TOF_SACK) == 0)) &&
+ __predict_true(tlen == 0) &&
+ __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) &&
+ __predict_true(LIST_EMPTY(&tp->t_segq)) &&
+ __predict_true(th->th_seq == tp->rcv_nxt)) {
+ if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin)) {
+ return;
+ }
+ }
+ tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
+ ti_locked, tiwin, thflags);
+}
+
+struct tcp_function_block __tcp_fastslow = {
+ "fastslow",
+ tcp_output,
+ tcp_do_segment_fastslow,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ 0,
+ 0
+
+};
+
+struct tcp_function_block __tcp_fastack = {
+ "fastack",
+ tcp_output,
+ tcp_do_segment_fastack,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ 0,
+ 0
+};
+
+static int
+tcp_addfastpaths(module_t mod, int type, void *data)
+{
+ int err=0;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = register_tcp_functions(&__tcp_fastack, M_WAITOK);
+ if (err) {
+ printf("Failed to register fastack module -- err:%d\n", err);
+ return(err);
+ }
+ err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
+ if (err) {
+ printf("Failed to register fastslow module -- err:%d\n", err);
+ deregister_tcp_functions(&__tcp_fastack);
+ return(err);
+ }
+ break;
+ case MOD_QUIESCE:
+		if ((__tcp_fastslow.tfb_refcnt) || (__tcp_fastack.tfb_refcnt)) {
+ return(EBUSY);
+ }
+ break;
+ case MOD_UNLOAD:
+ err = deregister_tcp_functions(&__tcp_fastack);
+ if (err == EBUSY)
+ break;
+ err = deregister_tcp_functions(&__tcp_fastslow);
+ if (err == EBUSY)
+ break;
+ err = 0;
+ break;
+ default:
+ return (EOPNOTSUPP);
+ }
+ return (err);
+}
+
+static moduledata_t new_tcp_fastpaths = {
+ .name = "tcp_fastpaths",
+ .evhand = tcp_addfastpaths,
+ .priv = 0
+};
+
+MODULE_VERSION(kern_tcpfastpaths, 1);
+DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PSEUDO, SI_ORDER_ANY);
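
The fastpath module above registers its two function blocks at MOD_LOAD, refuses to quiesce while any connection still references them, and deregisters them at MOD_UNLOAD. A third-party stack would follow the same pattern. The sketch below is hypothetical (the "mystack" name and its module glue are illustrative only) and shows the minimum a loadable block has to provide under this patch: a block name plus tfb_tcp_output, tfb_tcp_do_segment and tfb_tcp_ctloutput, here simply reusing the stock entry points that tcp_var.h now exports.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>

/*
 * Hypothetical "mystack" block: it reuses the stock entry points,
 * which is the minimum register_tcp_functions() will accept.
 */
static struct tcp_function_block mystack_funcblk = {
	.tfb_tcp_block_name = "mystack",
	.tfb_tcp_output = tcp_output,
	.tfb_tcp_do_segment = tcp_do_segment,
	.tfb_tcp_ctloutput = tcp_default_ctloutput,
};

static int
mystack_modevent(module_t mod, int type, void *data)
{
	int err = 0;

	switch (type) {
	case MOD_LOAD:
		err = register_tcp_functions(&mystack_funcblk, M_WAITOK);
		break;
	case MOD_QUIESCE:
		/* Refuse to unload while connections still use the block. */
		if (mystack_funcblk.tfb_refcnt != 0)
			err = EBUSY;
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&mystack_funcblk);
		break;
	default:
		err = EOPNOTSUPP;
		break;
	}
	return (err);
}

static moduledata_t mystack_mod = {
	.name = "mystack",
	.evhand = mystack_modevent,
	.priv = NULL
};

DECLARE_MODULE(mystack, mystack_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(mystack, 1);

Once such a module is loaded, the new block appears in the functions_available list added in tcp_subr.c below and can be selected per socket or made the system default.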
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 29af766..00869a6 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
@@ -125,6 +126,8 @@ VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
#endif
+struct rwlock tcp_function_lock;
+
static int
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
{
@@ -236,6 +239,179 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
void *ip4hdr, const void *ip6hdr);
static void tcp_timer_discard(struct tcpcb *, uint32_t);
+
+static struct tcp_function_block tcp_def_funcblk = {
+ "default",
+ tcp_output,
+ tcp_do_segment,
+ tcp_default_ctloutput,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ 0,
+ 0
+};
+
+struct tcp_funchead t_functions;
+static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
+
+static struct tcp_function_block *
+find_tcp_functions_locked(struct tcp_function_set *fs)
+{
+ struct tcp_function *f;
+ struct tcp_function_block *blk=NULL;
+
+ TAILQ_FOREACH(f, &t_functions, tf_next) {
+ if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) {
+ blk = f->tf_fb;
+ break;
+ }
+ }
+ return(blk);
+}
+
+static struct tcp_function_block *
+find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
+{
+ struct tcp_function_block *rblk=NULL;
+ struct tcp_function *f;
+
+ TAILQ_FOREACH(f, &t_functions, tf_next) {
+ if (f->tf_fb == blk) {
+ rblk = blk;
+ if (s) {
+ *s = f;
+ }
+ break;
+ }
+ }
+ return (rblk);
+}
+
+struct tcp_function_block *
+find_and_ref_tcp_functions(struct tcp_function_set *fs)
+{
+ struct tcp_function_block *blk;
+
+ rw_rlock(&tcp_function_lock);
+ blk = find_tcp_functions_locked(fs);
+ if (blk)
+ refcount_acquire(&blk->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ return(blk);
+}
+
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *blk)
+{
+ struct tcp_function_block *rblk;
+
+ rw_rlock(&tcp_function_lock);
+ rblk = find_tcp_fb_locked(blk, NULL);
+ if (rblk)
+ refcount_acquire(&rblk->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ return(rblk);
+}
+
+
+static int
+sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
+{
+ int error=ENOENT;
+ struct tcp_function_set fs;
+ struct tcp_function_block *blk;
+
+ memset(&fs, 0, sizeof(fs));
+ rw_rlock(&tcp_function_lock);
+ blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
+ if (blk) {
+ /* Found him */
+ strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
+ fs.pcbcnt = blk->tfb_refcnt;
+ }
+ rw_runlock(&tcp_function_lock);
+ error = sysctl_handle_string(oidp, fs.function_set_name,
+ sizeof(fs.function_set_name), req);
+
+ /* Check for error or no change */
+ if (error != 0 || req->newptr == NULL)
+ return(error);
+
+ rw_wlock(&tcp_function_lock);
+ blk = find_tcp_functions_locked(&fs);
+ if ((blk == NULL) ||
+ (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
+ error = ENOENT;
+ goto done;
+ }
+ tcp_func_set_ptr = blk;
+done:
+ rw_wunlock(&tcp_function_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
+ CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
+ "Set/get the default TCP functions");
+
+static int
+sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
+{
+ int error, cnt, linesz;
+ struct tcp_function *f;
+ char *buffer, *cp;
+ size_t bufsz, outsz;
+
+ cnt = 0;
+ rw_rlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, tf_next) {
+ cnt++;
+ }
+ rw_runlock(&tcp_function_lock);
+
+ bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1;
+ buffer = malloc(bufsz, M_TEMP, M_WAITOK);
+
+ error = 0;
+ cp = buffer;
+
+ linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count");
+ cp += linesz;
+ bufsz -= linesz;
+ outsz = linesz;
+
+ rw_rlock(&tcp_function_lock);
+ TAILQ_FOREACH(f, &t_functions, tf_next) {
+ linesz = snprintf(cp, bufsz, "%-32s%c %u\n",
+ f->tf_fb->tfb_tcp_block_name,
+ (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
+ f->tf_fb->tfb_refcnt);
+ if (linesz >= bufsz) {
+ error = EOVERFLOW;
+ break;
+ }
+ cp += linesz;
+ bufsz -= linesz;
+ outsz += linesz;
+ }
+ rw_runlock(&tcp_function_lock);
+ if (error == 0)
+ error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
+ free(buffer, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
+ CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, sysctl_net_inet_list_available, "A",
+ "list available TCP Function sets");
+
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@@ -263,6 +439,8 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone);
#define V_tcpcb_zone VNET(tcpcb_zone)
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
+
static struct mtx isn_mtx;
#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
@@ -311,6 +489,96 @@ maketcp_hashsize(int size)
return (hashsize);
}
+int
+register_tcp_functions(struct tcp_function_block *blk, int wait)
+{
+ struct tcp_function_block *lblk;
+ struct tcp_function *n;
+ struct tcp_function_set fs;
+
+ if ((blk->tfb_tcp_output == NULL) ||
+ (blk->tfb_tcp_do_segment == NULL) ||
+ (blk->tfb_tcp_ctloutput == NULL) ||
+ (strlen(blk->tfb_tcp_block_name) == 0)) {
+ /*
+ * These functions are required and you
+ * need a name.
+ */
+ return (EINVAL);
+ }
+ if (blk->tfb_tcp_timer_stop_all ||
+ blk->tfb_tcp_timers_left ||
+ blk->tfb_tcp_timer_activate ||
+ blk->tfb_tcp_timer_active ||
+ blk->tfb_tcp_timer_stop) {
+ /*
+ * If you define one timer function you
+ * must have them all.
+ */
+ if ((blk->tfb_tcp_timer_stop_all == NULL) ||
+ (blk->tfb_tcp_timers_left == NULL) ||
+ (blk->tfb_tcp_timer_activate == NULL) ||
+ (blk->tfb_tcp_timer_active == NULL) ||
+ (blk->tfb_tcp_timer_stop == NULL)) {
+ return (EINVAL);
+ }
+ }
+ n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
+ if (n == NULL) {
+ return (ENOMEM);
+ }
+ n->tf_fb = blk;
+ strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
+ rw_wlock(&tcp_function_lock);
+ lblk = find_tcp_functions_locked(&fs);
+ if (lblk) {
+ /* Duplicate name space not allowed */
+ rw_wunlock(&tcp_function_lock);
+ free(n, M_TCPFUNCTIONS);
+ return (EALREADY);
+ }
+ refcount_init(&blk->tfb_refcnt, 0);
+ blk->tfb_flags = 0;
+ TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
+ rw_wunlock(&tcp_function_lock);
+ return(0);
+}
+
+int
+deregister_tcp_functions(struct tcp_function_block *blk)
+{
+ struct tcp_function_block *lblk;
+ struct tcp_function *f;
+ int error=ENOENT;
+
+ if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
+ /* You can't un-register the default */
+ return (EPERM);
+ }
+ rw_wlock(&tcp_function_lock);
+ if (blk == tcp_func_set_ptr) {
+ /* You can't free the current default */
+ rw_wunlock(&tcp_function_lock);
+ return (EBUSY);
+ }
+ if (blk->tfb_refcnt) {
+ /* Still tcb attached, mark it. */
+ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+ rw_wunlock(&tcp_function_lock);
+ return (EBUSY);
+ }
+ lblk = find_tcp_fb_locked(blk, &f);
+ if (lblk) {
+ /* Found */
+ TAILQ_REMOVE(&t_functions, f, tf_next);
+ f->tf_fb = NULL;
+ free(f, M_TCPFUNCTIONS);
+ error = 0;
+ }
+ rw_wunlock(&tcp_function_lock);
+ return (error);
+}
+
void
tcp_init(void)
{
@@ -325,7 +593,10 @@ tcp_init(void)
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
&V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
-
+ /* Setup the tcp function block list */
+ TAILQ_INIT(&t_functions);
+ rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0);
+ register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
hashsize = TCBHASHSIZE;
TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
if (hashsize == 0) {
@@ -768,7 +1039,13 @@ tcp_newtcpcb(struct inpcb *inp)
tp->ccv = &tm->ccv;
tp->ccv->type = IPPROTO_TCP;
tp->ccv->ccvc.tcp = tp;
-
+ rw_rlock(&tcp_function_lock);
+ tp->t_fb = tcp_func_set_ptr;
+ refcount_acquire(&tp->t_fb->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
/*
* Use the current system default CC algorithm.
*/
@@ -779,12 +1056,18 @@ tcp_newtcpcb(struct inpcb *inp)
if (CC_ALGO(tp)->cb_init != NULL)
if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->tfb_refcnt);
uma_zfree(V_tcpcb_zone, tm);
return (NULL);
}
tp->osd = &tm->osd;
if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->tfb_refcnt);
uma_zfree(V_tcpcb_zone, tm);
return (NULL);
}
@@ -925,7 +1208,7 @@ tcp_drop(struct tcpcb *tp, int errno)
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tcp_state_change(tp, TCPS_CLOSED);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
TCPSTAT_INC(tcps_drops);
} else
TCPSTAT_INC(tcps_conndrops);
@@ -960,6 +1243,10 @@ tcp_discardcb(struct tcpcb *tp)
tcp_timer_stop(tp, TT_KEEP);
tcp_timer_stop(tp, TT_2MSL);
tcp_timer_stop(tp, TT_DELACK);
+ if (tp->t_fb->tfb_tcp_timer_stop_all) {
+ /* Call the stop-all function of the methods */
+ tp->t_fb->tfb_tcp_timer_stop_all(tp);
+ }
/*
* If we got enough samples through the srtt filter,
@@ -1044,6 +1331,14 @@ tcp_discardcb(struct tcpcb *tp)
inp->inp_ppcb = NULL;
if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
/* We own the last reference on tcpcb, let's free it. */
+ if ((tp->t_fb->tfb_tcp_timers_left) &&
+ (tp->t_fb->tfb_tcp_timers_left(tp))) {
+ /* Some fb timers left running! */
+ return;
+ }
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_inpcb = NULL;
uma_zfree(V_tcpcb_zone, tp);
released = in_pcbrele_wlocked(inp);
@@ -1105,6 +1400,14 @@ tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
tp->t_timers->tt_flags &= ~timer_type;
if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
/* We own the last reference on this tcpcb, let's free it. */
+ if ((tp->t_fb->tfb_tcp_timers_left) &&
+ (tp->t_fb->tfb_tcp_timers_left(tp))) {
+ /* Some fb timers left running! */
+ goto leave;
+ }
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_inpcb = NULL;
uma_zfree(V_tcpcb_zone, tp);
if (in_pcbrele_wlocked(inp)) {
@@ -1113,6 +1416,7 @@ tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
return;
}
}
+leave:
INP_WUNLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
@@ -1865,7 +2169,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_SACK_PERMIT)
EXIT_FASTRECOVERY(tp->t_flags);
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
#ifdef INET
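
The two sysctl handlers added above export the registered stack list and the system-wide default under net.inet.tcp. The snippet below is a minimal userland sketch using sysctlbyname(3); it assumes the patched kernel is running and that a block named "fastack" has been registered.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char avail[4096];
	size_t len = sizeof(avail);
	const char *dflt = "fastack";

	/* Dump the table produced by net.inet.tcp.functions_available. */
	if (sysctlbyname("net.inet.tcp.functions_available", avail, &len,
	    NULL, 0) == 0)
		fwrite(avail, 1, len, stdout);

	/* Make "fastack" the default block for newly created tcpcbs. */
	if (sysctlbyname("net.inet.tcp.functions_default", NULL, NULL,
	    dflt, strlen(dflt) + 1) == -1)
		perror("net.inet.tcp.functions_default");

	return (0);
}

The same can be done from the shell with sysctl net.inet.tcp.functions_available and sysctl net.inet.tcp.functions_default=fastack.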
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index e5a38d5..8ad1b22 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
@@ -626,6 +627,7 @@ done:
static struct socket *
syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
{
+ struct tcp_function_block *blk;
struct inpcb *inp = NULL;
struct socket *so;
struct tcpcb *tp;
@@ -817,6 +819,26 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
tp->irs = sc->sc_irs;
tcp_rcvseqinit(tp);
tcp_sendseqinit(tp);
+ blk = sototcpcb(lso)->t_fb;
+ if (blk != tp->t_fb) {
+ /*
+ * Our parents t_fb was not the default,
+ * we need to release our ref on tp->t_fb and
+ * pickup one on the new entry.
+ */
+ struct tcp_function_block *rblk;
+
+ rblk = find_and_ref_tcp_fb(blk);
+ KASSERT(rblk != NULL,
+ ("cannot find blk %p out of syncache?", blk));
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_fb = rblk;
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
+ }
tp->snd_wl1 = sc->sc_irs;
tp->snd_max = tp->iss + 1;
tp->snd_nxt = tp->iss + 1;
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index d129586..6a24ce7 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -292,7 +292,7 @@ tcp_timer_delack(void *xtp)
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(inp);
CURVNET_RESTORE();
}
@@ -543,7 +543,7 @@ tcp_timer_persist(void *xtp)
}
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
out:
@@ -798,7 +798,7 @@ tcp_timer_rexmt(void * xtp)
cc_cong_signal(tp, NULL, CC_RTO);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
out:
#ifdef TCPDEBUG
@@ -858,6 +858,10 @@ tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
f_reset = TT_2MSL_RST;
break;
default:
+ if (tp->t_fb->tfb_tcp_timer_activate) {
+ tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
+ return;
+ }
panic("tp %p bad timer_type %#x", tp, timer_type);
}
if (delta == 0) {
@@ -904,6 +908,9 @@ tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
t_callout = &tp->t_timers->tt_2msl;
break;
default:
+ if (tp->t_fb->tfb_tcp_timer_active) {
+ return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
+ }
panic("tp %p bad timer_type %#x", tp, timer_type);
}
return callout_active(t_callout);
@@ -945,6 +952,14 @@ tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
f_reset = TT_2MSL_RST;
break;
default:
+ if (tp->t_fb->tfb_tcp_timer_stop) {
+ /*
+ * XXXrrs we need to look at this with the
+ * stop case below (flags).
+ */
+ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
+ return;
+ }
panic("tp %p bad timer_type %#x", tp, timer_type);
}
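
register_tcp_functions() enforces an all-or-nothing rule on the optional timer hooks, and the default timer routines above fall through to those hooks whenever they see a timer type they do not recognize. A block that wants private timers therefore has to supply all five callbacks. The no-op stubs below are a hypothetical sketch of the required shapes (the mystack_* names are illustrative, and the same includes as the module sketch earlier are assumed).

/* Are any of the block's private timers still armed for this tcpcb? */
static int
mystack_timers_left(struct tcpcb *tp)
{
	return (0);
}

/* Stop every private timer; called from tcp_discardcb(). */
static int
mystack_timer_stop_all(struct tcpcb *tp)
{
	return (0);
}

/* Arm (delta != 0) or disarm (delta == 0) a stack-private timer type. */
static void
mystack_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
}

/* Is the given stack-private timer type currently armed? */
static int
mystack_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	return (0);
}

/* Stop a single stack-private timer type. */
static void
mystack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
}

These would then be filled into the tfb_tcp_timer_* members of the block before register_tcp_functions() is called.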
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index a1f8a0c..42e2ea7 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
@@ -509,7 +510,7 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
INP_WUNLOCK(inp);
@@ -579,7 +580,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
(error = tcp_offload_connect(so, nam)) == 0)
goto out;
#endif
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
goto out;
}
#endif
@@ -597,7 +598,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
@@ -773,7 +774,7 @@ tcp_usr_shutdown(struct socket *so)
socantsendmore(so);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_SHUTDOWN);
@@ -809,7 +810,7 @@ tcp_usr_rcvd(struct socket *so, int flags)
tcp_offload_rcvd(tp);
else
#endif
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_RCVD);
@@ -911,7 +912,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
!(flags & PRUS_NOTREADY)) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
@@ -961,7 +962,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
if (!(flags & PRUS_NOTREADY)) {
tp->t_flags |= TF_FORCEDATA;
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
}
}
@@ -997,7 +998,7 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
error = sbready(&so->so_snd, m, count);
SOCKBUF_UNLOCK(&so->so_snd);
if (error == 0)
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(inp);
return (error);
@@ -1349,13 +1350,11 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
- int error, opt, optval;
- u_int ui;
+ int error;
struct inpcb *inp;
struct tcpcb *tp;
- struct tcp_info ti;
- char buf[TCP_CA_NAME_MAX];
- struct cc_algo *algo;
+ struct tcp_function_block *blk;
+ struct tcp_function_set fsn;
error = 0;
inp = sotoinpcb(so);
@@ -1383,7 +1382,83 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp);
return (ECONNRESET);
}
+ tp = intotcpcb(inp);
+ /*
+ * Protect the TCP option TCP_FUNCTION_BLK so
+ * that a sub-function can *never* overwrite this.
+ */
+ if ((sopt->sopt_dir == SOPT_SET) &&
+ (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &fsn, sizeof fsn,
+ sizeof fsn);
+ if (error)
+ return (error);
+ INP_WLOCK_RECHECK(inp);
+ if (tp->t_state != TCPS_CLOSED) {
+ /*
+ * The user has advanced the state
+			 * past the initial point; we can't
+			 * switch now, since a new set of
+			 * functions may not be compatible
+			 * with the connection's current state.
+ */
+ INP_WUNLOCK(inp);
+ return(EINVAL);
+ }
+ blk = find_and_ref_tcp_functions(&fsn);
+ if (blk == NULL) {
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ if (tp->t_fb != blk) {
+ if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquires a ref on the
+ * new one.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_fb = blk;
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
+ }
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ tcp_offload_ctloutput(tp, sopt->sopt_dir,
+ sopt->sopt_name);
+ }
+#endif
+ INP_WUNLOCK(inp);
+ return (error);
+ } else if ((sopt->sopt_dir == SOPT_GET) &&
+ (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+ strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name);
+ fsn.pcbcnt = tp->t_fb->tfb_refcnt;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &fsn, sizeof fsn);
+ return (error);
+ }
+	/* Pass in the INP locked; the callee must unlock it. */
+ return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
+}
+int
+tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+ int error, opt, optval;
+ u_int ui;
+ struct tcp_info ti;
+ struct cc_algo *algo;
+ char buf[TCP_CA_NAME_MAX];
+
switch (sopt->sopt_dir) {
case SOPT_SET:
switch (sopt->sopt_name) {
@@ -1451,7 +1526,7 @@ unlock_and_done:
else if (tp->t_flags & TF_NOPUSH) {
tp->t_flags &= ~TF_NOPUSH;
if (TCPS_HAVEESTABLISHED(tp->t_state))
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
}
goto unlock_and_done;
@@ -1770,7 +1845,7 @@ tcp_disconnect(struct tcpcb *tp)
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
}
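
With the tcp_usrreq.c changes above, TCP_FUNCTION_BLK becomes a regular IPPROTO_TCP socket option: a set is only honoured while the connection is still in the CLOSED state, and a get reports the block name plus its pcb count. A minimal userland sketch, assuming the patched headers that declare TCP_FUNCTION_BLK and struct tcp_function_set, and a loaded "fastslow" stack:

#include <sys/socket.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <stdio.h>
#include <string.h>

/* Attach the socket to the "fastslow" stack before it is connected. */
static int
select_stack(int s)
{
	struct tcp_function_set fs;
	socklen_t len = sizeof(fs);

	memset(&fs, 0, sizeof(fs));
	strncpy(fs.function_set_name, "fastslow",
	    sizeof(fs.function_set_name) - 1);
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs,
	    sizeof(fs)) == -1)
		return (-1);

	/* Read back which block the socket now uses. */
	if (getsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, &len) == -1)
		return (-1);
	printf("stack %s, %u pcbs\n", fs.function_set_name,
	    (unsigned)fs.pcbcnt);
	return (0);
}

A listening socket configured this way passes its block on to accepted connections through the syncache change earlier in the patch.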
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index bf8347a..2b3954a 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -89,6 +89,52 @@ struct tcptemp {
#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */
+/*
+ * TODO: We still need to brave plowing into tcp_input()
+ * and the pru_usrreq() block.  Right now these go through
+ * the old standard paths, which are somewhat OK for now,
+ * but in the long term they may need to be changed.
+ * If we do tackle tcp_input(), then we also need to get
+ * rid of the tcp_do_segment() declaration that appears
+ * below.
+ */
+/* Flags for tcp functions */
+#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
+struct tcpcb;
+struct inpcb;
+struct sockopt;
+struct socket;
+
+struct tcp_function_block {
+ char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
+ int (*tfb_tcp_output)(struct tcpcb *);
+ void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int);
+ int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp);
+ /* Optional memory allocation/free routine */
+ void (*tfb_tcp_fb_init)(struct tcpcb *);
+ void (*tfb_tcp_fb_fini)(struct tcpcb *);
+ /* Optional timers, must define all if you define one */
+ int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
+ int (*tfb_tcp_timers_left)(struct tcpcb *);
+ void (*tfb_tcp_timer_activate)(struct tcpcb *,
+ uint32_t, u_int);
+ int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
+ void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
+ volatile uint32_t tfb_refcnt;
+ uint32_t tfb_flags;
+};
+
+struct tcp_function {
+ TAILQ_ENTRY(tcp_function) tf_next;
+ struct tcp_function_block *tf_fb;
+};
+
+TAILQ_HEAD(tcp_funchead, tcp_function);
+
/*
* Tcp control block, one per tcp; fields:
* Organized for 16 byte cacheline efficiency.
@@ -207,9 +253,10 @@ struct tcpcb {
u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
u_int t_pmtud_saved_maxopd; /* pre-blackhole MSS */
u_int t_flags2; /* More tcpcb flags storage */
-
uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */
- void *t_pspare2[4]; /* 1 TCP_SIGNATURE, 3 TBD */
+ struct tcp_function_block *t_fb;/* TCP function call block */
+ void *t_fb_ptr; /* Pointer to t_fb specific data */
+ void *t_pspare2[2]; /* 1 TCP_SIGNATURE, 1 TBD */
#if defined(_KERNEL) && defined(TCPPCAP)
struct mbufq t_inpkts; /* List of saved input packets. */
struct mbufq t_outpkts; /* List of saved output packets. */
@@ -534,6 +581,8 @@ struct tcpstat {
#define tcps_rcvmemdrop tcps_rcvreassfull /* compat */
#ifdef _KERNEL
+#define TI_UNLOCKED 1
+#define TI_RLOCKED 2
#include <sys/counter.h>
VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */
@@ -684,7 +733,32 @@ char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
void tcp_reass_global_init(void);
void tcp_reass_flush(struct tcpcb *);
+void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+ struct tcpcb *, int, int);
+void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+void tcp_xmit_timer(struct tcpcb *, int);
+void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+ uint16_t type);
+void cc_conn_init(struct tcpcb *tp);
+void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+void hhook_run_tcp_est_in(struct tcpcb *tp,
+ struct tcphdr *th, struct tcpopt *to);
+
int tcp_input(struct mbuf **, int *, int);
+void tcp_do_segment(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *, int, int, uint8_t,
+ int);
+
+int register_tcp_functions(struct tcp_function_block *blk, int wait);
+int deregister_tcp_functions(struct tcp_function_block *blk);
+struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
+struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
+
u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
@@ -752,8 +826,6 @@ int tcp_newreno(struct tcpcb *, struct tcphdr *);
u_long tcp_seq_subtract(u_long, u_long );
int tcp_compute_pipe(struct tcpcb *);
-void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
-
static inline void
tcp_fields_to_host(struct tcphdr *th)
{
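
The new t_fb_ptr member gives a block somewhere to hang per-connection state, and tfb_tcp_fb_init()/tfb_tcp_fb_fini() are the hooks that manage it: init runs from tcp_newtcpcb() (and from the syncache when an accepted connection switches blocks), fini runs when the tcpcb is discarded or the socket moves to another block. A hypothetical sketch, with the mystack names and malloc type purely illustrative:

static MALLOC_DEFINE(M_MYSTACK, "mystackpcb", "mystack per-connection state");

struct mystack_pcb {
	uint32_t	msp_acks_seen;	/* example per-connection counter */
};

static void
mystack_fb_init(struct tcpcb *tp)
{
	/*
	 * Callers do not check for failure, so the stack itself must
	 * cope with tp->t_fb_ptr being NULL later on.
	 */
	tp->t_fb_ptr = malloc(sizeof(struct mystack_pcb), M_MYSTACK,
	    M_NOWAIT | M_ZERO);
}

static void
mystack_fb_fini(struct tcpcb *tp)
{
	if (tp->t_fb_ptr != NULL) {
		free(tp->t_fb_ptr, M_MYSTACK);
		tp->t_fb_ptr = NULL;
	}
}

These would be wired up as .tfb_tcp_fb_init and .tfb_tcp_fb_fini in the block's tcp_function_block initializer.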
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 87d8856..ef82f8a 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -509,7 +509,7 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
KASSERT(!(tp->t_flags & TF_TOE),
("%s: tp %p still offloaded.", __func__, tp));
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
} else {
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);