diff options
author | kmacy <kmacy@FreeBSD.org> | 2008-02-23 01:06:17 +0000 |
---|---|---|
committer | kmacy <kmacy@FreeBSD.org> | 2008-02-23 01:06:17 +0000 |
commit | 48fe676ff5ddc104ebc346eebf48c7c0e285f833 (patch) | |
tree | 02a3e854ca5eb4caea80ce68a9a12f620befb52d /sys/dev/cxgb/ulp | |
parent | df26e399aa077b14fb965be866012bccf2847bae (diff) | |
download | FreeBSD-src-48fe676ff5ddc104ebc346eebf48c7c0e285f833.zip FreeBSD-src-48fe676ff5ddc104ebc346eebf48c7c0e285f833.tar.gz |
- update firmware to 5.0
- add support for T3C
- add DDP support (zero-copy receive)
- fix TOE transmit of large requests
- fix shutdown so that sockets don't remain in CLOSING state indefinitely
- register listeners when an interface is brought up after tom is loaded
- fix setting of multicast filter
- enable link at device attach
- exit tick handler if shutdown is in progress
- add helper for logging TCB
- add sysctls for dumping transmit queues
- note that TOE will not be MFC'd until after 7.0 has been finalized
MFC after: 3 days
Diffstat (limited to 'sys/dev/cxgb/ulp')
-rw-r--r-- | sys/dev/cxgb/ulp/toecore/cxgb_toedev.h | 4 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 1569 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c | 729 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_ddp.c | 735 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_defs.h | 10 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_listen.c | 22 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h | 52 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c | 694 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c | 1362 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_toepcb.h | 81 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tom.c | 102 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tom.h | 2 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c | 18 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_vm.c | 180 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_vm.h | 40 |
15 files changed, 2898 insertions, 2702 deletions
diff --git a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h index 8e88d6b..c70c37d 100644 --- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h +++ b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h @@ -41,6 +41,8 @@ enum { TOE_ID_CHELSIO_T2, TOE_ID_CHELSIO_T3, TOE_ID_CHELSIO_T3B, -}; + TOE_ID_CHELSIO_T3C, +} + ; #endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c index 0f2f2ee..96e5b65 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/limits.h> +#include <sys/ktr.h> #include <sys/lock.h> #include <sys/mbuf.h> #include <sys/mutex.h> @@ -63,9 +64,9 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_offload.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_syncache.h> +#include <netinet/tcp_timer.h> #include <net/route.h> - #include <dev/cxgb/t3cdev.h> #include <dev/cxgb/common/cxgb_firmware_exports.h> #include <dev/cxgb/common/cxgb_t3_cpl.h> @@ -84,8 +85,6 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/ulp/tom/cxgb_toepcb.h> #include <dev/cxgb/ulp/tom/cxgb_tcp.h> - - /* * For ULP connections HW may add headers, e.g., for digests, that aren't part * of the messages sent by the host but that are part of the TCP payload and @@ -118,7 +117,7 @@ static unsigned int wrlen __read_mostly; * in the skb and whether it has any payload in its main body. This maps the * length of the gather list represented by an skb into the # of necessary WRs. */ -static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly; +static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; /* * Max receive window supported by HW in bytes. 
Only a small part of it can @@ -147,6 +146,37 @@ static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); static inline void free_atid(struct t3cdev *cdev, unsigned int tid); static void handle_syncache_event(int event, void *arg); +static inline void +SBAPPEND(struct sockbuf *sb, struct mbuf *n) +{ + struct mbuf * m; + + m = sb->sb_mb; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + m = n; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + sbappend_locked(sb, n); + m = sb->sb_mb; + while (m) { + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } +} static inline int is_t3a(const struct toedev *dev) @@ -166,6 +196,7 @@ dump_toepcb(struct toepcb *toep) toep->tp_mss_clamp, toep->tp_flags); } +#ifndef RTALLOC2_DEFINED static struct rtentry * rtalloc2(struct sockaddr *dst, int report, u_long ignflags) { @@ -176,7 +207,7 @@ rtalloc2(struct sockaddr *dst, int report, u_long ignflags) return (rt); } - +#endif /* * Determine whether to send a CPL message now or defer it. A message is * deferred if the connection is in SYN_SENT since we don't know the TID yet. @@ -185,39 +216,39 @@ rtalloc2(struct sockaddr *dst, int report, u_long ignflags) * it is sent directly. 
*/ static inline void -send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t) +send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) { - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp = toep->tp_tp; - if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { INP_LOCK(tp->t_inpcb); mbufq_tail(&toep->out_of_order_queue, m); // defer INP_UNLOCK(tp->t_inpcb); } else if (through_l2t) - l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T else - cxgb_ofld_send(T3C_DEV(so), m); // send directly + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly } static inline unsigned int -mkprio(unsigned int cntrl, const struct socket *so) +mkprio(unsigned int cntrl, const struct toepcb *toep) { - return cntrl; + return (cntrl); } /* * Populate a TID_RELEASE WR. The skb must be already propely sized. */ static inline void -mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid) +mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) { struct cpl_tid_release *req; - m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so)); + m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); m->m_pkthdr.len = m->m_len = sizeof(*req); req = mtod(m, struct cpl_tid_release *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); } @@ -257,6 +288,8 @@ make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) } } +#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ + int t3_push_frames(struct socket *so, int req_completion) { @@ -266,9 +299,8 @@ t3_push_frames(struct socket *so, int req_completion) struct mbuf *tail, *m0, *last; struct t3cdev *cdev; struct tom_data *d; - int bytes, count, total_bytes; + int i, bytes, count, total_bytes; bus_dma_segment_t segs[TX_MAX_SEGS], *segp; - segp = segs; if (tp->t_state == TCPS_SYN_SENT || tp->t_state 
== TCPS_CLOSED) { DPRINTF("tcp state=%d\n", tp->t_state); @@ -281,10 +313,9 @@ t3_push_frames(struct socket *so, int req_completion) return (0); } - INP_LOCK_ASSERT(tp->t_inpcb); + INP_LOCK_ASSERT(tp->t_inpcb); SOCKBUF_LOCK(&so->so_snd); - d = TOM_DATA(TOE_DEV(so)); cdev = d->cdev; last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb; @@ -306,61 +337,103 @@ t3_push_frames(struct socket *so, int req_completion) toep->tp_m_last = NULL; while (toep->tp_wr_avail && (tail != NULL)) { count = bytes = 0; + segp = segs; if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } - while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) - && (tail != NULL) && (count < TX_MAX_SEGS)) { - bytes += tail->m_len; - count++; + /* + * If the data in tail fits as in-line, then + * make an immediate data wr. + */ + if (tail->m_len <= IMM_LEN) { + count = 1; + bytes = tail->m_len; last = tail; - /* - * technically an abuse to be using this for a VA - * but less gross than defining my own structure - * or calling pmap_kextract from here :-| - */ - segp->ds_addr = (bus_addr_t)tail->m_data; - segp->ds_len = tail->m_len; - DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", - count, mbuf_wrs[count], tail->m_data, tail->m_len); - - segp++; tail = tail->m_next; + m_set_sgl(m0, NULL); + m_set_sgllen(m0, 0); + make_tx_data_wr(so, m0, bytes, tail); + m_append(m0, bytes, mtod(last, caddr_t)); + KASSERT(!m0->m_next, ("bad append")); + } else { + while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) + && (tail != NULL) && (count < TX_MAX_SEGS-1)) { + bytes += tail->m_len; + last = tail; + count++; + /* + * technically an abuse to be using this for a VA + * but less gross than defining my own structure + * or calling pmap_kextract from here :-| + */ + segp->ds_addr = (bus_addr_t)tail->m_data; + segp->ds_len = tail->m_len; + DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", + count, mbuf_wrs[count], tail->m_data, tail->m_len); + segp++; + 
tail = tail->m_next; + } + DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", + toep->tp_wr_avail, count, mbuf_wrs[count], tail); + + m_set_sgl(m0, segs); + m_set_sgllen(m0, count); + make_tx_data_wr(so, m0, bytes, tail); } - DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", - toep->tp_wr_avail, count, mbuf_wrs[count], tail); + m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); + if (tail) { so->so_snd.sb_sndptr = tail; toep->tp_m_last = NULL; } else toep->tp_m_last = so->so_snd.sb_sndptr = last; + DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); so->so_snd.sb_sndptroff += bytes; total_bytes += bytes; toep->tp_write_seq += bytes; - - - SOCKBUF_UNLOCK(&so->so_snd); - - /* - * XXX can drop socket buffer lock here - */ + CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d", + toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff); + if (tail) + CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una); + else + CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tp->snd_una); + + + i = 0; + while (i < count && m_get_sgllen(m0)) { + if ((count - i) >= 3) { + CTR6(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len, + segs[i + 2].ds_addr, segs[i + 2].ds_len); + i += 3; + } else if ((count - i) == 2) { + CTR4(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len); + i += 2; + } else { + CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len); + i++; + } - toep->tp_wr_avail -= mbuf_wrs[count]; - toep->tp_wr_unacked += mbuf_wrs[count]; + } - make_tx_data_wr(so, m0, bytes, tail); - m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so)); - m_set_sgl(m0, 
segs); - m_set_sgllen(m0, count); - /* + /* * remember credits used */ m0->m_pkthdr.csum_data = mbuf_wrs[count]; m0->m_pkthdr.len = bytes; + toep->tp_wr_avail -= mbuf_wrs[count]; + toep->tp_wr_unacked += mbuf_wrs[count]; + if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || toep->tp_wr_unacked >= toep->tp_wr_max / 2) { struct work_request_hdr *wr = cplhdr(m0); @@ -368,18 +441,16 @@ t3_push_frames(struct socket *so, int req_completion) wr->wr_hi |= htonl(F_WR_COMPL); toep->tp_wr_unacked = 0; } - + KASSERT((m0->m_pkthdr.csum_data > 0) && + (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", + m0->m_pkthdr.csum_data)); m0->m_type = MT_DONTFREE; enqueue_wr(toep, m0); DPRINTF("sending offload tx with %d bytes in %d segments\n", bytes, count); - l2t_send(cdev, m0, toep->tp_l2t); - if (toep->tp_wr_avail && (tail != NULL)) - SOCKBUF_LOCK(&so->so_snd); } - - SOCKBUF_UNLOCK_ASSERT(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); return (total_bytes); } @@ -467,13 +538,105 @@ t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail req = mtod(m, struct cpl_rx_data_ack *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); - m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); + m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); return (credits); } +/* + * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. + * This is only used in DDP mode, so we take the opportunity to also set the + * DACK mode and flush any Rx credits. 
+ */ +void +t3_send_rx_modulate(struct toepcb *toep) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + + m = m_gethdr_nofail(sizeof(*req)); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(1) | + V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); + toep->tp_rcv_wup = toep->tp_copied_seq; +} + +/* + * Handle receipt of an urgent pointer. + */ +static void +handle_urg_ptr(struct socket *so, uint32_t urg_seq) +{ +#ifdef URGENT_DATA_SUPPORTED + struct tcpcb *tp = sototcpcb(so); + + urg_seq--; /* initially points past the urgent data, per BSD */ + + if (tp->urg_data && !after(urg_seq, tp->urg_seq)) + return; /* duplicate pointer */ + sk_send_sigurg(sk); + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + tp->copied_seq++; + if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) + tom_eat_skb(sk, skb, 0); + } + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = urg_seq; +#endif +} + +/* + * Returns true if a socket cannot accept new Rx data. + */ +static inline int +so_no_receive(const struct socket *so) +{ + return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); +} + +/* + * Process an urgent data notification. + */ +static void +rx_urg_notify(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_urg_notify *hdr = cplhdr(m); + struct socket *so = toeptoso(toep); + + VALIDATE_SOCK(so); + + if (!so_no_receive(so)) + handle_urg_ptr(so, ntohl(hdr->seq)); + + m_freem(m); +} + +/* + * Handler for RX_URG_NOTIFY CPL messages. 
+ */ +static int +do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + rx_urg_notify(toep, m); + return (0); +} /* * Set of states for which we should return RX credits. @@ -485,7 +648,7 @@ t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail * to the HW for the amount of data processed. */ void -t3_cleanup_rbuf(struct tcpcb *tp) +t3_cleanup_rbuf(struct tcpcb *tp, int copied) { struct toepcb *toep = tp->t_toe; struct socket *so; @@ -493,23 +656,38 @@ t3_cleanup_rbuf(struct tcpcb *tp) int dack_mode, must_send, read; u32 thres, credits, dack = 0; + so = tp->t_inpcb->inp_socket; if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || - (tp->t_state == TCPS_FIN_WAIT_2))) + (tp->t_state == TCPS_FIN_WAIT_2))) { + if (copied) { + SOCKBUF_LOCK(&so->so_rcv); + toep->tp_copied_seq += copied; + SOCKBUF_UNLOCK(&so->so_rcv); + } + return; - INP_LOCK_ASSERT(tp->t_inpcb); + } - so = tp->t_inpcb->inp_socket; + INP_LOCK_ASSERT(tp->t_inpcb); SOCKBUF_LOCK(&so->so_rcv); - read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; - toep->tp_copied_seq += read; - toep->tp_enqueued_bytes -= read; + if (copied) + toep->tp_copied_seq += copied; + else { + read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; + toep->tp_copied_seq += read; + } credits = toep->tp_copied_seq - toep->tp_rcv_wup; + toep->tp_enqueued_bytes = so->so_rcv.sb_cc; SOCKBUF_UNLOCK(&so->so_rcv); - if (credits > so->so_rcv.sb_mbmax) + if (credits > so->so_rcv.sb_mbmax) { printf("copied_seq=%u rcv_wup=%u credits=%u\n", toep->tp_copied_seq, toep->tp_rcv_wup, credits); - /* + credits = so->so_rcv.sb_mbmax; + } + + + /* * XXX this won't accurately reflect credit return - we need * to look at the difference between the amount that has been * put in the recv sockbuf and what is there now @@ -593,7 +771,7 @@ static int cxgb_toe_rcvd(struct tcpcb *tp) { INP_LOCK_ASSERT(tp->t_inpcb); - t3_cleanup_rbuf(tp); + 
t3_cleanup_rbuf(tp, 0); return (0); } @@ -631,16 +809,18 @@ static struct toe_usrreqs cxgb_toe_usrreqs = { static void -__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, +__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, uint64_t mask, uint64_t val, int no_reply) { struct cpl_set_tcb_field *req; - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + + CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + toep->tp_tid, word, mask, val); req = mtod(m, struct cpl_set_tcb_field *); m->m_pkthdr.len = m->m_len = sizeof(*req); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); req->reply = V_NO_REPLY(no_reply); req->cpu_idx = 0; @@ -648,8 +828,8 @@ __set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, req->mask = htobe64(mask); req->val = htobe64(val); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); - send_or_defer(so, tp, m, 0); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + send_or_defer(toep, m, 0); } static void @@ -661,13 +841,15 @@ t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val) if (toep == NULL) return; - - if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) + + if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { + printf("not seting field\n"); return; - + } + m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); - __set_tcb_field(so, m, word, mask, val, 1); + __set_tcb_field(toep, m, word, mask, val, 1); } /* @@ -735,10 +917,11 @@ t3_set_tos(struct socket *so) static void t3_enable_ddp(struct socket *so, int on) { - if (on) + if (on) { + t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), V_TF_DDP_OFF(0)); - else + } else t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_MASK, @@ -747,7 +930,6 @@ t3_enable_ddp(struct socket *so, int on) } - void 
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color) { @@ -777,7 +959,7 @@ t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset, static int t3_set_cong_control(struct socket *so, const char *name) { -#ifdef notyet +#ifdef CONGESTION_CONTROL_SUPPORTED int cong_algo; for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) @@ -802,12 +984,14 @@ t3_get_tcb(struct socket *so) return (ENOMEM); INP_LOCK_ASSERT(tp->t_inpcb); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); req = mtod(m, struct cpl_get_tcb *); m->m_pkthdr.len = m->m_len = sizeof(*req); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); req->cpuno = htons(toep->tp_qset); + req->rsvd = 0; if (sototcpcb(so)->t_state == TCPS_SYN_SENT) mbufq_tail(&toep->out_of_order_queue, m); // defer else @@ -863,14 +1047,6 @@ select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) return (idx); } -void -t3_release_ddp_resources(struct toepcb *toep) -{ - /* - * This is a no-op until we have DDP support - */ -} - static inline void free_atid(struct t3cdev *cdev, unsigned int tid) { @@ -915,8 +1091,6 @@ t3_release_offload_resources(struct toepcb *toep) l2t_release(L2DATA(cdev), toep->tp_l2t); toep->tp_l2t = NULL; } - printf("setting toep->tp_tp to NULL\n"); - toep->tp_tp = NULL; if (tp) { INP_LOCK_ASSERT(tp->t_inpcb); @@ -964,16 +1138,16 @@ select_rcv_wscale(int space) if (tcp_do_rfc1323) for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; - return wscale; + + return (wscale); } /* * Determine the receive window size for a socket. 
*/ -static unsigned int -select_rcv_wnd(struct socket *so) +static unsigned long +select_rcv_wnd(struct toedev *dev, struct socket *so) { - struct toedev *dev = TOE_DEV(so); struct tom_data *d = TOM_DATA(dev); unsigned int wnd; unsigned int max_rcv_wnd; @@ -981,7 +1155,9 @@ select_rcv_wnd(struct socket *so) if (tcp_do_autorcvbuf) wnd = tcp_autorcvbuf_max; else - wnd = sbspace(&so->so_rcv); + wnd = so->so_rcv.sb_hiwat; + + /* XXX * For receive coalescing to work effectively we need a receive window @@ -991,7 +1167,7 @@ select_rcv_wnd(struct socket *so) wnd = MIN_RCV_WND; /* PR 5138 */ - max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ? + max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? (uint32_t)d->rx_page_size * 23 : MAX_RCV_WND); @@ -1017,7 +1193,8 @@ init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, * or we need to add this */ so->so_snd.sb_flags |= SB_NOCOALESCE; - + so->so_rcv.sb_flags |= SB_NOCOALESCE; + tp->t_toe = toep; toep->tp_tp = tp; toep->tp_toedev = dev; @@ -1033,7 +1210,8 @@ init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, * XXX broken * */ - tp->rcv_wnd = select_rcv_wnd(so); + tp->rcv_wnd = select_rcv_wnd(dev, so); + toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) && tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; toep->tp_qset_idx = 0; @@ -1076,9 +1254,23 @@ calc_opt2(const struct socket *so, struct toedev *dev) flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); - return V_FLAVORS_VALID(flv_valid) | - V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0); + return (V_FLAVORS_VALID(flv_valid) | + V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0)); } + +#if DEBUG_WR > 1 +static int +count_pending_wrs(const struct toepcb *toep) +{ + const struct mbuf *m; + int n = 0; + + wr_queue_walk(toep, m) + n += m->m_pkthdr.csum_data; + return (n); +} +#endif + #if 0 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) #endif @@ -1093,18 +1285,18 @@ mk_act_open_req(struct socket *so, struct mbuf *m, struct toepcb *toep = tp->t_toe; struct toedev *tdev = TOE_DEV(so); - m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so)); + m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep)); req = mtod(m, struct cpl_act_open_req *); m->m_pkthdr.len = m->m_len = sizeof(*req); - + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); req->local_port = inp->inp_lport; req->peer_port = inp->inp_fport; memcpy(&req->local_ip, &inp->inp_laddr, 4); memcpy(&req->peer_ip, &inp->inp_faddr, 4); - DPRINTF("connect smt_idx=%d\n", e->smt_idx); req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); @@ -1144,7 +1336,7 @@ fail_act_open(struct toepcb *toep, int errno) t3_release_offload_resources(toep); if (tp) { INP_LOCK_ASSERT(tp->t_inpcb); - cxgb_tcp_drop(tp, errno); + tcp_drop(tp, errno); } #ifdef notyet @@ -1289,8 +1481,6 @@ t3_connect(struct toedev *tdev, struct socket *so, toep = tp->t_toe; m_set_toep(m, tp->t_toe); - printf("sending off request\n"); - toep->tp_state = TCPS_SYN_SENT; l2t_send(d->cdev, (struct mbuf *)m, e); @@ -1342,7 +1532,7 @@ t3_send_reset(struct toepcb *toep) mode |= CPL_ABORT_POST_CLOSE_REQ; m = m_gethdr_nofail(sizeof(*req)); - m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so)); + m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); set_arp_failure_handler(m, abort_arp_failure); req = mtod(m, struct cpl_abort_req *); @@ -1416,7 +1606,7 @@ t3_tcp_ctloutput(struct socket *so, struct 
sockopt *sopt) * XXX I need to revisit this */ if ((err = t3_set_cong_control(so, name)) == 0) { -#ifdef notyet +#ifdef CONGESTION_CONTROL_SUPPORTED tp->t_cong_control = strdup(name, M_CXGB); #endif } else @@ -1465,7 +1655,280 @@ t3_ctloutput(struct socket *so, struct sockopt *sopt) if (err != EOPNOTSUPP) return (err); - return tcp_ctloutput(so, sopt); + return (tcp_ctloutput(so, sopt)); +} + +/* + * Returns true if we need to explicitly request RST when we receive new data + * on an RX-closed connection. + */ +static inline int +need_rst_on_excess_rx(const struct toepcb *toep) +{ + return (1); +} + +/* + * Handles Rx data that arrives in a state where the socket isn't accepting + * new data. + */ +static void +handle_excess_rx(struct toepcb *toep, struct mbuf *m) +{ + + if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN)) + t3_send_reset(toep); + m_freem(m); +} + +/* + * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) + * by getting the DDP offset from the TCB. + */ +static void +tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct ddp_state *q = &toep->tp_ddp_state; + struct ddp_buf_state *bsp; + struct cpl_get_tcb_rpl *hdr; + unsigned int ddp_offset; + struct socket *so; + struct tcpcb *tp; + + uint64_t t; + __be64 *tcb; + + so = toeptoso(toep); + tp = toep->tp_tp; + + INP_LOCK_ASSERT(tp->t_inpcb); + SOCKBUF_LOCK(&so->so_rcv); + + /* Note that we only accout for CPL_GET_TCB issued by the DDP code. We + * really need a cookie in order to dispatch the RPLs. + */ + q->get_tcb_count--; + + /* It is a possible that a previous CPL already invalidated UBUF DDP + * and moved the cur_buf idx and hence no further processing of this + * skb is required. However, the app might be sleeping on + * !q->get_tcb_count and we need to wake it up. 
+ */ + if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) { + struct socket *so = toeptoso(toep); + + m_freem(m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); + return; + } + + bsp = &q->buf_state[q->cur_buf]; + hdr = cplhdr(m); + tcb = (__be64 *)(hdr + 1); + if (q->cur_buf == 0) { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); + ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); + } else { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); + ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; + } + ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset = ddp_offset; + m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset; + + CTR5(KTR_TOM, + "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u", + q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset); + KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u", + ddp_offset, m->m_cur_offset)); + +#ifdef T3_TRACE + T3_TRACE3(TIDTB(so), + "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u", + tp->rcv_nxt, q->cur_buf, ddp_offset); +#endif + +#if 0 +{ + unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; + + t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); + ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; + + t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]); + rcv_nxt = t >> S_TCB_RCV_NXT; + rcv_nxt &= M_TCB_RCV_NXT; + + t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); + rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); + rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; + + T3_TRACE2(TIDTB(sk), + "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", + ddp_flags, rcv_nxt - rx_hdr_offset); + T3_TRACE4(TB(q), + "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", + tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset 
%u", + rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); + T3_TRACE2(TB(q), + "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", + q->buf_state[0].flags, q->buf_state[1].flags); + +} +#endif + if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) { + handle_excess_rx(toep, m); + return; + } + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len"); + } +#endif + if (bsp->flags & DDP_BF_NOCOPY) { +#ifdef T3_TRACE + T3_TRACE0(TB(q), + "tcb_rpl_as_ddp_complete: CANCEL UBUF"); + + if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + printk("!cancel_ubuf"); + t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); + } +#endif + m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; + bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); + q->cur_buf ^= 1; + } else if (bsp->flags & DDP_BF_NOFLIP) { + + m->m_ddp_flags = 1; /* always a kernel buffer */ + + /* now HW buffer carries a user buffer */ + bsp->flags &= ~DDP_BF_NOFLIP; + bsp->flags |= DDP_BF_NOCOPY; + + /* It is possible that the CPL_GET_TCB_RPL doesn't indicate + * any new data in which case we're done. If in addition the + * offset is 0, then there wasn't a completion for the kbuf + * and we need to decrement the posted count. + */ + if (m->m_pkthdr.len == 0) { + if (ddp_offset == 0) { + q->kbuf_posted--; + bsp->flags |= DDP_BF_NODATA; + } + SOCKBUF_UNLOCK(&so->so_rcv); + + m_free(m); + return; + } + } else { + SOCKBUF_UNLOCK(&so->so_rcv); + /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, + * but it got here way late and nobody cares anymore. 
+ */ + m_free(m); + return; + } + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; +#ifdef T3_TRACE + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); +#endif + CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); + if (m->m_pkthdr.len == 0) + q->user_ddp_pending = 0; + else + SBAPPEND(&so->so_rcv, m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +/* + * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, + * in that case they are similar to DDP completions. + */ +static int +do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + /* OK if socket doesn't exist */ + if (toep == NULL) { + printf("null toep in do_get_tcb_rpl\n"); + return (CPL_RET_BUF_DONE); + } + + INP_LOCK(toep->tp_tp->t_inpcb); + tcb_rpl_as_ddp_complete(toep, m); + INP_UNLOCK(toep->tp_tp->t_inpcb); + + return (0); +} + +static void +handle_ddp_data(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so = toeptoso(toep); + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data *hdr = cplhdr(m); + unsigned int rcv_nxt = ntohl(hdr->seq); + + if (tp->rcv_nxt == rcv_nxt) + return; + + INP_LOCK_ASSERT(tp->t_inpcb); + SOCKBUF_LOCK(&so->so_rcv); + q = &toep->tp_ddp_state; + bsp = &q->buf_state[q->cur_buf]; + KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", + rcv_nxt, tp->rcv_nxt)); + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", + rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); + +#ifdef T3_TRACE + if 
((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "handle_ddp_data: neg len"); + } +#endif + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + /* + * For now, don't re-enable DDP after a connection fell out of DDP + * mode. + */ + q->ubuf_ddp_ready = 0; + SOCKBUF_UNLOCK(&so->so_rcv); } /* @@ -1481,32 +1944,33 @@ new_rx_data(struct toepcb *toep, struct mbuf *m) INP_LOCK(tp->t_inpcb); -#ifdef notyet - if (__predict_false(sk_no_receive(sk))) { - handle_excess_rx(so, skb); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + INP_UNLOCK(tp->t_inpcb); + TRACE_EXIT; return; } - if (ULP_MODE(tp) == ULP_MODE_TCPDDP) - handle_ddp_data(so, skb); + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) + handle_ddp_data(toep, m); + + m->m_seq = ntohl(hdr->seq); + m->m_ulp_mode = 0; /* for iSCSI */ - TCP_SKB_CB(skb)->seq = ntohl(hdr->seq); - TCP_SKB_CB(skb)->flags = 0; - skb_ulp_mode(skb) = 0; /* for iSCSI */ -#endif #if VALIDATE_SEQ - if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { - printk(KERN_ERR + if (__predict_false(m->m_seq != tp->rcv_nxt)) { + log(LOG_ERR, "%s: TID %u: Bad sequence number %u, expected %u\n", - TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq, + TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq, tp->rcv_nxt); - __kfree_skb(skb); + m_freem(m); + INP_UNLOCK(tp->t_inpcb); return; } #endif m_adj(m, sizeof(*hdr)); -#ifdef notyet +#ifdef URGENT_DATA_SUPPORTED /* * We don't handle urgent data yet */ @@ -1521,8 +1985,8 @@ new_rx_data(struct toepcb *toep, struct mbuf *m) toep->tp_delack_mode = hdr->dack_mode; toep->tp_delack_seq = tp->rcv_nxt; } - - DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, 
m->m_pkthdr.len, m->m_len, len); + CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", + m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); if (len < m->m_pkthdr.len) m->m_pkthdr.len = m->m_len = len; @@ -1532,21 +1996,29 @@ new_rx_data(struct toepcb *toep, struct mbuf *m) toep->tp_enqueued_bytes += m->m_pkthdr.len; #ifdef T3_TRACE T3_TRACE2(TIDTB(sk), - "new_rx_data: seq 0x%x len %u", - TCP_SKB_CB(skb)->seq, skb->len); + "new_rx_data: seq 0x%x len %u", + m->m_seq, m->m_pkthdr.len); #endif + INP_UNLOCK(tp->t_inpcb); SOCKBUF_LOCK(&so->so_rcv); if (sb_notify(&so->so_rcv)) DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len); - sbappend_locked(&so->so_rcv, m); - KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax, + SBAPPEND(&so->so_rcv, m); + +#ifdef notyet + /* + * We're giving too many credits to the card - but disable this check so we can keep on moving :-| + * + */ + KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1), ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax)); +#endif - INP_UNLOCK(tp->t_inpcb); - DPRINTF("sb_cc=%d sb_mbcnt=%d\n", + + CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt); if (__predict_true((so->so_state & SS_NOFDREF) == 0)) @@ -1571,22 +2043,26 @@ do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) } static void -new_rx_data_ddp(struct socket *so, struct mbuf *m) +new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data_ddp *hdr; unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; + struct socket *so = toeptoso(toep); + int nomoredata = 0; -#ifdef notyet - if (unlikely(sk_no_receive(sk))) { - handle_excess_rx(so, m); + tp = sototcpcb(so); + + INP_LOCK(tp->t_inpcb); + if 
(__predict_false(so_no_receive(so))) { + + handle_excess_rx(toep, m); + INP_UNLOCK(tp->t_inpcb); return; } -#endif - tp = sototcpcb(so); + q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->u.ddp_report); @@ -1603,69 +2079,91 @@ new_rx_data_ddp(struct socket *so, struct mbuf *m) "new_rx_data_ddp: ddp_report 0x%x", ddp_report); #endif - + CTR4(KTR_TOM, + "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " + "hdr seq 0x%x len %u", + tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), + ntohs(hdr->len)); + CTR3(KTR_TOM, + "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", + G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); + ddp_len = ntohs(hdr->len); rcv_nxt = ntohl(hdr->seq) + ddp_len; - /* - * Overload to store old rcv_next - */ - m->m_pkthdr.csum_data = tp->rcv_nxt; + m->m_seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; + tp->t_rcvtime = ticks; /* * Store the length in m->m_len. We are changing the meaning of * m->m_len here, we need to be very careful that nothing from now on * interprets ->len of this packet the usual way. */ - m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data; - + m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; + INP_UNLOCK(tp->t_inpcb); + CTR3(KTR_TOM, + "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", + m->m_len, rcv_nxt, m->m_seq); /* * Figure out where the new data was placed in the buffer and store it * in when. Assumes the buffer offset starts at 0, consumer needs to * account for page pod's pg_offset. */ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; -#ifdef notyet - TCP_SKB_CB(skb)->when = end_offset - skb->len; + m->m_cur_offset = end_offset - m->m_pkthdr.len; - /* - * We store in mac.raw the address of the gather list where the - * placement happened. 
- */ - skb->mac.raw = (unsigned char *)bsp->gl; -#endif + SOCKBUF_LOCK(&so->so_rcv); + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; bsp->cur_offset = end_offset; + toep->tp_enqueued_bytes += m->m_pkthdr.len; /* + * Length is only meaningful for kbuf + */ + if (!(bsp->flags & DDP_BF_NOCOPY)) + KASSERT(m->m_len <= bsp->gl->dgl_length, + ("length received exceeds ddp pages: len=%d dgl_length=%d", + m->m_len, bsp->gl->dgl_length)); + + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); + + + /* * Bit 0 of flags stores whether the DDP buffer is completed. * Note that other parts of the code depend on this being in bit 0. */ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { -#if 0 - TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */ -#endif panic("spurious ddp completion"); } else { - m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); - if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP)) + m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); + if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) q->cur_buf ^= 1; /* flip buffers */ } if (bsp->flags & DDP_BF_NOCOPY) { - m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY); + m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); bsp->flags &= ~DDP_BF_NOCOPY; } if (ddp_report & F_DDP_PSH) - m->m_pkthdr.csum_flags |= DDP_BF_PSH; + m->m_ddp_flags |= DDP_BF_PSH; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; + + if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { + toep->tp_delack_mode = G_DDP_DACK_MODE(ddp_report); + toep->tp_delack_seq = tp->rcv_nxt; + } + + SBAPPEND(&so->so_rcv, m); - tp->t_rcvtime = ticks; - sbappendstream_locked(&so->so_rcv, m); -#ifdef notyet - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); -#endif + if ((so->so_state & SS_NOFDREF) == 0) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); } #define DDP_ERR 
(F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ @@ -1680,7 +2178,6 @@ static int do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; - struct socket *so = toeptoso(toep); const struct cpl_rx_data_ddp *hdr = cplhdr(m); VALIDATE_SOCK(so); @@ -1688,40 +2185,50 @@ do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); - return CPL_RET_BUF_DONE; + return (CPL_RET_BUF_DONE); } #if 0 skb->h.th = tcphdr_skb->h.th; #endif - new_rx_data_ddp(so, m); + new_rx_data_ddp(toep, m); return (0); } static void -process_ddp_complete(struct socket *so, struct mbuf *m) +process_ddp_complete(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp = toep->tp_tp; + struct socket *so = toeptoso(toep); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_ddp_complete *hdr; unsigned int ddp_report, buf_idx, when; + int nomoredata = 0; -#ifdef notyet - if (unlikely(sk_no_receive(sk))) { - handle_excess_rx(sk, skb); + INP_LOCK(tp->t_inpcb); + if (__predict_false(so_no_receive(so))) { + struct inpcb *inp = sotoinpcb(so); + + handle_excess_rx(toep, m); + INP_UNLOCK(inp); return; } -#endif q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->ddp_report); buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; - bsp = &q->buf_state[buf_idx]; + m->m_pkthdr.csum_data = tp->rcv_nxt; + + SOCKBUF_LOCK(&so->so_rcv); + bsp = &q->buf_state[buf_idx]; when = bsp->cur_offset; - m->m_len = G_DDP_OFFSET(ddp_report) - when; + m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; + tp->rcv_nxt += m->m_len; + tp->t_rcvtime = ticks; + INP_UNLOCK(tp->t_inpcb); + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); #ifdef T3_TRACE T3_TRACE5(TIDTB(sk), "process_ddp_complete: 
tp->rcv_nxt 0x%x cur_offset %u " @@ -1729,12 +2236,20 @@ process_ddp_complete(struct socket *so, struct mbuf *m) tp->rcv_nxt, bsp->cur_offset, ddp_report, G_DDP_OFFSET(ddp_report), skb->len); #endif - + CTR5(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report 0x%x offset %u, len %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report), m->m_len); + bsp->cur_offset += m->m_len; - if (!(bsp->flags & DDP_BF_NOFLIP)) + if (!(bsp->flags & DDP_BF_NOFLIP)) { q->cur_buf ^= 1; /* flip buffers */ - + if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) + nomoredata=1; + } + #ifdef T3_TRACE T3_TRACE4(TIDTB(sk), "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " @@ -1742,21 +2257,26 @@ process_ddp_complete(struct socket *so, struct mbuf *m) tp->rcv_nxt, bsp->cur_offset, ddp_report, G_DDP_OFFSET(ddp_report)); #endif -#if 0 - skb->mac.raw = (unsigned char *)bsp->gl; -#endif - m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; + CTR4(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report %u offset %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report)); + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; if (bsp->flags & DDP_BF_NOCOPY) bsp->flags &= ~DDP_BF_NOCOPY; - m->m_pkthdr.csum_data = tp->rcv_nxt; - tp->rcv_nxt += m->m_len; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; - tp->t_rcvtime = ticks; - sbappendstream_locked(&so->so_rcv, m); -#ifdef notyet - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); -#endif + SBAPPEND(&so->so_rcv, m); + + if ((so->so_state & SS_NOFDREF) == 0) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); } /* @@ -1766,13 +2286,12 @@ static int do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; - struct socket *so = toeptoso(toep); VALIDATE_SOCK(so); #if 0 skb->h.th = tcphdr_skb->h.th; #endif - 
process_ddp_complete(so, m); + process_ddp_complete(toep, m); return (0); } @@ -1801,6 +2320,65 @@ enter_timewait(struct socket *so) } /* + * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This + * function deals with the data that may be reported along with the FIN. + * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to + * perform normal FIN-related processing. In the latter case 1 indicates that + * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the + * skb can be freed. + */ +static int +handle_peer_close_data(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_peer_close *req = cplhdr(m); + unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ + + if (tp->rcv_nxt == rcv_nxt) /* no data */ + return (0); + + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + + /* + * Although we discard the data we want to process the FIN so + * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + + * PEER_CLOSE without data. In particular this PEER_CLOSE + * may be what will close the connection. We return 1 because + * handle_excess_rx() already freed the packet. 
+ */ + return (1); + } + + INP_LOCK_ASSERT(tp->t_inpcb); + q = &toep->tp_ddp_state; + SOCKBUF_LOCK(&so->so_rcv); + bsp = &q->buf_state[q->cur_buf]; + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = + DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + tp->t_rcvtime = ticks; + SBAPPEND(&so->so_rcv, m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); + return (1); +} + +/* * Handle a peer FIN. */ static void @@ -1808,9 +2386,8 @@ do_peer_fin(struct socket *so, struct mbuf *m) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; - int keep = 0, dead = (so->so_state & SS_NOFDREF); - - DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead); + int keep = 0; + DPRINTF("do_peer_fin state=%d\n", tp->t_state); #ifdef T3_TRACE T3_TRACE0(TIDTB(sk),"do_peer_fin:"); @@ -1821,20 +2398,32 @@ do_peer_fin(struct socket *so, struct mbuf *m) goto out; } - -#ifdef notyet - if (ULP_MODE(tp) == ULP_MODE_TCPDDP) { - keep = handle_peer_close_data(so, skb); - if (keep < 0) - return; - } - sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(so, SOCK_DONE); -#endif INP_INFO_WLOCK(&tcbinfo); INP_LOCK(tp->t_inpcb); - if (TCPS_HAVERCVDFIN(tp->t_state) == 0) + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { + keep = handle_peer_close_data(so, m); + if (keep < 0) { + INP_INFO_WUNLOCK(&tcbinfo); + INP_UNLOCK(tp->t_inpcb); + return; + } + } + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. 
+ * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; @@ -1858,8 +2447,9 @@ do_peer_fin(struct socket *so, struct mbuf *m) t3_release_offload_resources(toep); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { tp = tcp_close(tp); - } else + } else { enter_timewait(so); + } break; default: log(LOG_ERR, @@ -1870,23 +2460,17 @@ do_peer_fin(struct socket *so, struct mbuf *m) if (tp) INP_UNLOCK(tp->t_inpcb); - if (!dead) { - DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags); - - sorwakeup(so); - sowwakeup(so); - wakeup(&so->so_timeo); -#ifdef notyet - sk->sk_state_change(sk); + DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags); - /* Do not send POLL_HUP for half duplex close. */ - if ((sk->sk_shutdown & SEND_SHUTDOWN) || - sk->sk_state == TCP_CLOSE) - sk_wake_async(so, 1, POLL_HUP); - else - sk_wake_async(so, 1, POLL_IN); +#ifdef notyet + /* Do not send POLL_HUP for half duplex close. 
*/ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(so, 1, POLL_HUP); + else + sk_wake_async(so, 1, POLL_IN); #endif - } + out: if (!keep) m_free(m); @@ -1929,8 +2513,10 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) if (toep->tp_flags & TP_ABORT_RPL_PENDING) { tp = tcp_close(tp); - } else + } else { enter_timewait(so); + soisdisconnected(so); + } break; case TCPS_LAST_ACK: /* @@ -1942,21 +2528,29 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) tp = tcp_close(tp); break; case TCPS_FIN_WAIT_1: -#ifdef notyet - dst_confirm(sk->sk_dst_cache); -#endif - soisdisconnecting(so); - - if ((so->so_state & SS_NOFDREF) == 0) { - /* - * Wake up lingering close - */ - sowwakeup(so); - sorwakeup(so); - wakeup(&so->so_timeo); - } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 && + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: + * we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + int timeout; + + soisdisconnected(so); + timeout = (tcp_fast_finwait2_recycle) ? 
+ tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + tp->t_state = TCPS_FIN_WAIT_2; + if ((so->so_options & SO_LINGER) && so->so_linger == 0 && (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { - tp = cxgb_tcp_drop(tp, 0); + tp = tcp_drop(tp, 0); } break; @@ -1970,7 +2564,7 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) if (tp) INP_UNLOCK(tp->t_inpcb); out: - m_free(m); + m_freem(m); } /* @@ -2006,6 +2600,8 @@ process_abort_rpl(struct socket *so, struct mbuf *m) "process_abort_rpl: GTS rpl pending %d", sock_flag(sk, ABORT_RPL_PENDING)); #endif + + INP_INFO_WLOCK(&tcbinfo); INP_LOCK(tp->t_inpcb); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { @@ -2020,16 +2616,14 @@ process_abort_rpl(struct socket *so, struct mbuf *m) !is_t3a(TOE_DEV(so))) { if (toep->tp_flags & TP_ABORT_REQ_RCVD) panic("TP_ABORT_REQ_RCVD set"); - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(tp->t_inpcb); t3_release_offload_resources(toep); tp = tcp_close(tp); - INP_INFO_WUNLOCK(&tcbinfo); } } } if (tp) INP_UNLOCK(tp->t_inpcb); + INP_INFO_WUNLOCK(&tcbinfo); m_free(m); } @@ -2089,7 +2683,7 @@ discard: } /* - * Convert the status code of an ABORT_REQ into a Linux error code. Also + * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also * indicate whether RST should be sent in response. */ static int @@ -2289,10 +2883,8 @@ process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev) (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { so->so_error = abort_status_to_errno(so, req->status, &rst_status); -#if 0 - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); -#endif + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup(so); /* * SYN_RECV needs special processing. If abort_syn_rcv() * returns 0 it has taken care of the abort. 
@@ -2513,7 +3105,8 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str struct tcphdr th; struct inpcb *inp; int mss, wsf, sack, ts; - + uint32_t rcv_isn = ntohl(req->rcv_isn); + bzero(&to, sizeof(struct tcpopt)); inp = sotoinpcb(lso); @@ -2522,10 +3115,11 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str */ inc.inc_fport = th.th_sport = req->peer_port; inc.inc_lport = th.th_dport = req->local_port; - toep->tp_iss = th.th_seq = req->rcv_isn; + th.th_seq = req->rcv_isn; th.th_flags = TH_SYN; - toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn); + toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; + inc.inc_isipv6 = 0; inc.inc_len = 0; @@ -2543,7 +3137,6 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str to.to_mss = mss; to.to_wscale = wsf; to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); - INP_INFO_WLOCK(&tcbinfo); INP_LOCK(inp); syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); @@ -2654,34 +3247,31 @@ process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, newtoep->tp_flags = TP_SYN_RCVD; newtoep->tp_tid = tid; newtoep->tp_toedev = tdev; + tp->rcv_wnd = select_rcv_wnd(tdev, so); - printf("inserting tid=%d\n", tid); cxgb_insert_tid(cdev, d->client, newtoep, tid); SOCK_LOCK(so); LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); SOCK_UNLOCK(so); - - if (lctx->ulp_mode) { + newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; + + if (newtoep->tp_ulp_mode) { ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); - if (!ddp_mbuf) + if (ddp_mbuf == NULL) newtoep->tp_ulp_mode = 0; - else - newtoep->tp_ulp_mode = lctx->ulp_mode; } - + + CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", + TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); - - DPRINTF("adding request to syn cache\n"); - /* * XXX workaround for lack of syncache drop */ toepcb_hold(newtoep); syncache_add_accept_req(req, so, newtoep); - - rpl = cplhdr(reply_mbuf); reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); @@ -2692,50 +3282,34 @@ process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, rpl->rsvd = rpl->opt2; /* workaround for HW bug */ rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten - DPRINTF("accept smt_idx=%d\n", e->smt_idx); - rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); - rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) | + rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | CPL_PASS_OPEN_ACCEPT); DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); - m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so)); - -#ifdef DEBUG_PRINT - { - int i; - - DPRINTF("rpl:\n"); - uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *); - - for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++) - DPRINTF("[%d] %08x\n", i, rplbuf[i]); - } -#endif - + m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); l2t_send(cdev, reply_mbuf, e); m_free(m); -#ifdef notyet - /* - * XXX this call path has to be converted to not depend on sockets - */ - if (newtoep->tp_ulp_mode) - __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS, + if (newtoep->tp_ulp_mode) { + __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_MASK, V_TF_DDP_OFF(1) | - 
TP_DDP_TIMER_WORKAROUND_VAL, 1); + TP_DDP_TIMER_WORKAROUND_VAL, 1); + } else + printf("not offloading\n"); + + -#endif return; reject: if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) mk_pass_accept_rpl(reply_mbuf, m); else - mk_tid_release(reply_mbuf, NULL, tid); + mk_tid_release(reply_mbuf, newtoep, tid); cxgb_ofld_send(cdev, reply_mbuf); m_free(m); out: @@ -2793,7 +3367,7 @@ do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) /* * Called when a connection is established to translate the TCP options - * reported by HW to Linux's native format. + * reported by HW to FreeBSD's native format. */ static void assign_rxopt(struct socket *so, unsigned int opt) @@ -2808,8 +3382,9 @@ assign_rxopt(struct socket *so, unsigned int opt) tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; - if (tp->t_flags & TF_RCVD_SCALE) - tp->rcv_scale = 0; + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) + tp->rcv_scale = tp->request_r_scale; } /* @@ -2831,8 +3406,6 @@ make_established(struct socket *so, u32 snd_isn, unsigned int opt) #if 0 inet_sk(sk)->id = tp->write_seq ^ jiffies; #endif - - /* * XXX not clear what rcv_wup maps to */ @@ -2851,7 +3424,9 @@ make_established(struct socket *so, u32 snd_isn, unsigned int opt) */ dst_confirm(sk->sk_dst_cache); #endif + tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; + soisconnected(so); } static int @@ -2948,23 +3523,21 @@ do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) tp = sototcpcb(so); INP_LOCK(tp->t_inpcb); -#ifdef notyet - so->so_snd.sb_flags |= SB_TOE; - so->so_rcv.sb_flags |= SB_TOE; -#endif + + so->so_snd.sb_flags |= SB_NOCOALESCE; + so->so_rcv.sb_flags |= SB_NOCOALESCE; + toep->tp_tp = tp; toep->tp_flags = 0; tp->t_toe = toep; reset_wr_list(toep); - tp->rcv_wnd = select_rcv_wnd(so); - DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd); + tp->rcv_wnd = 
select_rcv_wnd(tdev, so); + tp->rcv_nxt = toep->tp_copied_seq; install_offload_ops(so); toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); toep->tp_wr_unacked = 0; toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); - toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) && - tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; toep->tp_qset_idx = 0; toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); @@ -2975,8 +3548,9 @@ do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); INP_INFO_WUNLOCK(&tcbinfo); INP_UNLOCK(tp->t_inpcb); - soisconnected(so); + CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); #ifdef notyet /* * XXX not sure how these checks map to us @@ -3066,14 +3640,10 @@ socket_act_establish(struct socket *so, struct mbuf *m) fixup_and_send_ofo(so); if (__predict_false(so->so_state & SS_NOFDREF)) { -#ifdef notyet - /* - * XXX not clear what should be done here - * appears to correspond to sorwakeup_locked + /* + * XXX does this even make sense? 
*/ - sk->sk_state_change(sk); - sk_wake_async(so, 0, POLL_OUT); -#endif + sorwakeup(so); } m_free(m); #ifdef notyet @@ -3095,8 +3665,7 @@ socket_act_establish(struct socket *so, struct mbuf *m) sk->sk_write_space(sk); #endif - soisconnected(so); - toep->tp_state = tp->t_state = TCPS_ESTABLISHED; + toep->tp_state = tp->t_state; tcpstat.tcps_connects++; } @@ -3139,6 +3708,9 @@ do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) socket_act_establish(so, m); INP_UNLOCK(tp->t_inpcb); + CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); + return (0); } @@ -3156,7 +3728,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m) u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; - DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits); + CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); INP_LOCK(tp->t_inpcb); @@ -3166,18 +3738,21 @@ wr_ack(struct toepcb *toep, struct mbuf *m) while (credits) { struct mbuf *p = peek_wr(toep); - DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ; if (__predict_false(!p)) { log(LOG_ERR, "%u WR_ACK credits for TID %u with " - "nothing pending, state %u\n", - credits, toep->tp_tid, tp->t_state); + "nothing pending, state %u wr_avail=%u\n", + credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } + CTR2(KTR_TOM, + "wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len); + + KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list")); if (__predict_false(credits < p->m_pkthdr.csum_data)) { + #if DEBUG_WR > 1 struct tx_data_wr *w = cplhdr(p); -#ifdef notyet log(LOG_ERR, "TID %u got %u WR credits, need %u, len %u, " "main body %u, frags %u, seq # %u, ACK una %u," @@ -3185,8 +3760,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m) toep->tp_tid, credits, p->csum, p->len, p->len - p->data_len, skb_shinfo(p)->nr_frags, ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), - WR_AVAIL(tp), count_pending_wrs(tp) - 
credits); -#endif + toep->tp_wr_avail, count_pending_wrs(tp) - credits); #endif p->m_pkthdr.csum_data -= credits; break; @@ -3194,7 +3768,9 @@ wr_ack(struct toepcb *toep, struct mbuf *m) dequeue_wr(toep); credits -= p->m_pkthdr.csum_data; bytes += p->m_pkthdr.len; - DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len); + CTR3(KTR_TOM, + "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", + p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); m_free(p); } @@ -3228,7 +3804,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } if (bytes) { - DPRINTF("sbdrop(%d)\n", bytes); + CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, bytes); sowwakeup_locked(so); @@ -3250,15 +3826,21 @@ do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; - DPRINTF("do_wr_ack\n"); - dump_toepcb(toep); - VALIDATE_SOCK(so); wr_ack(toep, m); return 0; } +/* + * Handler for TRACE_PKT CPL messages. Just sink these packets. 
+ */ +static int +do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + m_freem(m); + return 0; +} /* * Reset a connection that is on a listener's SYN queue or accept queue, @@ -3320,6 +3902,336 @@ t3_reset_synq(struct listen_ctx *lctx) SOCK_UNLOCK(lctx->lso); } + +int +t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color) +{ + unsigned int i, j, pidx; + struct pagepod *p; + struct mbuf *m; + struct ulp_mem_io *req; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + unsigned int tid = toep->tp_tid; + const struct tom_data *td = TOM_DATA(TOE_DEV(so)); + unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + + CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", + gl, nppods, tag, maxoff, pg_off, color); + + for (i = 0; i < nppods; ++i) { + m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + req = mtod(m, struct ulp_mem_io *); + m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->wr.wr_lo = 0; + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | + V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); + + p = (struct pagepod *)(req + 1); + if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { + p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | + V_PPOD_COLOR(color)); + p->pp_max_offset = htonl(maxoff); + p->pp_page_offset = htonl(pg_off); + p->pp_rsvd = 0; + for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) + p->pp_addr[j] = pidx < gl->dgl_nelem ? 
+ htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; + } else + p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ + send_or_defer(toep, m, 0); + ppod_addr += PPOD_SIZE; + } + return (0); +} + +/* + * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_cpl_barrier_ulp(struct cpl_barrier *b) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); + b->opcode = CPL_BARRIER; +} + +/* + * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + txpkt = (struct ulp_txpkt *)req; + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); + req->cpuno = htons(cpuno); +} + +/* + * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, + unsigned int word, uint64_t mask, uint64_t val) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + tid, word, mask, val); + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(1); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); +} + +/* + * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
+ */ +static void +mk_rx_data_ack_ulp(struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); + OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); + ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(1) | V_RX_CREDITS(credits)); +} + +void +t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_barrier *lock; + struct cpl_set_tcb_field *req; + struct cpl_get_tcb *getreq; + struct ddp_state *p = &toep->tp_ddp_state; + + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); + wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + + sizeof(*getreq); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + lock = (struct cpl_barrier *)(wr + 1); + mk_cpl_barrier_ulp(lock); + + req = (struct cpl_set_tcb_field *)(lock + 1); + + CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); + + /* Hmmm, not sure if this actually a good thing: reactivating + * the other buffer might be an issue if it has been completed + * already. However, that is unlikely, since the fact that the UBUF + * is not completed indicates that there is no oustanding data. 
+ */ + if (bufidx == 0) + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_ACTIVE_BUF(1) | + V_TF_DDP_BUF0_VALID(1), + V_TF_DDP_ACTIVE_BUF(1)); + else + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_ACTIVE_BUF(1) | + V_TF_DDP_BUF1_VALID(1), 0); + + getreq = (struct cpl_get_tcb *)(req + 1); + mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); + + mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); + + /* Keep track of the number of oustanding CPL_GET_TCB requests + */ + p->get_tcb_count++; + +#ifdef T3_TRACE + T3_TRACE1(TIDTB(so), + "t3_cancel_ddpbuf: bufidx %u", bufidx); +#endif + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +/** + * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one + * @sk: the socket associated with the buffers + * @bufidx: index of HW DDP buffer (0 or 1) + * @tag0: new tag for HW buffer 0 + * @tag1: new tag for HW buffer 1 + * @len: new length for HW buf @bufidx + * + * Sends a compound WR to overlay a new DDP buffer on top of an existing + * buffer by changing the buffer tag and length and setting the valid and + * active flag accordingly. The caller must ensure the new buffer is at + * least as big as the existing one. Since we typically reprogram both HW + * buffers this function sets both tags for convenience. Read the TCB to + * determine how made data was written into the buffer before the overlay + * took place. 
+ */ +void +t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, + unsigned int tag1, unsigned int len) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_get_tcb *getreq; + struct cpl_set_tcb_field *req; + struct ddp_state *p = &toep->tp_ddp_state; + + CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", + bufidx, tag0, tag1, len); + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); + wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + m->m_pkthdr.len = m->m_len = wrlen; + bzero(wr, wrlen); + + + /* Set the ATOMIC flag to make sure that TP processes the following + * CPLs in an atomic manner and no wire segments can be interleaved. + */ + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); + req = (struct cpl_set_tcb_field *)(wr + 1); + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | + V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, + V_TCB_RX_DDP_BUF0_TAG(tag0) | + V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); + req++; + if (bufidx == 0) { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_0(1) | + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_0(0) | + V_TF_DDP_BUF0_VALID(1)); + } else { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_1(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_1(0) | + V_TF_DDP_BUF1_VALID(1) | 
V_TF_DDP_ACTIVE_BUF(1)); + } + + getreq = (struct cpl_get_tcb *)(req + 1); + mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); + + /* Keep track of the number of oustanding CPL_GET_TCB requests + */ + p->get_tcb_count++; + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(sk), + "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " + "len %d", + bufidx, tag0, tag1, len); +#endif + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +/* + * Sends a compound WR containing all the CPL messages needed to program the + * two HW DDP buffers, namely optionally setting up the length and offset of + * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. + */ +void +t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_set_tcb_field *req; + + CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", + len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); + + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); + wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + + (len1 ? sizeof(*req) : 0) + + (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + req = (struct cpl_set_tcb_field *)(wr + 1); + if (len0) { /* program buffer 0 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); + req++; + } + if (len1) { /* program buffer 1 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); + req++; + } + + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, + ddp_flags); + + if (modulate) { + mk_rx_data_ack_ulp((struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, + toep->tp_copied_seq - toep->tp_rcv_wup); + toep->tp_rcv_wup = toep->tp_copied_seq; + } + +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " + "modulate %d", + len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, + modulate); +#endif + + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + void t3_init_wr_tab(unsigned int wr_len) { @@ -3353,7 +4265,6 @@ t3_init_cpl_io(void) tcphdr_skb->h.raw = tcphdr_skb->data; memset(tcphdr_skb->data, 0, tcphdr_skb->len); #endif - t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); @@ -3367,11 +4278,9 @@ t3_init_cpl_io(void) t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 
t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); -#ifdef notyet t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); -#endif return (0); } diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c index a3dd692..6edeacd 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -38,14 +38,18 @@ __FBSDID("$FreeBSD$"); #include <sys/limits.h> #include <sys/lock.h> #include <sys/mbuf.h> +#include <sys/condvar.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/smp.h> #include <sys/socket.h> #include <sys/syslog.h> #include <sys/socketvar.h> #include <sys/uio.h> +#include <sys/file.h> #include <machine/bus.h> +#include <machine/cpu.h> #include <net/if.h> #include <net/route.h> @@ -56,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_var.h> +#include <dev/cxgb/cxgb_config.h> #include <dev/cxgb/cxgb_osdep.h> #include <dev/cxgb/sys/mbufq.h> @@ -72,6 +77,7 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/common/cxgb_ctl_defs.h> #include <dev/cxgb/cxgb_l2t.h> #include <dev/cxgb/cxgb_offload.h> + #include <vm/vm.h> #include <vm/vm_page.h> #include <vm/vm_map.h> @@ -85,6 +91,7 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> #include <dev/cxgb/ulp/tom/cxgb_toepcb.h> #include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, @@ -94,13 +101,11 @@ static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); -#ifdef notyet -#define VM_HOLD_WRITEABLE 0x1 -static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, - int *count, int flags); -#endif -static void 
vm_fault_unhold_pages(vm_page_t *m, int count); #define TMP_IOV_MAX 16 +#ifndef PG_FRAME +#define PG_FRAME ~PAGE_MASK +#endif +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) void t3_init_socket_ops(void) @@ -110,20 +115,8 @@ t3_init_socket_ops(void) prp = pffindtype(AF_INET, SOCK_STREAM); pru_sosend = prp->pr_usrreqs->pru_sosend; pru_soreceive = prp->pr_usrreqs->pru_soreceive; -#ifdef TCP_USRREQS_OVERLOAD - tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect; - tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; - tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen; - tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send; - tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; - tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect; - tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close; - tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown; - tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd; -#endif } - struct cxgb_dma_info { size_t cdi_mapped; int cdi_nsegs; @@ -182,21 +175,172 @@ iov_adj(struct iovec **iov, int *iovcnt, size_t count) } } - static void -cxgb_zero_copy_free(void *cl, void *arg) {} +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} + static int cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) { + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uintptr_t)iov->iov_base; + end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end 
&= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, npages); + + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; - return (EINVAL); + return (0); +} + +/* + * Returns whether a connection should enable DDP. This happens when all of + * the following conditions are met: + * - the connection's ULP mode is DDP + * - DDP is not already enabled + * - the last receive was above the DDP threshold + * - receive buffers are in user space + * - receive side isn't shutdown (handled by caller) + * - the connection's receive window is big enough so that sizable buffers + * can be posted without closing the window in the middle of DDP (checked + * when the connection is offloaded) + */ +static int +so_should_ddp(const struct toepcb *toep, int last_recv_len) +{ + + DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", + toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), + toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); + + return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && + last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && + toep->tp_tp->rcv_wnd > + (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); +} + +static inline int +is_ddp(const struct mbuf *m) +{ + return (m->m_flags & M_DDP); +} + +static inline int +is_ddp_psh(const struct mbuf *m) +{ + return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH); +} + +static int +m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + int curlen, startlen, resid_init, err = 0; + caddr_t buf; + + DPRINTF("m_uiomove(m=%p, offset=%d, 
len=%d, ...)\n", + m, offset, len); + + startlen = len; + resid_init = uio->uio_resid; + while (m && len) { + buf = mtod(m, caddr_t); + curlen = m->m_len; + if (offset && (offset < curlen)) { + curlen -= offset; + buf += offset; + offset = 0; + } else if (offset) { + offset -= curlen; + m = m->m_next; + continue; + } + err = uiomove(buf, min(len, curlen), uio); + if (err) { + printf("uiomove returned %d\n", err); + return (err); + } + + len -= min(len, curlen); + m = m->m_next; + } + DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", + startlen - len, resid_init, uio->uio_resid); + return (err); +} + +/* + * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the + * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a + * DDP buffer. + */ +static inline int +copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + struct iovec *to = uio->uio_iov; + int err; + + + if (__predict_true(!is_ddp(m))) { /* RX_DATA */ + return m_uiomove(m, offset, len, uio); + } if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ + to->iov_len -= len; + to->iov_base = ((caddr_t)to->iov_base) + len; + uio->uio_iov = to; + uio->uio_resid -= len; + return (0); + } + err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ + return (err); } static void -cxgb_wait_dma_completion(struct toepcb *tp) +cxgb_wait_dma_completion(struct toepcb *toep) { + struct mtx *lock; + lock = &toep->tp_tp->t_inpcb->inp_mtx; + INP_LOCK(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); } static int @@ -234,7 +378,13 @@ cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) mi_collapse_sge(mi, segs); *m = m0; - + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ if (cdi.cdi_mapped < uio->uio_resid) { uio->uio_resid -= cdi.cdi_mapped; } else @@ -305,10 +455,11 @@ sendmore: } uio->uio_resid -= m->m_pkthdr.len; 
sent += m->m_pkthdr.len; - sbappend_locked(&so->so_snd, m); + sbappend(&so->so_snd, m); t3_push_frames(so, TRUE); iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); } + /* * Wait for pending I/O to be DMA'd to the card * @@ -357,7 +508,7 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); - if ((uio->uio_resid > zcopy_thres) && + if (uio && (uio->uio_resid > zcopy_thres) && (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) { rv = t3_sosend(so, uio); @@ -368,36 +519,378 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, return pru_sosend(so, addr, uio, top, control, flags, td); } +/* + * Following replacement or removal of the first mbuf on the first mbuf chain + * of a socket buffer, push necessary state changes back into the socket + * buffer so that other consumers see the values consistently. 'nextrecord' + * is the callers locally stored value of the original value of + * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. + * NOTE: 'nextrecord' may be NULL. + */ +static __inline void +sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + + SOCKBUF_LOCK_ASSERT(sb); + /* + * First, update for the new value of nextrecord. If necessary, make + * it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect the new + * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the + * addition of a second clause that takes care of the case where + * sb_mb has been updated, but remains the last record. 
+ */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +#define IS_NONBLOCKING(so) ((so)->so_state & SS_NBIO) + static int -t3_soreceive(struct socket *so, struct uio *uio) +t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) { -#ifdef notyet - int i, rv, count, hold_resid, sent, iovcnt; - struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m; - struct uio uiotmp; + uint32_t offset; + int err, flags, avail, len, copied, copied_unacked; + int target; /* Read at least this many bytes */ + int user_ddp_ok; + struct ddp_state *p; + struct inpcb *inp = sotoinpcb(so); + + avail = offset = copied = copied_unacked = 0; + flags = flagsp ? (*flagsp &~ MSG_EOR) : 0; + err = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + p = &toep->tp_ddp_state; + + if (err) + return (err); + SOCKBUF_LOCK(&so->so_rcv); + p->user_ddp_pending = 0; +restart: + len = uio->uio_resid; + m = so->so_rcv.sb_mb; + target = (flags & MSG_WAITALL) ? 
len : so->so_rcv.sb_lowat; + user_ddp_ok = p->ubuf_ddp_ready; + p->cancel_ubuf = 0; + + if (len == 0) + goto done; +#if 0 + while (m && m->m_len == 0) { + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } +#endif + if (m) + goto got_mbuf; + + /* empty receive queue */ + if (copied >= target && (so->so_rcv.sb_mb == NULL) && + !p->user_ddp_pending) + goto done; + + if (copied) { + if (so->so_error || tp->t_state == TCPS_CLOSED || + (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) + goto done; + } else { + if (so->so_state & SS_NOFDREF) + goto done; + if (so->so_error) { + err = so->so_error; + so->so_error = 0; + goto done; + } + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + goto done; + if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) + goto done; + if (tp->t_state == TCPS_CLOSED) { + err = ENOTCONN; + goto done; + } + } + if (so->so_rcv.sb_mb && !p->user_ddp_pending) { + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + SOCKBUF_LOCK(&so->so_rcv); + copied_unacked = 0; + goto restart; + } + if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + } + if (p->kbuf[0] && (p->kbuf_posted == 0)) { + t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } + if (p->user_ddp_pending) { + /* One shot at DDP if we already have enough data */ + if (copied >= target) + user_ddp_ok = 0; + + DPRINTF("sbwaiting 1\n"); + if ((err = sbwait(&so->so_rcv)) != 0) + goto done; +//for timers to work await_ddp_completion(sk, flags, &timeo); + } else if (copied >= target) + goto done; + else { + if (copied_unacked) { + int i = 0; + + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + copied_unacked = 0; + if (mp_ncpus 
> 1) + while (i++ < 200 && so->so_rcv.sb_mb == NULL) + cpu_spinwait(); + SOCKBUF_LOCK(&so->so_rcv); + } + + if (so->so_rcv.sb_mb) + goto restart; + DPRINTF("sbwaiting 2 copied=%d target=%d avail=%d so=%p mb=%p cc=%d\n", copied, target, avail, so, + so->so_rcv.sb_mb, so->so_rcv.sb_cc); + if ((err = sbwait(&so->so_rcv)) != 0) + goto done; + } + goto restart; +got_mbuf: + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d", + m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); + if (m->m_pkthdr.len == 0) { + if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) + panic("empty mbuf and NOCOPY not set\n"); + CTR0(KTR_TOM, "ddp done notification"); + p->user_ddp_pending = 0; + sbdroprecord_locked(&so->so_rcv); + goto done; + } + + offset = toep->tp_copied_seq + copied_unacked - m->m_seq; + DPRINTF("m=%p copied_seq=0x%x copied_unacked=%d m_seq=0x%x offset=%d pktlen=%d is_ddp(m)=%d\n", + m, toep->tp_copied_seq, copied_unacked, m->m_seq, offset, m->m_pkthdr.len, !!is_ddp(m)); + + if (offset >= m->m_pkthdr.len) + panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x seq 0x%x " + "pktlen %d ddp flags 0x%x", offset, toep->tp_copied_seq + copied_unacked, m->m_seq, + m->m_pkthdr.len, m->m_ddp_flags); + + avail = m->m_pkthdr.len - offset; + if (len < avail) { + if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) + panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); + avail = len; + } + CTR4(KTR_TOM, "t3_soreceive: m_len=%u offset=%u len=%u m_seq=0%08x", m->m_pkthdr.len, offset, len, m->m_seq); + +#ifdef URGENT_DATA_SUPPORTED /* - * Events requiring iteration: - * - number of pages exceeds max hold pages for process or system - * - number of pages exceeds maximum sg 
entries for a single WR - * - * We're limited to holding 128 pages at once - and we're limited to - * 34 SG entries per work request, but each SG entry can be any number - * of contiguous pages - * + * Check if the data we are preparing to copy contains urgent + * data. Either stop short of urgent data or skip it if it's + * first and we are not delivering urgent data inline. + */ + if (__predict_false(toep->tp_urg_data)) { + uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; + + if (urg_offset < avail) { + if (urg_offset) { + /* stop short of the urgent data */ + avail = urg_offset; + } else if ((so->so_options & SO_OOBINLINE) == 0) { + /* First byte is urgent, skip */ + toep->tp_copied_seq++; + offset++; + avail--; + if (!avail) + goto skip_copy; + } + } + } +#endif + if (is_ddp_psh(m) || offset) { + user_ddp_ok = 0; +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); +#endif + } + + if (user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); + } else + DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", + user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, + p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); + + /* + * If MSG_TRUNC is specified the data is discarded. 
+ * XXX need to check pr_atomic */ + KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); + if (__predict_true(!(flags & MSG_TRUNC))) { + int resid = uio->uio_resid; + + SOCKBUF_UNLOCK(&so->so_rcv); + if ((err = copy_data(m, offset, avail, uio))) { + if (err) + err = EFAULT; + goto done_unlocked; + } + SOCKBUF_LOCK(&so->so_rcv); + if (avail != (resid - uio->uio_resid)) + printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", + avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); + } + + copied += avail; + copied_unacked += avail; + len -= avail; + +#ifdef URGENT_DATA_SUPPORTED +skip_copy: + if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) + tp->urg_data = 0; +#endif + /* + * If the buffer is fully consumed free it. If it's a DDP + * buffer also handle any events it indicates. + */ + if (avail + offset >= m->m_pkthdr.len) { + unsigned int fl = m->m_ddp_flags; + int exitnow, got_psh = 0, nomoredata = 0; + int count; + struct mbuf *nextrecord; + + if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { + if (is_ddp_psh(m) && p->user_ddp_pending) + got_psh = 1; + + if (fl & DDP_BF_NOCOPY) + p->user_ddp_pending = 0; + else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { + p->kbuf_posted--; + nomoredata = 1; + } else { + p->kbuf_posted--; + p->ubuf_ddp_ready = 1; + } + } - uiotmp = *uio; - iovcnt = uio->uio_iovcnt; - iov = uio->uio_iov; - sent = 0; - re; -#endif - return (0); + nextrecord = m->m_nextpkt; + count = m->m_pkthdr.len; + while (count > 0) { + count -= m->m_len; + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + sockbuf_pushsync(&so->so_rcv, nextrecord); +#if 0 + 
sbdrop_locked(&so->so_rcv, m->m_pkthdr.len); +#endif + exitnow = got_psh || nomoredata; + if ((so->so_rcv.sb_mb == NULL) && exitnow) + goto done; + if (copied_unacked > (so->so_rcv.sb_hiwat >> 2)) { + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + copied_unacked = 0; + SOCKBUF_LOCK(&so->so_rcv); + } + } + if (len > 0) + goto restart; + + done: + /* + * If we can still receive decide what to do in preparation for the + * next receive. Note that RCV_SHUTDOWN is set if the connection + * transitioned to CLOSE but not if it was in that state to begin with. + */ + if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { + if (p->user_ddp_pending) { + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_rcv); + user_ddp_ok = 0; + t3_cancel_ubuf(toep); + if (so->so_rcv.sb_mb) { + if (copied < 0) + copied = 0; + if (len > 0) + goto restart; + } + p->user_ddp_pending = 0; + } + if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), + "chelsio_recvmsg: about to exit, repost kbuf"); +#endif + + t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { + CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); + if (!t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so), + ddp_copy_limit), 0, IS_NONBLOCKING(so))) + p->kbuf_posted = 1; + } + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " + "kbuf_posted %d user_ddp_pending %u", + copied, len, buffers_freed, p ? 
p->kbuf_posted : -1, + p->user_ddp_pending); +#endif + SOCKBUF_UNLOCK(&so->so_rcv); +done_unlocked: + if (copied_unacked) { + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + } + sbunlock(&so->so_rcv); + + return (err); } static int @@ -405,9 +898,11 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct toedev *tdev; - int rv, zcopy_thres, zcopy_enabled; + int rv, zcopy_thres, zcopy_enabled, flags; struct tcpcb *tp = sototcpcb(so); + flags = flagsp ? *flagsp &~ MSG_EOR : 0; + /* * In order to use DMA direct from userspace the following * conditions must be met: @@ -421,150 +916,30 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, * - iovcnt is 1 * */ - if (tp->t_flags & TF_TOE) { + + if ((tp->t_flags & TF_TOE) && uio && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) + && (uio->uio_iovcnt == 1) && (mp0 == NULL)) { tdev = TOE_DEV(so); zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); zcopy_enabled = TOM_TUNABLE(tdev, ddp); if ((uio->uio_resid > zcopy_thres) && - (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0) + (uio->uio_iovcnt == 1) && zcopy_enabled) { - rv = t3_soreceive(so, uio); + rv = t3_soreceive(so, flagsp, uio); if (rv != EAGAIN) return (rv); - } - } - + else + printf("returned EAGAIN\n"); + } + } else if ((tp->t_flags & TF_TOE) && uio && mp0 == NULL) + printf("skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", + flags, uio->uio_iovcnt, so->so_rcv.sb_state); return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); } - void t3_install_socket_ops(struct socket *so) { so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; } - -/* - * This routine takes a user address range and does the following: - * - validate that the user has access to those pages (flags indicates read or write) - if not fail - * - validate that count is enough to hold 
range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - * - return number of pages in count - */ -#ifdef notyet -static int -vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags) -{ - - vm_offset_t start, va; - vm_paddr_t pa; - int pageslen, faults, rv; - - struct thread *td; - vm_map_t map; - pmap_t pmap; - vm_page_t m, *pages; - vm_prot_t prot; - - start = addr & ~PAGE_MASK; - pageslen = roundup2(addr + len, PAGE_SIZE); - if (*count < (pageslen >> PAGE_SHIFT)) - return (EFBIG); - - *count = pageslen >> PAGE_SHIFT; - /* - * Check that virtual address range is legal - * This check is somewhat bogus as on some architectures kernel - * and user do not share VA - however, it appears that all FreeBSD - * architectures define it - */ - if (addr + len > VM_MAXUSER_ADDRESS) - return (EFAULT); - - td = curthread; - map = &td->td_proc->p_vmspace->vm_map; - pmap = &td->td_proc->p_vmspace->vm_pmap; - pages = mp; - - prot = (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : VM_PROT_READ; - bzero(pages, sizeof(vm_page_t *) * (*count)); -retry: - - /* - * First optimistically assume that all pages are resident (and R/W if for write) - * if so just mark pages as held (and dirty if for write) and return - */ - vm_page_lock_queues(); - for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) { - /* - * Assure that we only hold the page once - */ - if (*pages == NULL) { - /* - * page queue mutex is recursable so this is OK - * it would be really nice if we had an unlocked version of this so - * we were only acquiring the pmap lock 1 time as opposed to potentially - * many dozens of times - */ - m = pmap_extract_and_hold(pmap, va, prot); - if (m == NULL) { - faults++; - continue; - } - *pages = m; - if (flags & VM_HOLD_WRITEABLE) - vm_page_dirty(m); - } - } - vm_page_unlock_queues(); - - if (faults == 0) - return (0); - /* - * Pages either have insufficient permissions or are not present - * trigger a fault where neccessary - * - */ - for (va = start; va < pageslen; va += PAGE_SIZE) { - m = NULL; - pa = pmap_extract(pmap, va); - rv = 0; - if (pa) - m = PHYS_TO_VM_PAGE(pa); - if (flags & VM_HOLD_WRITEABLE) { - if (m == NULL || (m->flags & PG_WRITEABLE) == 0) - rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); - } else if (m == NULL) - rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL); - if (rv) - goto error; - } - goto retry; - -error: - vm_page_lock_queues(); - for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++) - if (*pages) - vm_page_unhold(*pages); - vm_page_unlock_queues(); - return (EFAULT); -} -#endif - -static void -vm_fault_unhold_pages(vm_page_t *mp, int count) -{ - - KASSERT(count >= 0, ("negative count %d", count)); - vm_page_lock_queues(); - while (count--) { - vm_page_unhold(*mp); - mp++; - } - vm_page_unlock_queues(); -} - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c new file mode 100644 index 0000000..8bdcb65 --- /dev/null +++ 
b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -0,0 +1,735 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/socketvar.h> +#include <sys/uio.h> + +#include <machine/bus.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_l2t.h> +#include <dev/cxgb/cxgb_offload.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + +#define MAX_SCHEDULE_TIMEOUT 300 + +/* + * Return the # of page pods needed to accommodate a # of pages. 
+ */ +static inline unsigned int +pages2ppods(unsigned int pages) +{ + return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; +} + +/** + * t3_pin_pages - pin a user memory range and prepare it for DDP + * @addr - the starting address + * @len - the length of the range + * @newgl - contains the pages and physical addresses of the pinned range + * @gl - an existing gather list, may be %NULL + * + * Pins the pages in the user-space memory range [addr, addr + len) and + * maps them for DMA. Returns a gather list with the pinned pages and + * their physical addresses. If @gl is non NULL the pages it describes + * are compared against the pages for [addr, addr + len), and if the + * existing gather list already covers the range a new list is not + * allocated. Returns 0 on success, or a negative errno. On success if + * a new gather list was allocated it is returned in @newgl. + */ +static int +t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, vm_offset_t addr, + size_t len, struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl) +{ + int i = 0, err; + size_t pg_off; + unsigned int npages; + struct ddp_gather_list *p; + + /* + * XXX need x86 agnostic check + */ + if (addr + len > VM_MAXUSER_ADDRESS) + return (EFAULT); + + pg_off = addr & PAGE_MASK; + npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (p == NULL) + return (ENOMEM); + + err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); + if (err) + goto free_gl; + + if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && + gl->dgl_length >= len) { + for (i = 0; i < npages; i++) + if (p->dgl_pages[i] != gl->dgl_pages[i]) + goto different_gl; + err = 0; + goto unpin; + } + +different_gl: + p->dgl_length = len; + p->dgl_offset = pg_off; + p->dgl_nelem = npages; +#ifdef NEED_BUSDMA + p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, + PAGE_SIZE 
- pg_off, + PCI_DMA_FROMDEVICE) - pg_off; + for (i = 1; i < npages; ++i) + p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); +#endif + *newgl = p; + return (0); +unpin: + vm_fault_unhold_pages(p->dgl_pages, npages); + +free_gl: + + free(p, M_DEVBUF); + *newgl = NULL; + return (err); +} + +static void +unmap_ddp_gl(const struct ddp_gather_list *gl) +{ +#ifdef NEED_BUSDMA + int i; + + if (!gl->nelem) + return; + + pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset, + PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE); + for (i = 1; i < gl->nelem; ++i) + pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE, + PCI_DMA_FROMDEVICE); + +#endif +} + +static void +ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty) +{ + /* + * XXX mark pages as dirty before unholding + */ + vm_fault_unhold_pages(gl->dgl_pages, gl->dgl_nelem); +} + +void +t3_free_ddp_gl(struct ddp_gather_list *gl) +{ + unmap_ddp_gl(gl); + ddp_gl_free_pages(gl, 0); + free(gl, M_DEVBUF); +} + +/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ +#define MAX_PPODS 64U + +/* + * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in + * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we + * try to allocate enough page pods to accommodate the whole buffer, subject to + * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page + * pods before failing entirely. 
+ */ +static int +alloc_buf1_ppods(struct socket *so, struct ddp_state *p, + unsigned long addr, unsigned int len) +{ + int err, tag, npages, nppods; + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = min(pages2ppods(npages), MAX_PPODS); + nppods = roundup2(nppods, PPOD_CLUSTER_SIZE); + err = t3_alloc_ppods(d, nppods, &tag); + if (err && nppods > PPOD_CLUSTER_SIZE) { + nppods = PPOD_CLUSTER_SIZE; + err = t3_alloc_ppods(d, nppods, &tag); + } + if (err) + return (ENOMEM); + + p->ubuf_nppods = nppods; + p->ubuf_tag = tag; +#if NUM_DDP_KBUF == 1 + t3_set_ddp_tag(so, 1, tag << 6); +#endif + return (0); +} + +/* + * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush + * won't block indefinitely if there's nothing to place (which should be rare). + */ +#define UBUF_OFFSET 1 + +static __inline unsigned long +select_ddp_flags(const struct socket *so, int buf_idx, + int nonblock, int rcv_flags) +{ + if (buf_idx == 1) { + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_1(1); + if (nonblock) + return V_TF_DDP_BUF1_FLUSH(1); + + return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so), + ddp_push_wait)); + } + + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_0(1); + if (nonblock) + return V_TF_DDP_BUF0_FLUSH(1); + + return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait)); +} + +/* + * Reposts the kernel DDP buffer after it has been previously become full and + * invalidated. We just need to reset the offset and adjust the DDP flags. + * Conveniently, we can set the flags and the offset with a single message. + * Note that this function does not set the buffer length. Again conveniently + * our kernel buffer is of fixed size. 
If the length needs to be changed it + * needs to be done separately. + */ +static void +t3_repost_kbuf(struct socket *so, unsigned int bufidx, int modulate, + int activate, int nonblock) +{ + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + unsigned long flags; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset; + p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0; + p->buf_state[bufidx].gl = p->kbuf[bufidx]; + p->cur_buf = bufidx; + p->kbuf_idx = bufidx; + + flags = select_ddp_flags(so, bufidx, nonblock, 0); + if (!bufidx) + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF0_VALID(1), + V_TF_DDP_BUF0_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF0_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), modulate); + else + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF1_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), + V_TF_DDP_BUF1_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + modulate); + +} + +/** + * setup_uio_ppods - setup HW page pods for a user iovec + * @sk: the associated socket + * @uio: the uio + * @oft: additional bytes to map before the start of the buffer + * + * Pins a user iovec and sets up HW page pods for DDP into it. We allocate + * page pods for user buffers on the first call per socket. Afterwards we + * limit the buffer length to whatever the existing page pods can accommodate. + * Returns a negative error code or the length of the mapped buffer. + * + * The current implementation handles iovecs with only one entry. 
+ */ +static int +setup_uio_ppods(struct socket *so, const struct uio *uio, int oft, int *length) +{ + int err; + unsigned int len; + struct ddp_gather_list *gl = NULL; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + struct iovec *iov = uio->uio_iov; + vm_offset_t addr = (vm_offset_t)iov->iov_base - oft; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (__predict_false(p->ubuf_nppods == 0)) { + err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft); + if (err) + return (err); + } + + len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; + len -= addr & PAGE_MASK; + if (len > M_TCB_RX_DDP_BUF0_LEN) + len = M_TCB_RX_DDP_BUF0_LEN; + len = min(len, sototcpcb(so)->rcv_wnd - 32768); + len = min(len, iov->iov_len + oft); + + if (len <= p->kbuf[0]->dgl_length) { + printf("length too short\n"); + return (EINVAL); + } + + err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); + if (err) + return (err); + if (gl) { + if (p->ubuf) + t3_free_ddp_gl(p->ubuf); + p->ubuf = gl; + t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, + gl->dgl_offset, 0); + } + *length = len; + return (0); +} + +/* + * + */ +void +t3_cancel_ubuf(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int ubuf_pending = t3_ddp_ubuf_pending(toep); + struct socket *so = toeptoso(toep); + int err = 0, count=0; + + if (p->ubuf == NULL) + return; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + p->cancel_ubuf = 1; + while (ubuf_pending && !(so->so_rcv.sb_state & SBS_CANTRCVMORE)) { +#ifdef T3_TRACE + T3_TRACE3(TB(p), + "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", + p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->get_tcb_count); +#endif + CTR3(KTR_TOM, + "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", + p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->buf_state[1].flags & (DDP_BF_NOFLIP | 
DDP_BF_NOCOPY), + p->get_tcb_count); + if (p->get_tcb_count == 0) + t3_cancel_ddpbuf(toep, p->cur_buf); + else + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d so=%p SBS_CANTRCVMORE=%d", + err, p->get_tcb_count, so->so_rcv.sb_timeo, so, + !!(so->so_rcv.sb_state & SBS_CANTRCVMORE)); + + while (p->get_tcb_count && !(so->so_rcv.sb_state & SBS_CANTRCVMORE)) { + if (count & 0xfffffff) + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d so=%p count=%d", + err, p->get_tcb_count, so->so_rcv.sb_timeo, so, count); + count++; + err = sbwait(&so->so_rcv); + } + ubuf_pending = t3_ddp_ubuf_pending(toep); + } + p->cancel_ubuf = 0; +} + +#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \ + V_TF_DDP_PSH_NO_INVALIDATE1(1) | \ + V_TF_DDP_BUF1_FLUSH(1) | \ + V_TF_DDP_BUF0_FLUSH(1) | \ + V_TF_DDP_PUSH_DISABLE_1(1) | \ + V_TF_DDP_PUSH_DISABLE_0(1) | \ + V_TF_DDP_INDICATE_OUT(1)) + +/* + * Post a user buffer as an overlay on top of the current kernel buffer. + */ +int +t3_overlay_ubuf(struct socket *so, const struct uio *uio, + int nonblock, int rcv_flags, int modulate, int post_kbuf) +{ + int err, len, ubuf_idx; + unsigned long flags; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + + if (p->kbuf[0] == NULL) { + return (EINVAL); + } + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + err = setup_uio_ppods(so, uio, 0, &len); + if (err) { + return (err); + } + + ubuf_idx = p->kbuf_idx; + p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; + /* Use existing offset */ + /* Don't need to update .gl, user buffer isn't copied. */ + p->cur_buf = ubuf_idx; + + flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags); + + if (post_kbuf) { + struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; + + dbs->cur_offset = 0; + dbs->flags = 0; + dbs->gl = p->kbuf[ubuf_idx ^ 1]; + p->kbuf_idx ^= 1; + flags |= p->kbuf_idx ? 
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); + } + + if (ubuf_idx == 0) { + t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, + len); + t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0, + flags, + OVERLAY_MASK | flags, 1); + } else { + t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, + len); + t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0, + flags, + OVERLAY_MASK | flags, 1); + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " + " kbuf_idx %d", + p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); +#endif + CTR3(KTR_TOM, + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x", + p->ubuf_tag, flags, OVERLAY_MASK); + CTR3(KTR_TOM, + "t3_overlay_ubuf: ubuf_idx %d kbuf_idx %d post_kbuf %d", + ubuf_idx, p->kbuf_idx, post_kbuf); + + return (0); +} + +/* + * Clean up DDP state that needs to survive until socket close time, such as the + * DDP buffers. The buffers are already unmapped at this point as unmapping + * needs the PCI device and a socket may close long after the device is removed. + */ +void +t3_cleanup_ddp(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) + if (p->kbuf[idx]) { + ddp_gl_free_pages(p->kbuf[idx], 0); + free(p->kbuf[idx], M_DEVBUF); + } + if (p->ubuf) { + ddp_gl_free_pages(p->ubuf, 0); + free(p->ubuf, M_DEVBUF); + p->ubuf = NULL; + } + toep->tp_ulp_mode = 0; +} + +/* + * This is a companion to t3_cleanup_ddp() and releases the HW resources + * associated with a connection's DDP state, such as the page pods. + * It's called when HW is done with a connection. The rest of the state + * remains available until both HW and the app are done with the connection. 
+ */ +void +t3_release_ddp_resources(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + t3_free_ppods(d, p->kbuf_tag[idx], + p->kbuf_nppods[idx]); + unmap_ddp_gl(p->kbuf[idx]); + } + + if (p->ubuf_nppods) { + t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods); + p->ubuf_nppods = 0; + } + if (p->ubuf) + unmap_ddp_gl(p->ubuf); + +} + +void +t3_post_kbuf(struct socket *so, int modulate, int nonblock) +{ + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + + t3_set_ddp_tag(so, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6); + t3_set_ddp_buf(so, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length); + t3_repost_kbuf(so, p->cur_buf, modulate, 1, nonblock); +#ifdef T3_TRACE + T3_TRACE1(TIDTB(so), + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +#endif + CTR1(KTR_TOM, + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +} + +/* + * Prepare a socket for DDP. Must be called when the socket is known to be + * open. 
+ */ +int +t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall, int nonblock) +{ + int i, err = ENOMEM; + static vm_pindex_t color; + unsigned int nppods, kbuf_pages, idx = 0; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + + + if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN) + return (EINVAL); + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + + kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = pages2ppods(kbuf_pages); + + p->kbuf_noinval = !!waitall; + p->kbuf_tag[NUM_DDP_KBUF - 1] = -1; + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + p->kbuf[idx] = + malloc(sizeof (struct ddp_gather_list) + kbuf_pages * + sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO); + if (p->kbuf[idx] == NULL) + goto err; + err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]); + if (err) { + printf("t3_alloc_ppods failed err=%d\n", err); + goto err; + } + + p->kbuf_nppods[idx] = nppods; + p->kbuf[idx]->dgl_length = kbuf_size; + p->kbuf[idx]->dgl_offset = 0; + p->kbuf[idx]->dgl_nelem = kbuf_pages; + + for (i = 0; i < kbuf_pages; ++i) { + p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color, + VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (p->kbuf[idx]->dgl_pages[i] == NULL) { + p->kbuf[idx]->dgl_nelem = i; + printf("failed to allocate kbuf pages\n"); + goto err; + } + } +#ifdef NEED_BUSDMA + /* + * XXX we'll need this for VT-d or any platform with an iommu :-/ + * + */ + for (i = 0; i < kbuf_pages; ++i) + p->kbuf[idx]->phys_addr[i] = + pci_map_page(p->pdev, p->kbuf[idx]->pages[i], + 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); +#endif + t3_setup_ppods(so, p->kbuf[idx], nppods, p->kbuf_tag[idx], + p->kbuf[idx]->dgl_length, 0, 0); + } + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + + t3_set_ddp_tag(so, 0, p->kbuf_tag[0] << 6); + t3_set_ddp_buf(so, 0, 0, p->kbuf[0]->dgl_length); + t3_repost_kbuf(so, 0, 0, 1, nonblock); + + 
t3_set_rcv_coalesce_enable(so, + TOM_TUNABLE(TOE_DEV(so), ddp_rcvcoalesce)); + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(so), + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); +#endif + CTR4(KTR_TOM, + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); + DELAY(100000); + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + return (0); + +err: + t3_release_ddp_resources(toep); + t3_cleanup_ddp(toep); + return (err); +} + +int +t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len) +{ + int page_off, resid_init, err; + struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl; + + resid_init = uio->uio_resid; + + if (!gl->dgl_pages) + panic("pages not set\n"); + + offset += gl->dgl_offset + m->m_cur_offset; + page_off = offset & PAGE_MASK; + KASSERT(len <= gl->dgl_length, + ("len=%d > dgl_length=%d in ddp_copy\n", len, gl->dgl_length)); + + err = uiomove_fromphys(gl->dgl_pages, page_off, len, uio); + return (err); +} + + +/* + * Allocate n page pods. Returns -1 on failure or the page pod tag. 
+ */ +int +t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag) +{ + unsigned int i, j; + + if (__predict_false(!td->ppod_map)) { + printf("ppod_map not set\n"); + return (EINVAL); + } + + mtx_lock(&td->ppod_map_lock); + for (i = 0; i < td->nppods; ) { + + for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */ + if (td->ppod_map[i + j]) { + i = i + j + 1; + goto next; + } + memset(&td->ppod_map[i], 1, n); /* allocate range */ + mtx_unlock(&td->ppod_map_lock); + CTR2(KTR_TOM, + "t3_alloc_ppods: n=%u tag=%u", n, i); + *ptag = i; + return (0); + next: ; + } + mtx_unlock(&td->ppod_map_lock); + return (0); +} + +void +t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n) +{ + /* No need to take ppod_lock here */ + memset(&td->ppod_map[tag], 0, n); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h index 9077295..8989fd9 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_defs.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h @@ -40,6 +40,13 @@ $FreeBSD$ #define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket) #define sototoep(so) (sototcpcb((so))->t_toe) +#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__) + +#define KTR_TOM KTR_SPARE2 +#define KTR_TCB KTR_SPARE3 + +struct toepcb; struct listen_ctx; typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m); @@ -54,7 +61,8 @@ void t3_init_listen_cpl_handlers(void); int t3_init_cpl_io(void); void t3_init_wr_tab(unsigned int wr_len); uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail); -void t3_cleanup_rbuf(struct tcpcb *tp); +void t3_send_rx_modulate(struct toepcb *toep); +void t3_cleanup_rbuf(struct tcpcb *tp, int copied); void t3_init_socket_ops(void); void t3_install_socket_ops(struct socket *so); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c index a88b26e..acbad6f 100644 --- 
a/sys/dev/cxgb/ulp/tom/cxgb_listen.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -180,7 +180,6 @@ listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid) return p; } -#if 0 /* * Given a pointer to a listening socket return its server TID by consulting * the socket->stid map. Returns -1 if the socket is not in the map. @@ -191,16 +190,15 @@ listen_hash_find(struct tom_data *d, struct socket *so) int stid = -1, bucket = listen_hashfn(so); struct listen_info *p; - spin_lock(&d->listen_lock); + mtx_lock(&d->listen_lock); for (p = d->listen_hash_tab[bucket]; p; p = p->next) - if (p->sk == sk) { + if (p->so == so) { stid = p->stid; break; } - spin_unlock(&d->listen_lock); + mtx_unlock(&d->listen_lock); return stid; } -#endif /* * Delete the listen_info structure for a listening socket. Returns the server @@ -244,28 +242,24 @@ t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev) if (!TOM_TUNABLE(dev, activated)) return; - printf("start listen\n"); + if (listen_hash_find(d, so) != -1) + return; - ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT); + CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport)); + ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO); if (!ctx) return; ctx->tom_data = d; ctx->lso = so; - ctx->ulp_mode = 0; /* DDP if the default */ + ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) ? ULP_MODE_TCPDDP : 0; LIST_INIT(&ctx->synq_head); stid = cxgb_alloc_stid(d->cdev, d->client, ctx); if (stid < 0) goto free_ctx; -#ifdef notyet - /* - * XXX need to mark inpcb as referenced - */ - sock_hold(sk); -#endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) goto free_stid; diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h index 9fa42b5..e37c9b1 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007, Chelsio Inc. 
@@ -86,7 +85,6 @@ struct pagepod { #define M_PPOD_PGSZ 0x3 #define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) -struct pci_dev; #include <vm/vm.h> #include <vm/vm_page.h> #include <machine/bus.h> @@ -96,8 +94,7 @@ struct ddp_gather_list { unsigned int dgl_length; unsigned int dgl_offset; unsigned int dgl_nelem; - vm_page_t *dgl_pages; - bus_addr_t dgl_phys_addr[0]; + vm_page_t dgl_pages[0]; }; struct ddp_buf_state { @@ -107,7 +104,6 @@ struct ddp_buf_state { }; struct ddp_state { - struct pci_dev *pdev; struct ddp_buf_state buf_state[2]; /* per buffer state */ int cur_buf; unsigned short kbuf_noinval; @@ -119,6 +115,7 @@ struct ddp_state { int get_tcb_count; unsigned int kbuf_posted; int cancel_ubuf; + int user_ddp_pending; unsigned int kbuf_nppods[NUM_DDP_KBUF]; unsigned int kbuf_tag[NUM_DDP_KBUF]; struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */ @@ -132,54 +129,51 @@ enum { DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was completed with a segment having the PSH flag set */ + DDP_BF_NODATA = 1 << 4, /* buffer completed before filling */ }; -#ifdef notyet +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> + /* * Returns 1 if a UBUF DMA buffer might be active. */ -static inline int t3_ddp_ubuf_pending(struct sock *so) +static inline int +t3_ddp_ubuf_pending(struct toepcb *toep) { - struct tcp_sock *tp = tcp_sk(sk); - struct ddp_state *p = DDP_STATE(tp); + struct ddp_state *p = &toep->tp_ddp_state; /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP, * but DDP_STATE() is only valid if the connection actually enabled * DDP. 
*/ - if (!p) - return 0; + if (p->kbuf[0] == NULL) + return (0); return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)); } -#endif int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, unsigned int nppods, unsigned int tag, unsigned int maxoff, unsigned int pg_off, unsigned int color); -int t3_alloc_ppods(struct tom_data *td, unsigned int n); +int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag); void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); -void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl); -int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len, - struct ddp_gather_list **newgl, - const struct ddp_gather_list *gl); -int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to, - int len); +void t3_free_ddp_gl(struct ddp_gather_list *gl); +int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len); //void t3_repost_kbuf(struct socket *so, int modulate, int activate); -void t3_post_kbuf(struct socket *so, int modulate); -int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock, +void t3_post_kbuf(struct socket *so, int modulate, int nonblock); +int t3_post_ubuf(struct socket *so, const struct uio *uio, int nonblock, int rcv_flags, int modulate, int post_kbuf); -void t3_cancel_ubuf(struct socket *so); -int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock, - int rcv_flags, int modulate, int post_kbuf); -int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall); -void t3_cleanup_ddp(struct socket *so); +void t3_cancel_ubuf(struct toepcb *toep); +int t3_overlay_ubuf(struct socket *so, const struct uio *uio, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall, int nonblock); +void t3_cleanup_ddp(struct toepcb *toep); void 
t3_release_ddp_resources(struct toepcb *toep); -void t3_cancel_ddpbuf(struct socket *so, unsigned int bufidx); -void t3_overlay_ddpbuf(struct socket *so, unsigned int bufidx, unsigned int tag0, +void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx); +void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0, unsigned int tag1, unsigned int len); -void t3_setup_ddpbufs(struct socket *so, unsigned int len0, unsigned int offset0, +void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0, unsigned int len1, unsigned int offset1, uint64_t ddp_flags, uint64_t flag_mask, int modulate); #endif /* T3_DDP_H */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c deleted file mode 100644 index 2eca099..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c +++ /dev/null @@ -1,694 +0,0 @@ -/*- - * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_compat.h" -#include "opt_inet.h" -#include "opt_inet6.h" -#include "opt_ipsec.h" -#include "opt_mac.h" -#include "opt_tcpdebug.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/callout.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#ifdef INET6 -#include <sys/domain.h> -#endif -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> -#include <sys/random.h> - -#include <vm/uma.h> - -#include <net/route.h> -#include <net/if.h> - -#include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/ip.h> -#ifdef INET6 -#include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 -#include <netinet6/ip6_var.h> -#include <netinet6/scope6_var.h> -#include <netinet6/nd6.h> -#endif -#include <netinet/ip_icmp.h> -#include <netinet/tcp.h> -#include <netinet/tcp_fsm.h> -#include <netinet/tcp_seq.h> -#include <netinet/tcp_timer.h> -#include <netinet/tcp_var.h> -#include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> -#ifdef INET6 -#include <netinet6/tcp6_var.h> -#endif -#include <netinet/tcpip.h> -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif 
-#include <netinet6/ip6protosw.h> - -#ifdef IPSEC -#include <netipsec/ipsec.h> -#include <netipsec/xform.h> -#ifdef INET6 -#include <netipsec/ipsec6.h> -#endif -#include <netipsec/key.h> -#endif /*IPSEC*/ - -#include <machine/in_cksum.h> -#include <sys/md5.h> - -#include <security/mac/mac_framework.h> - -#include <dev/cxgb/ulp/tom/cxgb_tcp.h> - - -SYSCTL_NODE(_net_inet_tcp, 0, cxgb, CTLFLAG_RW, 0, "chelsio TOE"); - -static int tcp_log_debug = 0; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, log_debug, CTLFLAG_RW, - &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); - -static int tcp_tcbhashsize = 0; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, - &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); - -static int do_tcpdrain = 1; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, do_tcpdrain, CTLFLAG_RW, - &do_tcpdrain, 0, - "Enable tcp_drain routine for extra help when low on mbufs"); - -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, pcbcount, CTLFLAG_RD, - &tcbinfo.ipi_count, 0, "Number of active PCBs"); - -static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, icmp_may_rst, CTLFLAG_RW, - &icmp_may_rst, 0, - "Certain ICMP unreachable messages may abort connections in SYN_SENT"); - -static int tcp_isn_reseed_interval = 0; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, - &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); - -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. 
- */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static int tcp_inflight_enable = 1; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); - -static int tcp_inflight_rttthresh; -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static int tcp_inflight_min = 6144; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); - -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); - -static int tcp_inflight_stab = 20; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); - -uma_zone_t sack_hole_zone; - -static struct inpcb *tcp_notify(struct inpcb *, int); -static struct inpcb *cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno); - -/* - * Target size of TCP PCB hash tables. Must be a power of two. - * - * Note that this can be overridden by the kernel environment - * variable net.inet.tcp.tcbhashsize - */ -#ifndef TCBHASHSIZE -#define TCBHASHSIZE 512 -#endif - -/* - * XXX - * Callouts should be moved into struct tcp directly. They are currently - * separate because the tcpcb structure is exported to userland for sysctl - * parsing purposes, which do not know about callouts. 
- */ -struct tcpcb_mem { - struct tcpcb tcb; - struct tcp_timer tt; -}; - -MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); - -/* - * Drop a TCP connection, reporting - * the specified error. If connection is synchronized, - * then send a RST to peer. - */ -struct tcpcb * -cxgb_tcp_drop(struct tcpcb *tp, int errno) -{ - struct socket *so = tp->t_inpcb->inp_socket; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); - - if (TCPS_HAVERCVDSYN(tp->t_state)) { - tp->t_state = TCPS_CLOSED; - (void) tcp_gen_reset(tp); - tcpstat.tcps_drops++; - } else - tcpstat.tcps_conndrops++; - if (errno == ETIMEDOUT && tp->t_softerror) - errno = tp->t_softerror; - so->so_error = errno; - return (cxgb_tcp_close(tp)); -} - -/* - * Attempt to close a TCP control block, marking it as dropped, and freeing - * the socket if we hold the only reference. - */ -struct tcpcb * -cxgb_tcp_close(struct tcpcb *tp) -{ - struct inpcb *inp = tp->t_inpcb; - struct socket *so; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if (tp->t_state == TCPS_LISTEN) - tcp_gen_listen_close(tp); - in_pcbdrop(inp); - tcpstat.tcps_closed++; - KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); - so = inp->inp_socket; - soisdisconnected(so); - if (inp->inp_vflag & INP_SOCKREF) { - KASSERT(so->so_state & SS_PROTOREF, - ("tcp_close: !SS_PROTOREF")); - inp->inp_vflag &= ~INP_SOCKREF; - INP_UNLOCK(inp); - ACCEPT_LOCK(); - SOCK_LOCK(so); - so->so_state &= ~SS_PROTOREF; - sofree(so); - return (NULL); - } - return (tp); -} - -/* - * Notify a tcp user of an asynchronous error; - * store error as soft error, but wake up user - * (for now, won't do anything until can select for soft error). - * - * Do not wake up user since there currently is no mechanism for - * reporting soft errors (yet - a kqueue filter may be added). 
- */ -static struct inpcb * -tcp_notify(struct inpcb *inp, int error) -{ - struct tcpcb *tp; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if ((inp->inp_vflag & INP_TIMEWAIT) || - (inp->inp_vflag & INP_DROPPED)) - return (inp); - - tp = intotcpcb(inp); - KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); - - /* - * Ignore some errors if we are hooked up. - * If connection hasn't completed, has retransmitted several times, - * and receives a second error, give up now. This is better - * than waiting a long time to establish a connection that - * can never complete. - */ - if (tp->t_state == TCPS_ESTABLISHED && - (error == EHOSTUNREACH || error == ENETUNREACH || - error == EHOSTDOWN)) { - return (inp); - } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && - tp->t_softerror) { - tp = cxgb_tcp_drop(tp, error); - if (tp != NULL) - return (inp); - else - return (NULL); - } else { - tp->t_softerror = error; - return (inp); - } -#if 0 - wakeup( &so->so_timeo); - sorwakeup(so); - sowwakeup(so); -#endif -} - -void -cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) -{ - struct ip *ip = vip; - struct tcphdr *th; - struct in_addr faddr; - struct inpcb *inp; - struct tcpcb *tp; - struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; - struct icmp *icp; - struct in_conninfo inc; - tcp_seq icmp_tcp_seq; - int mtu; - - faddr = ((struct sockaddr_in *)sa)->sin_addr; - if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) - return; - - if (cmd == PRC_MSGSIZE) - notify = tcp_mtudisc; - else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || - cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) - notify = cxgb_tcp_drop_syn_sent; - /* - * Redirects don't need to be handled up here. - */ - else if (PRC_IS_REDIRECT(cmd)) - return; - /* - * Source quench is depreciated. - */ - else if (cmd == PRC_QUENCH) - return; - /* - * Hostdead is ugly because it goes linearly through all PCBs. 
- * XXX: We never get this from ICMP, otherwise it makes an - * excellent DoS attack on machines with many connections. - */ - else if (cmd == PRC_HOSTDEAD) - ip = NULL; - else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) - return; - if (ip != NULL) { - icp = (struct icmp *)((caddr_t)ip - - offsetof(struct icmp, icmp_ip)); - th = (struct tcphdr *)((caddr_t)ip - + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&tcbinfo); - inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); - if (inp != NULL) { - INP_LOCK(inp); - if (!(inp->inp_vflag & INP_TIMEWAIT) && - !(inp->inp_vflag & INP_DROPPED) && - !(inp->inp_socket == NULL)) { - icmp_tcp_seq = htonl(th->th_seq); - tp = intotcpcb(inp); - if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && - SEQ_LT(icmp_tcp_seq, tp->snd_max)) { - if (cmd == PRC_MSGSIZE) { - /* - * MTU discovery: - * If we got a needfrag set the MTU - * in the route to the suggested new - * value (if given) and then notify. - */ - bzero(&inc, sizeof(inc)); - inc.inc_flags = 0; /* IPv4 */ - inc.inc_faddr = faddr; - - mtu = ntohs(icp->icmp_nextmtu); - /* - * If no alternative MTU was - * proposed, try the next smaller - * one. ip->ip_len has already - * been swapped in icmp_input(). - */ - if (!mtu) - mtu = ip_next_mtu(ip->ip_len, - 1); - if (mtu < max(296, (tcp_minmss) - + sizeof(struct tcpiphdr))) - mtu = 0; - if (!mtu) - mtu = tcp_mssdflt - + sizeof(struct tcpiphdr); - /* - * Only cache the the MTU if it - * is smaller than the interface - * or route MTU. tcp_mtudisc() - * will do right thing by itself. 
- */ - if (mtu <= tcp_maxmtu(&inc, NULL)) - tcp_hc_updatemtu(&inc, mtu); - } - - inp = (*notify)(inp, inetctlerrmap[cmd]); - } - } - if (inp != NULL) - INP_UNLOCK(inp); - } else { - inc.inc_fport = th->th_dport; - inc.inc_lport = th->th_sport; - inc.inc_faddr = faddr; - inc.inc_laddr = ip->ip_src; -#ifdef INET6 - inc.inc_isipv6 = 0; -#endif - syncache_unreach(&inc, th); - } - INP_INFO_WUNLOCK(&tcbinfo); - } else - in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); -} - -#ifdef INET6 -void -tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) -{ - struct tcphdr th; - struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; - struct ip6_hdr *ip6; - struct mbuf *m; - struct ip6ctlparam *ip6cp = NULL; - const struct sockaddr_in6 *sa6_src = NULL; - int off; - struct tcp_portonly { - u_int16_t th_sport; - u_int16_t th_dport; - } *thp; - - if (sa->sa_family != AF_INET6 || - sa->sa_len != sizeof(struct sockaddr_in6)) - return; - - if (cmd == PRC_MSGSIZE) - notify = tcp_mtudisc; - else if (!PRC_IS_REDIRECT(cmd) && - ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) - return; - /* Source quench is depreciated. */ - else if (cmd == PRC_QUENCH) - return; - - /* if the parameter is from icmp6, decode it. */ - if (d != NULL) { - ip6cp = (struct ip6ctlparam *)d; - m = ip6cp->ip6c_m; - ip6 = ip6cp->ip6c_ip6; - off = ip6cp->ip6c_off; - sa6_src = ip6cp->ip6c_src; - } else { - m = NULL; - ip6 = NULL; - off = 0; /* fool gcc */ - sa6_src = &sa6_any; - } - - if (ip6 != NULL) { - struct in_conninfo inc; - /* - * XXX: We assume that when IPV6 is non NULL, - * M and OFF are valid. 
- */ - - /* check if we can safely examine src and dst ports */ - if (m->m_pkthdr.len < off + sizeof(*thp)) - return; - - bzero(&th, sizeof(th)); - m_copydata(m, off, sizeof(*thp), (caddr_t)&th); - - in6_pcbnotify(&tcbinfo, sa, th.th_dport, - (struct sockaddr *)ip6cp->ip6c_src, - th.th_sport, cmd, NULL, notify); - - inc.inc_fport = th.th_dport; - inc.inc_lport = th.th_sport; - inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; - inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; - inc.inc_isipv6 = 1; - INP_INFO_WLOCK(&tcbinfo); - syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&tcbinfo); - } else - in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, - 0, cmd, NULL, notify); -} -#endif /* INET6 */ - - -/* - * Following is where TCP initial sequence number generation occurs. - * - * There are two places where we must use initial sequence numbers: - * 1. In SYN-ACK packets. - * 2. In SYN packets. - * - * All ISNs for SYN-ACK packets are generated by the syncache. See - * tcp_syncache.c for details. - * - * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling - * depends on this property. In addition, these ISNs should be - * unguessable so as to prevent connection hijacking. To satisfy - * the requirements of this situation, the algorithm outlined in - * RFC 1948 is used, with only small modifications. - * - * Implementation details: - * - * Time is based off the system timer, and is corrected so that it - * increases by one megabyte per second. This allows for proper - * recycling on high speed LANs while still leaving over an hour - * before rollover. - * - * As reading the *exact* system time is too expensive to be done - * whenever setting up a TCP connection, we increment the time - * offset in two ways. First, a small random positive increment - * is added to isn_offset for each connection that is set up. 
- * Second, the function tcp_isn_tick fires once per clock tick - * and increments isn_offset as necessary so that sequence numbers - * are incremented at approximately ISN_BYTES_PER_SECOND. The - * random positive increments serve only to ensure that the same - * exact sequence number is never sent out twice (as could otherwise - * happen when a port is recycled in less than the system tick - * interval.) - * - * net.inet.tcp.isn_reseed_interval controls the number of seconds - * between seeding of isn_secret. This is normally set to zero, - * as reseeding should not be necessary. - * - * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, - * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In - * general, this means holding an exclusive (write) lock. - */ - -#define ISN_BYTES_PER_SECOND 1048576 -#define ISN_STATIC_INCREMENT 4096 -#define ISN_RANDOM_INCREMENT (4096 - 1) - - -/* - * When a specific ICMP unreachable message is received and the - * connection state is SYN-SENT, drop the connection. This behavior - * is controlled by the icmp_may_rst sysctl. - */ -static struct inpcb * -cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno) -{ - struct tcpcb *tp; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if ((inp->inp_vflag & INP_TIMEWAIT) || - (inp->inp_vflag & INP_DROPPED)) - return (inp); - - tp = intotcpcb(inp); - if (tp->t_state != TCPS_SYN_SENT) - return (inp); - - tp = cxgb_tcp_drop(tp, errno); - if (tp != NULL) - return (inp); - else - return (NULL); -} - -static int -cxgb_sysctl_drop(SYSCTL_HANDLER_ARGS) -{ - /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ - struct sockaddr_storage addrs[2]; - struct inpcb *inp; - struct tcpcb *tp; - struct tcptw *tw; - struct sockaddr_in *fin, *lin; -#ifdef INET6 - struct sockaddr_in6 *fin6, *lin6; - struct in6_addr f6, l6; -#endif - int error; - - inp = NULL; - fin = lin = NULL; -#ifdef INET6 - fin6 = lin6 = NULL; -#endif - error = 0; - - if (req->oldptr != NULL || req->oldlen != 0) - return (EINVAL); - if (req->newptr == NULL) - return (EPERM); - if (req->newlen < sizeof(addrs)) - return (ENOMEM); - error = SYSCTL_IN(req, &addrs, sizeof(addrs)); - if (error) - return (error); - - switch (addrs[0].ss_family) { -#ifdef INET6 - case AF_INET6: - fin6 = (struct sockaddr_in6 *)&addrs[0]; - lin6 = (struct sockaddr_in6 *)&addrs[1]; - if (fin6->sin6_len != sizeof(struct sockaddr_in6) || - lin6->sin6_len != sizeof(struct sockaddr_in6)) - return (EINVAL); - if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { - if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) - return (EINVAL); - in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); - in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); - fin = (struct sockaddr_in *)&addrs[0]; - lin = (struct sockaddr_in *)&addrs[1]; - break; - } - error = sa6_embedscope(fin6, ip6_use_defzone); - if (error) - return (error); - error = sa6_embedscope(lin6, ip6_use_defzone); - if (error) - return (error); - break; -#endif - case AF_INET: - fin = (struct sockaddr_in *)&addrs[0]; - lin = (struct sockaddr_in *)&addrs[1]; - if (fin->sin_len != sizeof(struct sockaddr_in) || - lin->sin_len != sizeof(struct sockaddr_in)) - return (EINVAL); - break; - default: - return (EINVAL); - } - INP_INFO_WLOCK(&tcbinfo); - switch (addrs[0].ss_family) { -#ifdef INET6 - case AF_INET6: - inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, - &l6, lin6->sin6_port, 0, NULL); - break; -#endif - case AF_INET: - inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, - lin->sin_addr, lin->sin_port, 0, NULL); - break; - } - if (inp != NULL) { - INP_LOCK(inp); - if (inp->inp_vflag & 
INP_TIMEWAIT) { - /* - * XXXRW: There currently exists a state where an - * inpcb is present, but its timewait state has been - * discarded. For now, don't allow dropping of this - * type of inpcb. - */ - tw = intotw(inp); - if (tw != NULL) - tcp_twclose(tw, 0); - else - INP_UNLOCK(inp); - } else if (!(inp->inp_vflag & INP_DROPPED) && - !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { - tp = intotcpcb(inp); - tp = cxgb_tcp_drop(tp, ECONNABORTED); - if (tp != NULL) - INP_UNLOCK(inp); - } else - INP_UNLOCK(inp); - } else - error = ESRCH; - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -SYSCTL_PROC(_net_inet_tcp_cxgb, TCPCTL_DROP, drop, - CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, - 0, cxgb_sysctl_drop, "", "Drop TCP connection"); - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c deleted file mode 100644 index bd940b2..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c +++ /dev/null @@ -1,1362 +0,0 @@ -/*- - * Copyright (c) 1982, 1986, 1988, 1993 - * The Regents of the University of California. - * Copyright (c) 2006-2007 Robert N. M. Watson - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_ddb.h" -#include "opt_inet.h" -#include "opt_inet6.h" -#include "opt_tcpdebug.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/mbuf.h> -#ifdef INET6 -#include <sys/domain.h> -#endif /* INET6 */ -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> -#include <sys/proc.h> -#include <sys/jail.h> - -#ifdef DDB -#include <ddb/ddb.h> -#endif - -#include <net/if.h> -#include <net/route.h> - -#include <netinet/in.h> -#include <netinet/in_systm.h> -#ifdef INET6 -#include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 -#include <netinet6/ip6_var.h> -#include <netinet6/scope6_var.h> -#endif -#include <netinet/tcp.h> -#include <netinet/tcp_fsm.h> -#include <netinet/tcp_seq.h> -#include <netinet/tcp_timer.h> -#include <netinet/tcp_var.h> -#include <netinet/tcpip.h> -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif -#include 
<netinet/tcp_offload.h> -#include <dev/cxgb/ulp/tom/cxgb_tcp.h> - - -/* - * TCP protocol interface to socket abstraction. - */ -static int tcp_attach(struct socket *); -static int tcp_connect(struct tcpcb *, struct sockaddr *, - struct thread *td); -#ifdef INET6 -static int tcp6_connect(struct tcpcb *, struct sockaddr *, - struct thread *td); -#endif /* INET6 */ -static void tcp_disconnect(struct tcpcb *); -static void tcp_usrclosed(struct tcpcb *); - -#ifdef TCPDEBUG -#define TCPDEBUG0 int ostate = 0 -#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 -#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ - tcp_trace(TA_USER, ostate, tp, 0, 0, req) -#else -#define TCPDEBUG0 -#define TCPDEBUG1() -#define TCPDEBUG2(req) -#endif - -/* - * TCP attaches to socket via pru_attach(), reserving space, - * and an internet control block. - */ -static int -tcp_usr_attach(struct socket *so, int proto, struct thread *td) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - int error; - TCPDEBUG0; - - inp = sotoinpcb(so); - KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); - TCPDEBUG1(); - - error = tcp_attach(so); - if (error) - goto out; - - if ((so->so_options & SO_LINGER) && so->so_linger == 0) - so->so_linger = TCP_LINGERTIME; - - inp = sotoinpcb(so); - tp = intotcpcb(inp); -out: - TCPDEBUG2(PRU_ATTACH); - return error; -} - -/* - * tcp_detach is called when the socket layer loses its final reference - * to the socket, be it a file descriptor reference, a reference from TCP, - * etc. At this point, there is only one case in which we will keep around - * inpcb state: time wait. - * - * This function can probably be re-absorbed back into tcp_usr_detach() now - * that there is a single detach path. 
- */ -static void -tcp_detach(struct socket *so, struct inpcb *inp) -{ - struct tcpcb *tp; -#ifdef INET6 - int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; -#endif - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); - KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); - - tp = intotcpcb(inp); - - if (inp->inp_vflag & INP_TIMEWAIT) { - /* - * There are two cases to handle: one in which the time wait - * state is being discarded (INP_DROPPED), and one in which - * this connection will remain in timewait. In the former, - * it is time to discard all state (except tcptw, which has - * already been discarded by the timewait close code, which - * should be further up the call stack somewhere). In the - * latter case, we detach from the socket, but leave the pcb - * present until timewait ends. - * - * XXXRW: Would it be cleaner to free the tcptw here? - */ - if (inp->inp_vflag & INP_DROPPED) { - KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " - "INP_DROPPED && tp != NULL")); -#ifdef INET6 - if (isipv6) { - in6_pcbdetach(inp); - in6_pcbfree(inp); - } else { -#endif - in_pcbdetach(inp); - in_pcbfree(inp); -#ifdef INET6 - } -#endif - } else { -#ifdef INET6 - if (isipv6) - in6_pcbdetach(inp); - else -#endif - in_pcbdetach(inp); - INP_UNLOCK(inp); - } - } else { - /* - * If the connection is not in timewait, we consider two - * two conditions: one in which no further processing is - * necessary (dropped || embryonic), and one in which TCP is - * not yet done, but no longer requires the socket, so the - * pcb will persist for the time being. - * - * XXXRW: Does the second case still occur? 
- */ - if (inp->inp_vflag & INP_DROPPED || - tp->t_state < TCPS_SYN_SENT) { - tcp_discardcb(tp); -#ifdef INET6 - if (isipv6) { - in6_pcbdetach(inp); - in6_pcbfree(inp); - } else { -#endif - in_pcbdetach(inp); - in_pcbfree(inp); -#ifdef INET6 - } -#endif - } else { -#ifdef INET6 - if (isipv6) - in6_pcbdetach(inp); - else -#endif - in_pcbdetach(inp); - } - } -} - -/* - * pru_detach() detaches the TCP protocol from the socket. - * If the protocol state is non-embryonic, then can't - * do this directly: have to initiate a pru_disconnect(), - * which may finish later; embryonic TCB's can just - * be discarded here. - */ -static void -tcp_usr_detach(struct socket *so) -{ - struct inpcb *inp; - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); - KASSERT(inp->inp_socket != NULL, - ("tcp_usr_detach: inp_socket == NULL")); - tcp_detach(so, inp); - INP_INFO_WUNLOCK(&tcbinfo); -} - -/* - * Give the socket an address. - */ -static int -tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in *sinp; - - sinp = (struct sockaddr_in *)nam; - if (nam->sa_len != sizeof (*sinp)) - return (EINVAL); - /* - * Must check for multicast addresses and disallow binding - * to them. 
- */ - if (sinp->sin_family == AF_INET && - IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) - return (EAFNOSUPPORT); - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - error = in_pcbbind(inp, nam, td->td_ucred); -out: - TCPDEBUG2(PRU_BIND); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - - return (error); -} - -#ifdef INET6 -static int -tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; - - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) - return (EINVAL); - /* - * Must check for multicast addresses and disallow binding - * to them. - */ - if (sin6p->sin6_family == AF_INET6 && - IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) - return (EAFNOSUPPORT); - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - inp->inp_vflag &= ~INP_IPV4; - inp->inp_vflag |= INP_IPV6; - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { - if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) - inp->inp_vflag |= INP_IPV4; - else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { - struct sockaddr_in sin; - - in6_sin6_2_sin(&sin, sin6p); - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; - error = in_pcbbind(inp, (struct sockaddr *)&sin, - td->td_ucred); - goto out; - } - } - error = in6_pcbbind(inp, nam, td->td_ucred); -out: - TCPDEBUG2(PRU_BIND); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} -#endif /* INET6 */ - -/* - * Prepare to accept connections. 
- */ -static int -tcp_usr_listen(struct socket *so, int backlog, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - SOCK_LOCK(so); - error = solisten_proto_check(so); - if (error == 0 && inp->inp_lport == 0) - error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error == 0) { - tp->t_state = TCPS_LISTEN; - solisten_proto(so, backlog); - tcp_gen_listen_open(tp); - } - SOCK_UNLOCK(so); - -out: - TCPDEBUG2(PRU_LISTEN); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -#ifdef INET6 -static int -tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - SOCK_LOCK(so); - error = solisten_proto_check(so); - if (error == 0 && inp->inp_lport == 0) { - inp->inp_vflag &= ~INP_IPV4; - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) - inp->inp_vflag |= INP_IPV4; - error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - } - if (error == 0) { - tp->t_state = TCPS_LISTEN; - solisten_proto(so, backlog); - } - SOCK_UNLOCK(so); - -out: - TCPDEBUG2(PRU_LISTEN); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} -#endif /* INET6 */ - -/* - * Initiate connection to peer. - * Create a template for use in transmissions on this connection. - * Enter SYN_SENT state, and mark socket as connecting. - * Start keep-alive timer, and seed output sequence space. 
- * Send initial segment on connection. - */ -static int -tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in *sinp; - - sinp = (struct sockaddr_in *)nam; - if (nam->sa_len != sizeof (*sinp)) - return (EINVAL); - /* - * Must disallow TCP ``connections'' to multicast addresses. - */ - if (sinp->sin_family == AF_INET - && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) - return (EAFNOSUPPORT); - if (jailed(td->td_ucred)) - prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - if ((error = tcp_connect(tp, nam, td)) != 0) - goto out; - printf("calling tcp_gen_connect\n"); - - error = tcp_gen_connect(so, nam); -out: - TCPDEBUG2(PRU_CONNECT); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -#ifdef INET6 -static int -tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; - - TCPDEBUG0; - - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) - return (EINVAL); - /* - * Must disallow TCP ``connections'' to multicast addresses. 
- */ - if (sin6p->sin6_family == AF_INET6 - && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) - return (EAFNOSUPPORT); - - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { - struct sockaddr_in sin; - - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { - error = EINVAL; - goto out; - } - - in6_sin6_2_sin(&sin, sin6p); - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; - if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) - goto out; - error = tcp_gen_connect(so, nam); - goto out; - } - inp->inp_vflag &= ~INP_IPV4; - inp->inp_vflag |= INP_IPV6; - inp->inp_inc.inc_isipv6 = 1; - if ((error = tcp6_connect(tp, nam, td)) != 0) - goto out; - error = tcp_gen_connect(so, nam); - -out: - TCPDEBUG2(PRU_CONNECT); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} -#endif /* INET6 */ - -/* - * Initiate disconnect from peer. - * If connection never passed embryonic stage, just drop; - * else if don't need to let data drain, then can just drop anyways, - * else have to begin TCP shutdown process: mark socket disconnecting, - * drain unread data, state switch to reflect user close, and - * send segment (e.g. FIN) to peer. Socket will be really disconnected - * when peer sends FIN and acks ours. - * - * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 
- */ -static int -tcp_usr_disconnect(struct socket *so) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - int error = 0; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_disconnect(tp); -out: - TCPDEBUG2(PRU_DISCONNECT); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -/* - * Accept a connection. Essentially all the work is - * done at higher levels; just return the address - * of the peer, storing through addr. - */ -static int -tcp_usr_accept(struct socket *so, struct sockaddr **nam) -{ - int error = 0; - struct inpcb *inp = NULL; - struct tcpcb *tp = NULL; - struct in_addr addr; - in_port_t port = 0; - TCPDEBUG0; - - if (so->so_state & SS_ISDISCONNECTED) - return (ECONNABORTED); - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNABORTED; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - - /* - * We inline in_getpeeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. 
- */ - port = inp->inp_fport; - addr = inp->inp_faddr; - -out: - TCPDEBUG2(PRU_ACCEPT); - INP_UNLOCK(inp); - if (error == 0) - *nam = in_sockaddr(port, &addr); - return error; -} - -#ifdef INET6 -static int -tcp6_usr_accept(struct socket *so, struct sockaddr **nam) -{ - struct inpcb *inp = NULL; - int error = 0; - struct tcpcb *tp = NULL; - struct in_addr addr; - struct in6_addr addr6; - in_port_t port = 0; - int v4 = 0; - TCPDEBUG0; - - if (so->so_state & SS_ISDISCONNECTED) - return (ECONNABORTED); - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNABORTED; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - - /* - * We inline in6_mapped_peeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. - */ - if (inp->inp_vflag & INP_IPV4) { - v4 = 1; - port = inp->inp_fport; - addr = inp->inp_faddr; - } else { - port = inp->inp_fport; - addr6 = inp->in6p_faddr; - } - -out: - TCPDEBUG2(PRU_ACCEPT); - INP_UNLOCK(inp); - if (error == 0) { - if (v4) - *nam = in6_v4mapsin6_sockaddr(port, &addr); - else - *nam = in6_sockaddr(port, &addr6); - } - return error; -} -#endif /* INET6 */ - -/* - * Mark the connection as being incapable of further output. - */ -static int -tcp_usr_shutdown(struct socket *so) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - socantsendmore(so); - tcp_usrclosed(tp); - error = tcp_gen_disconnect(tp); - -out: - TCPDEBUG2(PRU_SHUTDOWN); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - - return (error); -} - -/* - * After a receive, possibly send window update to peer. 
- */ -static int -tcp_usr_rcvd(struct socket *so, int flags) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - int error = 0; - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_gen_rcvd(tp); - -out: - TCPDEBUG2(PRU_RCVD); - INP_UNLOCK(inp); - return (error); -} - -/* - * Do a send by putting data in output queue and updating urgent - * marker if URG set. Possibly send more data. Unlike the other - * pru_*() routines, the mbuf chains are our responsibility. We - * must either enqueue them or free them. The other pru_* routines - * generally are caller-frees. - */ -static int -tcp_usr_send(struct socket *so, int flags, struct mbuf *m, - struct sockaddr *nam, struct mbuf *control, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - int headlocked = 0; -#ifdef INET6 - int isipv6; -#endif - TCPDEBUG0; - - /* - * We require the pcbinfo lock in two cases: - * - * (1) An implied connect is taking place, which can result in - * binding IPs and ports and hence modification of the pcb hash - * chains. - * - * (2) PRUS_EOF is set, resulting in explicit close on the send. 
- */ - if ((nam != NULL) || (flags & PRUS_EOF)) { - INP_INFO_WLOCK(&tcbinfo); - headlocked = 1; - } - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - if (control) - m_freem(control); - if (m) - m_freem(m); - error = ECONNRESET; - goto out; - } -#ifdef INET6 - isipv6 = nam && nam->sa_family == AF_INET6; -#endif /* INET6 */ - tp = intotcpcb(inp); - TCPDEBUG1(); - if (control) { - /* TCP doesn't do control messages (rights, creds, etc) */ - if (control->m_len) { - m_freem(control); - if (m) - m_freem(m); - error = EINVAL; - goto out; - } - m_freem(control); /* empty control, just free it */ - } - if (!(flags & PRUS_OOB)) { - sbappendstream(&so->so_snd, m); - if (nam && tp->t_state < TCPS_SYN_SENT) { - /* - * Do implied connect if not yet connected, - * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. - */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); -#ifdef INET6 - if (isipv6) - error = tcp6_connect(tp, nam, td); - else -#endif /* INET6 */ - error = tcp_connect(tp, nam, td); - if (error) - goto out; - tp->snd_wnd = TTCP_CLIENT_SND_WND; - tcp_mss(tp, -1); - } - if (flags & PRUS_EOF) { - /* - * Close the send side of the connection after - * the data is sent. - */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); - socantsendmore(so); - tcp_usrclosed(tp); - } - if (headlocked) { - INP_INFO_WUNLOCK(&tcbinfo); - headlocked = 0; - } - if (tp != NULL) { - if (flags & PRUS_MORETOCOME) - tp->t_flags |= TF_MORETOCOME; - error = tcp_gen_send(tp); - if (flags & PRUS_MORETOCOME) - tp->t_flags &= ~TF_MORETOCOME; - } - } else { - /* - * XXXRW: PRUS_EOF not implemented with PRUS_OOB? - */ - SOCKBUF_LOCK(&so->so_snd); - if (sbspace(&so->so_snd) < -512) { - SOCKBUF_UNLOCK(&so->so_snd); - m_freem(m); - error = ENOBUFS; - goto out; - } - /* - * According to RFC961 (Assigned Protocols), - * the urgent pointer points to the last octet - * of urgent data. 
We continue, however, - * to consider it to indicate the first octet - * of data past the urgent section. - * Otherwise, snd_up should be one lower. - */ - sbappendstream_locked(&so->so_snd, m); - SOCKBUF_UNLOCK(&so->so_snd); - if (nam && tp->t_state < TCPS_SYN_SENT) { - /* - * Do implied connect if not yet connected, - * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. - */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); -#ifdef INET6 - if (isipv6) - error = tcp6_connect(tp, nam, td); - else -#endif /* INET6 */ - error = tcp_connect(tp, nam, td); - if (error) - goto out; - tp->snd_wnd = TTCP_CLIENT_SND_WND; - tcp_mss(tp, -1); - INP_INFO_WUNLOCK(&tcbinfo); - headlocked = 0; - } else if (nam) { - INP_INFO_WUNLOCK(&tcbinfo); - headlocked = 0; - } - tp->snd_up = tp->snd_una + so->so_snd.sb_cc; - tp->t_flags |= TF_FORCEDATA; - error = tcp_gen_send(tp); - tp->t_flags &= ~TF_FORCEDATA; - } -out: - TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : - ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); - INP_UNLOCK(inp); - if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -/* - * Abort the TCP. Drop the connection abruptly. - */ -static void -tcp_usr_abort(struct socket *so) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - TCPDEBUG0; - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); - KASSERT(inp->inp_socket != NULL, - ("tcp_usr_abort: inp_socket == NULL")); - - /* - * If we still have full TCP state, and we're not dropped, drop. - */ - if (!(inp->inp_vflag & INP_TIMEWAIT) && - !(inp->inp_vflag & INP_DROPPED)) { - tp = intotcpcb(inp); - TCPDEBUG1(); - cxgb_tcp_drop(tp, ECONNABORTED); - TCPDEBUG2(PRU_ABORT); - } - if (!(inp->inp_vflag & INP_DROPPED)) { - SOCK_LOCK(so); - so->so_state |= SS_PROTOREF; - SOCK_UNLOCK(so); - inp->inp_vflag |= INP_SOCKREF; - } - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); -} - -/* - * TCP socket is closed. 
Start friendly disconnect. - */ -static void -tcp_usr_close(struct socket *so) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - TCPDEBUG0; - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); - KASSERT(inp->inp_socket != NULL, - ("tcp_usr_close: inp_socket == NULL")); - - /* - * If we still have full TCP state, and we're not dropped, initiate - * a disconnect. - */ - if (!(inp->inp_vflag & INP_TIMEWAIT) && - !(inp->inp_vflag & INP_DROPPED)) { - tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_disconnect(tp); - TCPDEBUG2(PRU_CLOSE); - } - if (!(inp->inp_vflag & INP_DROPPED)) { - SOCK_LOCK(so); - so->so_state |= SS_PROTOREF; - SOCK_UNLOCK(so); - inp->inp_vflag |= INP_SOCKREF; - } - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); -} - -/* - * Receive out-of-band data. - */ -static int -tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - if ((so->so_oobmark == 0 && - (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || - so->so_options & SO_OOBINLINE || - tp->t_oobflags & TCPOOB_HADDATA) { - error = EINVAL; - goto out; - } - if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { - error = EWOULDBLOCK; - goto out; - } - m->m_len = 1; - *mtod(m, caddr_t) = tp->t_iobc; - if ((flags & MSG_PEEK) == 0) - tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); - -out: - TCPDEBUG2(PRU_RCVOOB); - INP_UNLOCK(inp); - return (error); -} - -struct pr_usrreqs cxgb_tcp_usrreqs = { - .pru_abort = tcp_usr_abort, - .pru_accept = tcp_usr_accept, - .pru_attach = tcp_usr_attach, - .pru_bind = tcp_usr_bind, - .pru_connect = tcp_usr_connect, - .pru_control = in_control, - .pru_detach = tcp_usr_detach, - .pru_disconnect = 
tcp_usr_disconnect, - .pru_listen = tcp_usr_listen, - .pru_peeraddr = in_getpeeraddr, - .pru_rcvd = tcp_usr_rcvd, - .pru_rcvoob = tcp_usr_rcvoob, - .pru_send = tcp_usr_send, - .pru_shutdown = tcp_usr_shutdown, - .pru_sockaddr = in_getsockaddr, - .pru_sosetlabel = in_pcbsosetlabel, - .pru_close = tcp_usr_close, -}; - -#ifdef INET6 -struct pr_usrreqs cxgb_tcp6_usrreqs = { - .pru_abort = tcp_usr_abort, - .pru_accept = tcp6_usr_accept, - .pru_attach = tcp_usr_attach, - .pru_bind = tcp6_usr_bind, - .pru_connect = tcp6_usr_connect, - .pru_control = in6_control, - .pru_detach = tcp_usr_detach, - .pru_disconnect = tcp_usr_disconnect, - .pru_listen = tcp6_usr_listen, - .pru_peeraddr = in6_mapped_peeraddr, - .pru_rcvd = tcp_usr_rcvd, - .pru_rcvoob = tcp_usr_rcvoob, - .pru_send = tcp_usr_send, - .pru_shutdown = tcp_usr_shutdown, - .pru_sockaddr = in6_mapped_sockaddr, - .pru_sosetlabel = in_pcbsosetlabel, - .pru_close = tcp_usr_close, -}; -#endif /* INET6 */ - -/* - * Common subroutine to open a TCP connection to remote host specified - * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local - * port number if needed. Call in_pcbconnect_setup to do the routing and - * to choose a local host address (interface). If there is an existing - * incarnation of the same connection in TIME-WAIT state and if the remote - * host was sending CC options and if the connection duration was < MSL, then - * truncate the previous TIME-WAIT state and proceed. - * Initialize connection parameters and enter SYN-SENT state. 
- */ -static int -tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) -{ - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct in_addr laddr; - u_short lport; - int error; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if (inp->inp_lport == 0) { - error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error) - return error; - } - - /* - * Cannot simply call in_pcbconnect, because there might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - */ - laddr = inp->inp_laddr; - lport = inp->inp_lport; - error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, - &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); - if (error && oinp == NULL) - return error; - if (oinp) - return EADDRINUSE; - inp->inp_laddr = laddr; - in_pcbrehash(inp); - - /* - * Compute window scaling to request: - * Scale to fit into sweet spot. See tcp_syncache.c. - * XXX: This should move to tcp_output(). 
- */ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < sb_max) - tp->request_r_scale++; - - soisconnecting(so); - tcpstat.tcps_connattempt++; - tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); - tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; - tcp_sendseqinit(tp); - - return 0; -} - -#ifdef INET6 -static int -tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) -{ - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; - struct in6_addr *addr6; - int error; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if (inp->inp_lport == 0) { - error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error) - return error; - } - - /* - * Cannot simply call in_pcbconnect, because there might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - * in6_pcbladdr() also handles scope zone IDs. - */ - error = in6_pcbladdr(inp, nam, &addr6); - if (error) - return error; - oinp = in6_pcblookup_hash(inp->inp_pcbinfo, - &sin6->sin6_addr, sin6->sin6_port, - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) - ? addr6 - : &inp->in6p_laddr, - inp->inp_lport, 0, NULL); - if (oinp) - return EADDRINUSE; - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) - inp->in6p_laddr = *addr6; - inp->in6p_faddr = sin6->sin6_addr; - inp->inp_fport = sin6->sin6_port; - /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ - inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; - if (inp->in6p_flags & IN6P_AUTOFLOWLABEL) - inp->in6p_flowinfo |= - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - in_pcbrehash(inp); - - /* Compute window scaling to request. 
*/ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) - tp->request_r_scale++; - - soisconnecting(so); - tcpstat.tcps_connattempt++; - tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); - tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; - tcp_sendseqinit(tp); - - return 0; -} -#endif /* INET6 */ - -/* - * tcp_sendspace and tcp_recvspace are the default send and receive window - * sizes, respectively. These are obsolescent (this information should - * be set by the route). - */ -u_long tcp_sendspace = 1024*32; -SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, - &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); -u_long tcp_recvspace = 1024*64; -SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, - &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); - -/* - * Attach TCP protocol to socket, allocating - * internet protocol control block, tcp control block, - * bufer space, and entering LISTEN state if to accept connections. 
- */ -static int -tcp_attach(struct socket *so) -{ - struct tcpcb *tp; - struct inpcb *inp; - int error; -#ifdef INET6 - int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; -#endif - - if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, tcp_sendspace, tcp_recvspace); - if (error) - return (error); - } - so->so_rcv.sb_flags |= SB_AUTOSIZE; - so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_WLOCK(&tcbinfo); - error = in_pcballoc(so, &tcbinfo); - if (error) { - INP_INFO_WUNLOCK(&tcbinfo); - return (error); - } - inp = sotoinpcb(so); -#ifdef INET6 - if (isipv6) { - inp->inp_vflag |= INP_IPV6; - inp->in6p_hops = -1; /* use kernel default */ - } - else -#endif - inp->inp_vflag |= INP_IPV4; - tp = tcp_newtcpcb(inp); - if (tp == NULL) { -#ifdef INET6 - if (isipv6) { - in6_pcbdetach(inp); - in6_pcbfree(inp); - } else { -#endif - in_pcbdetach(inp); - in_pcbfree(inp); -#ifdef INET6 - } -#endif - INP_INFO_WUNLOCK(&tcbinfo); - return (ENOBUFS); - } - tp->t_state = TCPS_CLOSED; - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (0); -} - -/* - * Initiate (or continue) disconnect. - * If embryonic state, just send reset (once). - * If in ``let data drain'' option and linger null, just drop. - * Otherwise (hard), mark socket disconnecting and drop - * current input data; switch states based on user close, and - * send segment to peer (with FIN). - */ -static void -tcp_disconnect(struct tcpcb *tp) -{ - struct inpcb *inp = tp->t_inpcb; - struct socket *so = inp->inp_socket; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - /* - * Neither tcp_close() nor tcp_drop() should return NULL, as the - * socket is still open. 
- */ - if (tp->t_state < TCPS_ESTABLISHED) { - tp = cxgb_tcp_close(tp); - KASSERT(tp != NULL, - ("tcp_disconnect: tcp_close() returned NULL")); - } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { - tp = cxgb_tcp_drop(tp, 0); - KASSERT(tp != NULL, - ("tcp_disconnect: tcp_drop() returned NULL")); - } else { - soisdisconnecting(so); - sbflush(&so->so_rcv); - tcp_usrclosed(tp); - if (!(inp->inp_vflag & INP_DROPPED)) - tcp_gen_disconnect(tp); - } -} - -/* - * User issued close, and wish to trail through shutdown states: - * if never received SYN, just forget it. If got a SYN from peer, - * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. - * If already got a FIN from peer, then almost done; go to LAST_ACK - * state. In all other cases, have already sent FIN to peer (e.g. - * after PRU_SHUTDOWN), and just have to play tedious game waiting - * for peer to send FIN or not respond to keep-alives, etc. - * We can let the user exit from the close as soon as the FIN is acked. - */ -static void -tcp_usrclosed(struct tcpcb *tp) -{ - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); - - switch (tp->t_state) { - case TCPS_LISTEN: - tcp_gen_listen_close(tp); - case TCPS_CLOSED: - tp->t_state = TCPS_CLOSED; - tp = cxgb_tcp_close(tp); - /* - * tcp_close() should never return NULL here as the socket is - * still open. - */ - KASSERT(tp != NULL, - ("tcp_usrclosed: tcp_close() returned NULL")); - break; - - case TCPS_SYN_SENT: - case TCPS_SYN_RECEIVED: - tp->t_flags |= TF_NEEDFIN; - break; - - case TCPS_ESTABLISHED: - tp->t_state = TCPS_FIN_WAIT_1; - break; - - case TCPS_CLOSE_WAIT: - tp->t_state = TCPS_LAST_ACK; - break; - } - if (tp->t_state >= TCPS_FIN_WAIT_2) { - soisdisconnected(tp->t_inpcb->inp_socket); - /* Prevent the connection hanging in FIN_WAIT_2 forever. */ - if (tp->t_state == TCPS_FIN_WAIT_2) { - int timeout; - - timeout = (tcp_fast_finwait2_recycle) ? 
- tcp_finwait2_timeout : tcp_maxidle; - tcp_timer_activate(tp, TT_2MSL, timeout); - } - } -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h index a078bee..8a9c498 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h @@ -30,45 +30,49 @@ #ifndef CXGB_TOEPCB_H_ #define CXGB_TOEPCB_H_ #include <sys/bus.h> +#include <sys/condvar.h> #include <dev/cxgb/sys/mbufq.h> struct toepcb { - struct toedev *tp_toedev; - struct l2t_entry *tp_l2t; - pr_ctloutput_t *tp_ctloutput; - unsigned int tp_tid; - int tp_wr_max; - int tp_wr_avail; - int tp_wr_unacked; - int tp_delack_mode; - int tp_mtu_idx; - int tp_ulp_mode; - int tp_qset_idx; - int tp_mss_clamp; - int tp_qset; - int tp_flags; - int tp_enqueued_bytes; - int tp_page_count; - int tp_state; - - tcp_seq tp_iss; - tcp_seq tp_delack_seq; - tcp_seq tp_rcv_wup; - tcp_seq tp_copied_seq; - uint64_t tp_write_seq; - - volatile int tp_refcount; - vm_page_t *tp_pages; + struct toedev *tp_toedev; + struct l2t_entry *tp_l2t; + pr_ctloutput_t *tp_ctloutput; + unsigned int tp_tid; + int tp_wr_max; + int tp_wr_avail; + int tp_wr_unacked; + int tp_delack_mode; + int tp_mtu_idx; + int tp_ulp_mode; + int tp_qset_idx; + int tp_mss_clamp; + int tp_qset; + int tp_flags; + int tp_enqueued_bytes; + int tp_page_count; + int tp_state; + + tcp_seq tp_iss; + tcp_seq tp_delack_seq; + tcp_seq tp_rcv_wup; + tcp_seq tp_copied_seq; + uint64_t tp_write_seq; + + volatile int tp_refcount; + vm_page_t *tp_pages; - struct tcpcb *tp_tp; - struct mbuf *tp_m_last; - bus_dma_tag_t tp_tx_dmat; - bus_dmamap_t tp_dmamap; - - LIST_ENTRY(toepcb) synq_entry; - struct mbuf_head wr_list; - struct mbuf_head out_of_order_queue; - struct ddp_state tp_ddp_state; + struct tcpcb *tp_tp; + struct mbuf *tp_m_last; + bus_dma_tag_t tp_tx_dmat; + bus_dma_tag_t tp_rx_dmat; + bus_dmamap_t tp_dmamap; + + LIST_ENTRY(toepcb) synq_entry; + struct mbuf_head wr_list; + struct mbuf_head out_of_order_queue; + struct ddp_state 
tp_ddp_state; + struct cv tp_cv; + }; static inline void @@ -95,7 +99,7 @@ enqueue_wr(struct toepcb *toep, struct mbuf *m) } static inline struct mbuf * -peek_wr(struct toepcb *toep) +peek_wr(const struct toepcb *toep) { return (mbufq_peek(&toep->wr_list)); @@ -108,5 +112,10 @@ dequeue_wr(struct toepcb *toep) return (mbufq_dequeue(&toep->wr_list)); } +#define wr_queue_walk(toep, m) \ + for (m = peek_wr(toep); m; m = m->m_nextpkt) + + + #endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c index b5b87b7..4015cd3 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -34,11 +34,13 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/kernel.h> #include <sys/fcntl.h> +#include <sys/ktr.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/eventhandler.h> #include <sys/mbuf.h> #include <sys/module.h> +#include <sys/condvar.h> #include <sys/mutex.h> #include <sys/socket.h> #include <sys/sysctl.h> @@ -90,16 +92,20 @@ static TAILQ_HEAD(, tom_data) cxgb_list; static struct mtx cxgb_list_lock; static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry); +static void cxgb_register_listeners(void); + /* * Handlers for each CPL opcode */ -static cxgb_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS]; +static cxgb_cpl_handler_func tom_cpl_handlers[256]; + static eventhandler_tag listen_tag; static struct offload_id t3_toe_id_tab[] = { { TOE_ID_CHELSIO_T3, 0 }, { TOE_ID_CHELSIO_T3B, 0 }, + { TOE_ID_CHELSIO_T3C, 0 }, { 0 } }; @@ -138,7 +144,7 @@ toepcb_alloc(void) { struct toepcb *toep; - toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT); + toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT|M_ZERO); if (toep == NULL) return (NULL); @@ -150,8 +156,8 @@ toepcb_alloc(void) void toepcb_init(struct toepcb *toep) { - bzero(toep, sizeof(*toep)); toep->tp_refcount = 1; + cv_init(&toep->tp_cv, "toep cv"); } void @@ -164,12 +170,9 @@ void toepcb_release(struct toepcb *toep) { if 
(toep->tp_refcount == 1) { - printf("doing final toepcb free\n"); - free(toep, M_DEVBUF); return; } - atomic_add_acq_int(&toep->tp_refcount, -1); } @@ -179,13 +182,30 @@ toepcb_release(struct toepcb *toep) static void t3cdev_add(struct tom_data *t) { - printf("t3cdev_add\n"); - mtx_lock(&cxgb_list_lock); TAILQ_INSERT_TAIL(&cxgb_list, t, entry); mtx_unlock(&cxgb_list_lock); } +static inline int +cdev2type(struct t3cdev *cdev) +{ + int type = 0; + + switch (cdev->type) { + case T3A: + type = TOE_ID_CHELSIO_T3; + break; + case T3B: + type = TOE_ID_CHELSIO_T3B; + break; + case T3C: + type = TOE_ID_CHELSIO_T3C; + break; + } + return (type); +} + /* * Allocate a TOM data structure, * initialize its cpl_handlers @@ -200,11 +220,7 @@ t3c_tom_add(struct t3cdev *cdev) struct toedev *tdev; struct adap_ports *port_info; - printf("%s called\n", __FUNCTION__); - - t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); - if (t == NULL) return; @@ -224,8 +240,7 @@ t3c_tom_add(struct t3cdev *cdev) /* Register TCP offload device */ tdev = &t->tdev; - tdev->tod_ttid = (cdev->type == T3A ? 
- TOE_ID_CHELSIO_T3 : TOE_ID_CHELSIO_T3B); + tdev->tod_ttid = cdev2type(cdev); tdev->tod_lldev = cdev->lldev; if (register_toedev(tdev, "toe%d")) { @@ -234,13 +249,11 @@ t3c_tom_add(struct t3cdev *cdev) } TOM_DATA(tdev) = t; - printf("nports=%d\n", port_info->nports); for (i = 0; i < port_info->nports; i++) { struct ifnet *ifp = port_info->lldevs[i]; TOEDEV(ifp) = tdev; - printf("enabling toe on %p\n", ifp); - + CTR1(KTR_TOM, "enabling toe on %p", ifp); ifp->if_capabilities |= IFCAP_TOE4; ifp->if_capenable |= IFCAP_TOE4; } @@ -251,6 +264,7 @@ t3c_tom_add(struct t3cdev *cdev) /* Activate TCP offload device */ activate_offload(tdev); + cxgb_register_listeners(); return; out_free_all: @@ -269,8 +283,8 @@ static int do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) { log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name, - *mtod(m, unsigned int *)); - + 0xFF & *mtod(m, unsigned int *)); + kdb_backtrace(); return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); } @@ -282,7 +296,7 @@ do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h) { - if (opcode < NUM_CPL_CMDS) + if (opcode < 256) tom_cpl_handlers[opcode] = h ? 
h : do_bad_cpl; else log(LOG_ERR, "Chelsio T3 TOM: handler registration for " @@ -327,7 +341,7 @@ init_cpl_handlers(void) { int i; - for (i = 0; i < NUM_CPL_CMDS; ++i) + for (i = 0; i < 256; ++i) tom_cpl_handlers[i] = do_bad_cpl; t3_init_listen_cpl_handlers(); @@ -349,7 +363,7 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry) #endif t3_init_tunables(t); mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF); - + CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry); /* Adjust TOE activation for this module */ t->conf.activated = activated; @@ -374,19 +388,14 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry) t->ddp_ulimit = ddp.ulimit; t->pdev = ddp.pdev; t->rx_page_size = rx_page_info.page_size; -#ifdef notyet /* OK if this fails, we just can't do DDP */ t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE; - t->ppod_map = t3_alloc_mem(t->nppods); -#endif + t->ppod_map = malloc(t->nppods, M_DEVBUF, M_WAITOK|M_ZERO); -#if 0 - spin_lock_init(&t->ppod_map_lock); - tom_proc_init(dev); -#ifdef CONFIG_SYSCTL - t->sysctl = t3_sysctl_register(dev, &t->conf); -#endif -#endif + mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF); + + + t3_sysctl_register(cdev->adapter, &t->conf); return (0); } @@ -411,11 +420,8 @@ cxgb_toe_listen_stop(void *unused, struct tcpcb *tp) mtx_lock(&cxgb_list_lock); TAILQ_FOREACH(p, &cxgb_list, entry) { - if (tp->t_state == TCPS_LISTEN) { - printf("stopping listen on port=%d\n", - ntohs(tp->t_inpcb->inp_lport)); + if (tp->t_state == TCPS_LISTEN) t3_listen_stop(&p->tdev, so, p->cdev); - } } mtx_unlock(&cxgb_list_lock); } @@ -439,23 +445,12 @@ cxgb_register_listeners(void) static int t3_tom_init(void) { - -#if 0 - struct socket *sock; - err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - printk(KERN_ERR "Could not create TCP socket, error %d\n", err); - return err; - } - - t3_def_state_change = sock->sk->sk_state_change; - t3_def_data_ready = 
sock->sk->sk_data_ready; - t3_def_error_report = sock->sk->sk_error_report; - sock_release(sock); -#endif init_cpl_handlers(); - if (t3_init_cpl_io() < 0) + if (t3_init_cpl_io() < 0) { + log(LOG_ERR, + "Unable to initialize cpl io ops\n"); return -1; + } t3_init_socket_ops(); /* Register with the TOE device layer. */ @@ -466,7 +461,6 @@ t3_tom_init(void) return -1; } INP_INFO_WLOCK(&tcbinfo); - INP_INFO_WUNLOCK(&tcbinfo); mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); @@ -477,10 +471,8 @@ t3_tom_init(void) TAILQ_INIT(&cxgb_list); /* Register to offloading devices */ - printf("setting add to %p\n", t3c_tom_add); t3c_tom_client.add = t3c_tom_add; cxgb_register_client(&t3c_tom_client); - cxgb_register_listeners(); return (0); } @@ -491,8 +483,6 @@ t3_tom_load(module_t mod, int cmd, void *arg) switch (cmd) { case MOD_LOAD: - printf("wheeeeee ...\n"); - t3_tom_init(); break; case MOD_QUIESCE: diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h index 8d60bbd..bcda2c3 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h @@ -138,6 +138,8 @@ struct listen_ctx { void t3_init_tunables(struct tom_data *t); +void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p); + static __inline struct mbuf * m_gethdr_nofail(int len) { diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c index 7219922..b4ff748 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/common/cxgb_ctl_defs.h> #include <dev/cxgb/common/cxgb_t3_cpl.h> #include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/cxgb_include.h> #include <dev/cxgb/cxgb_l2t.h> #include <dev/cxgb/ulp/toecore/cxgb_toedev.h> #include <dev/cxgb/ulp/tom/cxgb_tom.h> @@ -82,7 +83,7 @@ static struct tom_tunables default_tunable_vals = { .delack = 1, .max_conn = -1, .soft_backlog_limit = 0, - .ddp = 0, + .ddp = 1, .ddp_thres 
= 14 * 4096, .ddp_copy_limit = 13 * 4096, .ddp_push_wait = 1, @@ -96,7 +97,8 @@ static struct tom_tunables default_tunable_vals = { .activated = 1, }; -void t3_init_tunables(struct tom_data *t) +void +t3_init_tunables(struct tom_data *t) { t->conf = default_tunable_vals; @@ -104,3 +106,15 @@ void t3_init_tunables(struct tom_data *t) t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk; t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs; } + +void +t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p) +{ + struct sysctl_ctx_list *ctx; + struct sysctl_oid_list *children; + + ctx = device_get_sysctl_ctx(sc->dev); + children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); + +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c new file mode 100644 index 0000000..7036005 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.c @@ -0,0 +1,180 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + +#define TRACE_ENTER printf("%s:%s entered", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited", __FUNCTION__, __FILE__, __LINE__) + +/* + * This routine takes a user address range and does the following: + * - validate that the user has access to those pages (flags indicates read or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + * - return number of pages in count + */ +int +vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) +{ + + vm_offset_t end, va; + vm_paddr_t pa; + int faults, rv; + + struct thread *td; + vm_map_t map; + pmap_t pmap; + vm_page_t m, *pages; + vm_prot_t prot; + + + /* + * Check that virtual 
address range is legal + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) { + printf("bad address passed\n"); + return (EFAULT); + } + + td = curthread; + map = &td->td_proc->p_vmspace->vm_map; + pmap = &td->td_proc->p_vmspace->vm_pmap; + pages = mp; + + prot = VM_PROT_READ; + prot |= (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : 0; + bzero(pages, sizeof(vm_page_t *) * count); +retry: + + /* + * First optimistically assume that all pages are resident (and R/W if for write) + * if so just mark pages as held (and dirty if for write) and return + */ + vm_page_lock_queues(); + for (pages = mp, faults = 0, va = addr; va < end; va += PAGE_SIZE, pages++) { + /* + * Assure that we only hold the page once + */ + if (*pages == NULL) { + /* + * page queue mutex is recursable so this is OK + * it would be really nice if we had an unlocked version of this so + * we were only acquiring the pmap lock 1 time as opposed to potentially + * many dozens of times + */ + m = pmap_extract_and_hold(pmap, va, prot); + if (m == NULL) { + faults++; + continue; + } + + *pages = m; + if (flags & VM_HOLD_WRITEABLE) + vm_page_dirty(m); + } + } + vm_page_unlock_queues(); + + if (faults == 0) { + return (0); + } + + /* + * Pages either have insufficient permissions or are not present + * trigger a fault where neccessary + * + */ + for (va = addr; va < end; va += PAGE_SIZE) { + m = NULL; + pa = pmap_extract(pmap, va); + rv = 0; + if (pa) + m = PHYS_TO_VM_PAGE(pa); + if (flags & VM_HOLD_WRITEABLE) { + if (m == NULL || (m->flags & PG_WRITEABLE) == 0) + rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); + } else if (m == NULL) + rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL); + if (rv) { + printf("vm_fault bad return rv=%d va=0x%zx\n", rv, va); + + goto error; + } + } + + goto retry; + +error: + 
vm_page_lock_queues(); + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) + if (*pages) + vm_page_unhold(*pages); + vm_page_unlock_queues(); + return (EFAULT); +} + +void +vm_fault_unhold_pages(vm_page_t *mp, int count) +{ + + KASSERT(count >= 0, ("negative count %d", count)); + vm_page_lock_queues(); + while (count--) { + vm_page_unhold(*mp); + mp++; + } + vm_page_unlock_queues(); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h new file mode 100644 index 0000000..29418b6 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.h @@ -0,0 +1,40 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_VM_H_ +#define CXGB_VM_H_ + +#define VM_HOLD_WRITEABLE 0x1 + +int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); +void vm_fault_unhold_pages(vm_page_t *mp, int count); + +#endif |