author    kmacy <kmacy@FreeBSD.org>  2007-12-16 05:27:26 +0000
committer kmacy <kmacy@FreeBSD.org>  2007-12-16 05:27:26 +0000
commit    f96fe5e169e8cfe06b070663cdaf7a637dfde154 (patch)
tree      4227e68976ae5d008757c5ec68446ae18267d17a /sys/dev/cxgb/ulp
parent    f04336e4cbede2676f151b37d96aacb1b14cb9b2 (diff)
download  FreeBSD-src-f96fe5e169e8cfe06b070663cdaf7a637dfde154.zip
          FreeBSD-src-f96fe5e169e8cfe06b070663cdaf7a637dfde154.tar.gz
Add driver for TCP offload
Sponsored by: Chelsio Inc.
Diffstat (limited to 'sys/dev/cxgb/ulp')
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c       3378
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c    560
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_defs.h           79
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_listen.c        345
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h        185
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_toepcb.h        112
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_tom.c           500
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_tom.h           157
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c    106
9 files changed, 5422 insertions, 0 deletions
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
new file mode 100644
index 0000000..0c796b5
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -0,0 +1,3378 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/priv.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_syncache.h>
+
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/bus.h>
+#include <dev/cxgb/sys/mvec.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+
+
+/*
+ * For ULP connections HW may add headers, e.g., for digests, that aren't part
+ * of the messages sent by the host but that are part of the TCP payload and
+ * therefore consume TCP sequence space. Tx connection parameters that
+ * operate in TCP sequence space are affected by the HW additions and need to
+ * compensate for them to accurately track TCP sequence numbers. This array
+ * contains the compensating extra lengths for ULP packets. It is indexed by
+ * a packet's ULP submode.
+ */
+const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
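+
+/*
+ * A minimal sketch, compiled out, of how the table above enters into
+ * sequence-space accounting: a ULP send occupies its payload length plus
+ * the extra length for its submode.  The helper name is hypothetical.
+ */
+#if 0
+static inline unsigned int
+ulp_seq_space(unsigned int payload_len, unsigned int ulp_submode)
+{
+	return (payload_len + t3_ulp_extra_len[ulp_submode & 3]);
+}
+#endif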
+
+#ifdef notyet
+/*
+ * This mbuf holds a fake header-only TCP segment that we use whenever we
+ * need to exploit SW TCP functionality that expects TCP headers, such as
+ * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
+ * CPUs without locking.
+ */
+static struct mbuf *tcphdr_mbuf __read_mostly;
+#endif
+
+/*
+ * Size of WRs in bytes. Note that we assume all devices we are handling have
+ * the same WR size.
+ */
+static unsigned int wrlen __read_mostly;
+
+/*
+ * The number of WRs needed for an mbuf chain depends on the number of page
+ * fragments it contains and whether it has any payload in its main body.
+ * This maps the length of the gather list represented by an mbuf chain into
+ * the # of necessary WRs.
+ */
+static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;
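+
+/*
+ * A hypothetical initialization fragment, compiled out: assuming each work
+ * request carries at most SGL_PER_WR gather-list entries (a placeholder
+ * constant, not the real limit), an i-entry gather list costs
+ * howmany(i, SGL_PER_WR) work requests.
+ */
+#if 0
+	mbuf_wrs[0] = 1;		/* header-only send */
+	for (i = 1; i < TX_MAX_SEGS; i++)
+		mbuf_wrs[i] = howmany(i, SGL_PER_WR);
+#endif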
+
+/*
+ * Max receive window supported by HW in bytes. Only a small part of it can
+ * be set through option0, the rest needs to be set through RX_DATA_ACK.
+ */
+#define MAX_RCV_WND ((1U << 27) - 1)
+
+/*
+ * Min receive window. We want it to be large enough to accommodate receive
+ * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
+ */
+#define MIN_RCV_WND (24 * 1024U)
+#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
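+/*
+ * Example: an inp_ip_tos of 0x10 (IPTOS_LOWDELAY) yields SO_TOS(so) == 0x4,
+ * assuming M_TOS is wide enough to pass the shifted value through.
+ */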
+
+#define VALIDATE_SEQ 0
+#define VALIDATE_SOCK(so)
+#define DEBUG_WR 0
+
+extern int tcp_do_autorcvbuf;
+extern int tcp_do_autosndbuf;
+extern int tcp_autorcvbuf_max;
+extern int tcp_autosndbuf_max;
+
+static void t3_send_reset(struct toepcb *toep);
+static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
+static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
+static void handle_syncache_event(int event, void *arg);
+
+
+static inline int
+is_t3a(const struct toedev *dev)
+{
+ return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
+}
+
+static void
+dump_toepcb(struct toepcb *toep)
+{
+ DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
+ toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
+ toep->tp_mtu_idx, toep->tp_tid);
+
+ DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
+ toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
+ toep->tp_mss_clamp, toep->tp_flags);
+}
+
+static struct rtentry *
+rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
+{
+ struct rtentry *rt = NULL;
+
+ if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
+ RT_UNLOCK(rt);
+
+ return (rt);
+}
+
+/*
+ * Determine whether to send a CPL message now or defer it. A message is
+ * deferred if the connection is in SYN_SENT since we don't know the TID yet.
+ * For connections in other states the message is sent immediately.
+ * If through_l2t is set the message is subject to ARP processing, otherwise
+ * it is sent directly.
+ */
+static inline void
+send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
+{
+ struct toepcb *toep = tp->t_toe;
+
+
+ if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
+ INP_LOCK(tp->t_inpcb);
+ mbufq_tail(&toep->out_of_order_queue, m); // defer
+ INP_UNLOCK(tp->t_inpcb);
+ } else if (through_l2t)
+ l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T
+ else
+ cxgb_ofld_send(T3C_DEV(so), m); // send directly
+}
+
+static inline unsigned int
+mkprio(unsigned int cntrl, const struct socket *so)
+{
+ return cntrl;
+}
+
+/*
+ * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
+ */
+static inline void
+mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
+{
+ struct cpl_tid_release *req;
+
+ m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req = mtod(m, struct cpl_tid_release *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+}
+
+static inline void
+make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct tx_data_wr *req;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ req = mtod(m, struct tx_data_wr *);
+ m->m_len = sizeof(*req);
+ req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+ req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
+ /* len includes the length of any HW ULP additions */
+ req->len = htonl(len);
+ req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
+ /* V_TX_ULP_SUBMODE sets both the mode and submode */
+ req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
+ V_TX_URG(/* skb_urgent(skb) */ 0 ) |
+ V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
+ (tail ? 0 : 1))));
+ req->sndseq = htonl(tp->snd_nxt);
+ if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
+ req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
+ V_TX_CPU_IDX(toep->tp_qset));
+
+		/* Send buffer is in units of 32KB. */
+ if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
+ req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
+ else
+ req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
+ toep->tp_flags |= TP_DATASENT;
+ }
+}
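+
+/*
+ * Worked example for the send-buffer encoding above: a 256KB buffer
+ * (262144 bytes) shifted right by 15 gives V_TX_SNDBUF(8), i.e. eight
+ * 32KB units.
+ */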
+
+int
+t3_push_frames(struct socket *so, int req_completion)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ struct mbuf *tail, *m0, *last;
+ struct t3cdev *cdev;
+ struct tom_data *d;
+ int bytes, count, total_bytes;
+ bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
+ segp = segs;
+
+ if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
+ DPRINTF("tcp state=%d\n", tp->t_state);
+ return (0);
+ }
+
+ if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
+ DPRINTF("disconnecting\n");
+
+ return (0);
+ }
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ SOCKBUF_LOCK(&so->so_snd);
+
+ d = TOM_DATA(TOE_DEV(so));
+ cdev = d->cdev;
+ last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
+ total_bytes = 0;
+ DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
+ toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);
+
+ if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
+ KASSERT(tail, ("sbdrop error"));
+ last = tail = tail->m_next;
+ }
+
+ if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
+ DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+ }
+
+ toep->tp_m_last = NULL;
+ while (toep->tp_wr_avail && (tail != NULL)) {
+ count = bytes = 0;
+ if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+ }
+ while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
+ && (tail != NULL) && (count < TX_MAX_SEGS)) {
+ bytes += tail->m_len;
+ count++;
+ last = tail;
+ /*
+ * technically an abuse to be using this for a VA
+ * but less gross than defining my own structure
+ * or calling pmap_kextract from here :-|
+ */
+ segp->ds_addr = (bus_addr_t)tail->m_data;
+ segp->ds_len = tail->m_len;
+ DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
+ count, mbuf_wrs[count], tail->m_data, tail->m_len);
+
+ segp++;
+ tail = tail->m_next;
+ }
+ DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
+ toep->tp_wr_avail, count, mbuf_wrs[count], tail);
+ if (tail) {
+ so->so_snd.sb_sndptr = tail;
+ toep->tp_m_last = NULL;
+ } else
+ toep->tp_m_last = so->so_snd.sb_sndptr = last;
+
+ DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
+
+ so->so_snd.sb_sndptroff += bytes;
+ total_bytes += bytes;
+ toep->tp_write_seq += bytes;
+
+
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ /*
+ * XXX can drop socket buffer lock here
+ */
+
+ toep->tp_wr_avail -= mbuf_wrs[count];
+ toep->tp_wr_unacked += mbuf_wrs[count];
+
+ make_tx_data_wr(so, m0, bytes, tail);
+ m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
+ m_set_sgl(m0, segs);
+ m_set_sgllen(m0, count);
+ /*
+ * remember credits used
+ */
+ m0->m_pkthdr.csum_data = mbuf_wrs[count];
+ m0->m_pkthdr.len = bytes;
+ if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
+ toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
+ struct work_request_hdr *wr = cplhdr(m0);
+
+ wr->wr_hi |= htonl(F_WR_COMPL);
+ toep->tp_wr_unacked = 0;
+ }
+
+ m0->m_type = MT_DONTFREE;
+ enqueue_wr(toep, m0);
+ DPRINTF("sending offload tx with %d bytes in %d segments\n",
+ bytes, count);
+
+ l2t_send(cdev, m0, toep->tp_l2t);
+ if (toep->tp_wr_avail && (tail != NULL))
+ SOCKBUF_LOCK(&so->so_snd);
+ }
+
+ SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
+ return (total_bytes);
+}
+
+/*
+ * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
+ * under any circumstances. We take the easy way out and always queue the
+ * message to the write_queue. We can optimize the case where the queue is
+ * already empty though the optimization is probably not worth it.
+ */
+static void
+close_conn(struct socket *so)
+{
+ struct mbuf *m;
+ struct cpl_close_con_req *req;
+ struct tom_data *d;
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp;
+ struct toepcb *toep;
+ unsigned int tid;
+
+
+ INP_LOCK(inp);
+ tp = sototcpcb(so);
+ toep = tp->t_toe;
+
+ if (tp->t_state != TCPS_SYN_SENT)
+ t3_push_frames(so, 1);
+
+ if (toep->tp_flags & TP_FIN_SENT) {
+ INP_UNLOCK(inp);
+ return;
+ }
+
+ tid = toep->tp_tid;
+
+ d = TOM_DATA(toep->tp_toedev);
+
+ m = m_gethdr_nofail(sizeof(*req));
+
+ toep->tp_flags |= TP_FIN_SENT;
+ req = mtod(m, struct cpl_close_con_req *);
+
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+ req->wr.wr_lo = htonl(V_WR_TID(tid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+ req->rsvd = htonl(toep->tp_write_seq);
+ INP_UNLOCK(inp);
+ /*
+ * XXX - need to defer shutdown while there is still data in the queue
+ *
+ */
+ cxgb_ofld_send(d->cdev, m);
+
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
+ * and send it along.
+ */
+static void
+abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
+{
+ struct cpl_abort_req *req = cplhdr(m);
+
+ req->cmd = CPL_ABORT_NO_RST;
+ cxgb_ofld_send(cdev, m);
+}
+
+/*
+ * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
+ * permitted to return without sending the message in case we cannot allocate
+ * an mbuf.  Returns the number of credits sent.
+ */
+uint32_t
+t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
+{
+ struct mbuf *m;
+ struct cpl_rx_data_ack *req;
+ struct toepcb *toep = tp->t_toe;
+ struct toedev *tdev = toep->tp_toedev;
+
+ m = m_gethdr_nofail(sizeof(*req));
+
+ DPRINTF("returning %u credits to HW\n", credits);
+
+ req = mtod(m, struct cpl_rx_data_ack *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
+ req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
+ m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+ return (credits);
+}
+
+
+/*
+ * Set of states for which we should return RX credits.
+ */
+#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
+
+/*
+ * Called after some received data has been read. It returns RX credits
+ * to the HW for the amount of data processed.
+ */
+void
+t3_cleanup_rbuf(struct tcpcb *tp)
+{
+ struct toepcb *toep = tp->t_toe;
+ struct socket *so;
+ struct toedev *dev;
+ int dack_mode, must_send, read;
+ u32 thres, credits, dack = 0;
+
+ if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
+ (tp->t_state == TCPS_FIN_WAIT_2)))
+ return;
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ so = tp->t_inpcb->inp_socket;
+ SOCKBUF_LOCK(&so->so_rcv);
+ read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
+ toep->tp_copied_seq += read;
+ toep->tp_enqueued_bytes -= read;
+ credits = toep->tp_copied_seq - toep->tp_rcv_wup;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (credits > so->so_rcv.sb_mbmax)
+ printf("copied_seq=%u rcv_wup=%u credits=%u\n",
+ toep->tp_copied_seq, toep->tp_rcv_wup, credits);
+ /*
+ * XXX this won't accurately reflect credit return - we need
+ * to look at the difference between the amount that has been
+ * put in the recv sockbuf and what is there now
+ */
+
+ if (__predict_false(!credits))
+ return;
+
+ dev = toep->tp_toedev;
+ thres = TOM_TUNABLE(dev, rx_credit_thres);
+
+ if (__predict_false(thres == 0))
+ return;
+
+ if (toep->tp_ulp_mode)
+ dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
+ else {
+ dack_mode = TOM_TUNABLE(dev, delack);
+ if (__predict_false(dack_mode != toep->tp_delack_mode)) {
+ u32 r = tp->rcv_nxt - toep->tp_delack_seq;
+
+ if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
+ dack = F_RX_DACK_CHANGE |
+ V_RX_DACK_MODE(dack_mode);
+ }
+ }
+
+ /*
+ * For coalescing to work effectively ensure the receive window has
+ * at least 16KB left.
+ */
+ must_send = credits + 16384 >= tp->rcv_wnd;
+
+ if (must_send || credits >= thres)
+ toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
+}
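+
+/*
+ * Worked example for the must_send test above: with a 64KB receive window,
+ * a credit return is forced once 49152 bytes (65536 - 16384) or more are
+ * outstanding, keeping at least 16KB of window available for coalescing.
+ */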
+
+static int
+cxgb_toe_disconnect(struct tcpcb *tp)
+{
+ struct socket *so;
+
+ DPRINTF("cxgb_toe_disconnect\n");
+
+ so = tp->t_inpcb->inp_socket;
+ close_conn(so);
+ return (0);
+}
+
+static int
+cxgb_toe_abort(struct tcpcb *tp)
+{
+ struct toepcb *toep = tp->t_toe;
+
+
+ t3_send_reset(toep);
+
+ /*
+ * unhook from socket
+ */
+ tp->t_flags &= ~TF_TOE;
+ toep->tp_tp = NULL;
+ tp->t_toe = NULL;
+ return (0);
+}
+
+static int
+cxgb_toe_send(struct tcpcb *tp)
+{
+ struct socket *so;
+
+ DPRINTF("cxgb_toe_send\n");
+ dump_toepcb(tp->t_toe);
+
+ so = tp->t_inpcb->inp_socket;
+ t3_push_frames(so, 1);
+ return (0);
+}
+
+static int
+cxgb_toe_rcvd(struct tcpcb *tp)
+{
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ t3_cleanup_rbuf(tp);
+
+ return (0);
+}
+
+static void
+cxgb_toe_detach(struct tcpcb *tp)
+{
+ struct toepcb *toep;
+ /*
+ * XXX how do we handle teardown in the SYN_SENT state?
+ *
+ */
+ INP_INFO_WLOCK(&tcbinfo);
+ toep = tp->t_toe;
+ toep->tp_tp = NULL;
+
+ /*
+ * unhook from socket
+ */
+ tp->t_flags &= ~TF_TOE;
+ tp->t_toe = NULL;
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+
+static struct toe_usrreqs cxgb_toe_usrreqs = {
+ .tu_disconnect = cxgb_toe_disconnect,
+ .tu_abort = cxgb_toe_abort,
+ .tu_send = cxgb_toe_send,
+ .tu_rcvd = cxgb_toe_rcvd,
+ .tu_detach = cxgb_toe_detach,
+ .tu_syncache_event = handle_syncache_event,
+};
+
+
+static void
+__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
+ uint64_t mask, uint64_t val, int no_reply)
+{
+ struct cpl_set_tcb_field *req;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ req = mtod(m, struct cpl_set_tcb_field *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
+ req->reply = V_NO_REPLY(no_reply);
+ req->cpu_idx = 0;
+ req->word = htons(word);
+ req->mask = htobe64(mask);
+ req->val = htobe64(val);
+
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+ send_or_defer(so, tp, m, 0);
+}
+
+static void
+t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
+{
+ struct mbuf *m;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ if (toep == NULL)
+ return;
+
+ if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
+ return;
+
+ m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
+
+ __set_tcb_field(so, m, word, mask, val, 1);
+}
+
+/*
+ * Set one of the t_flags bits in the TCB.
+ */
+static void
+set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
+{
+ t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
+}
+
+/*
+ * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
+ */
+static void
+t3_set_nagle(struct socket *so)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
+}
+
+/*
+ * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
+ */
+void
+t3_set_keepalive(struct socket *so, int on_off)
+{
+ set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
+}
+
+void
+t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
+{
+ set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
+}
+
+/*
+ * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
+ */
+static void
+t3_set_tos(struct socket *so)
+{
+ t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
+ V_TCB_TOS(SO_TOS(so)));
+}
+
+
+/*
+ * In DDP mode, TP fails to schedule a timer to push RX data to the host when
+ * DDP is disabled (data is delivered to the freelist).  [Note that the peer
+ * should set the PSH bit in the last segment, which would trigger delivery.]
+ * We work around the issue by setting a DDP buffer in a partial placed state,
+ * which guarantees that TP will schedule a timer.
+ */
+#define TP_DDP_TIMER_WORKAROUND_MASK\
+ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
+ ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
+ V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
+#define TP_DDP_TIMER_WORKAROUND_VAL\
+ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
+ ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
+ 32))
+
+static void
+t3_enable_ddp(struct socket *so, int on)
+{
+ if (on)
+ t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
+ V_TF_DDP_OFF(0));
+ else
+ t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_MASK,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_VAL);
+
+}
+
+
+void
+t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
+{
+ t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
+ V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
+ tag_color);
+}
+
+void
+t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+ unsigned int len)
+{
+ if (buf_idx == 0)
+ t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
+ V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
+ V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
+ V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
+ V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
+ else
+ t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
+ V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
+ V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
+ V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
+ V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
+}
+
+static int
+t3_set_cong_control(struct socket *so, const char *name)
+{
+#ifdef notyet
+ int cong_algo;
+
+ for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
+ if (!strcmp(name, t3_cong_ops[cong_algo].name))
+ break;
+
+ if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
+int
+t3_get_tcb(struct socket *so)
+{
+ struct cpl_get_tcb *req;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (!m)
+ return (ENOMEM);
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+ req = mtod(m, struct cpl_get_tcb *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
+ req->cpuno = htons(toep->tp_qset);
+ if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
+ mbufq_tail(&toep->out_of_order_queue, m); // defer
+ else
+ cxgb_ofld_send(T3C_DEV(so), m);
+ return 0;
+}
+
+static inline void
+so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
+{
+ struct toepcb *toep = sototoep(so);
+ toepcb_hold(toep);
+
+ cxgb_insert_tid(d->cdev, d->client, toep, tid);
+}
+
+/**
+ * find_best_mtu - find the entry in the MTU table closest to an MTU
+ * @d: TOM state
+ * @mtu: the target MTU
+ *
+ * Returns the index of the value in the MTU table that is closest to but
+ * does not exceed the target MTU.
+ */
+static unsigned int
+find_best_mtu(const struct t3c_data *d, unsigned short mtu)
+{
+ int i = 0;
+
+ while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
+ ++i;
+ return (i);
+}
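+
+/*
+ * Example: with a hypothetical MTU table of {1500, 2000, 9000} and a target
+ * of 4000, the loop above stops at index 1 (2000), the largest entry that
+ * does not exceed the target.
+ */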
+
+static unsigned int
+select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
+{
+ unsigned int idx;
+
+#ifdef notyet
+ struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
+#endif
+ if (tp) {
+ tp->t_maxseg = pmtu - 40;
+ if (tp->t_maxseg < td->mtus[0] - 40)
+ tp->t_maxseg = td->mtus[0] - 40;
+ idx = find_best_mtu(td, tp->t_maxseg + 40);
+
+ tp->t_maxseg = td->mtus[idx] - 40;
+ } else
+ idx = find_best_mtu(td, pmtu);
+
+ return (idx);
+}
+
+void
+t3_release_ddp_resources(struct toepcb *toep)
+{
+ /*
+ * This is a no-op until we have DDP support
+ */
+}
+
+static inline void
+free_atid(struct t3cdev *cdev, unsigned int tid)
+{
+ struct toepcb *toep = cxgb_free_atid(cdev, tid);
+
+ if (toep)
+ toepcb_release(toep);
+}
+
+/*
+ * Release resources held by an offload connection (TID, L2T entry, etc.)
+ */
+static void
+t3_release_offload_resources(struct toepcb *toep)
+{
+ struct tcpcb *tp = toep->tp_tp;
+ struct toedev *tdev = toep->tp_toedev;
+ struct t3cdev *cdev;
+ unsigned int tid = toep->tp_tid;
+
+ if (!tdev)
+ return;
+
+ cdev = TOEP_T3C_DEV(toep);
+ if (!cdev)
+ return;
+
+ toep->tp_qset = 0;
+ t3_release_ddp_resources(toep);
+
+#ifdef CTRL_SKB_CACHE
+ kfree_skb(CTRL_SKB_CACHE(tp));
+ CTRL_SKB_CACHE(tp) = NULL;
+#endif
+
+ if (toep->tp_wr_avail != toep->tp_wr_max) {
+ purge_wr_queue(toep);
+ reset_wr_list(toep);
+ }
+
+ if (toep->tp_l2t) {
+ l2t_release(L2DATA(cdev), toep->tp_l2t);
+ toep->tp_l2t = NULL;
+ }
+ printf("setting toep->tp_tp to NULL\n");
+
+ toep->tp_tp = NULL;
+ if (tp) {
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ tp->t_toe = NULL;
+ tp->t_flags &= ~TF_TOE;
+ }
+
+ if (toep->tp_state == TCPS_SYN_SENT) {
+ free_atid(cdev, tid);
+#ifdef notyet
+ __skb_queue_purge(&tp->out_of_order_queue);
+#endif
+ } else { // we have TID
+ cxgb_remove_tid(cdev, toep, tid);
+ toepcb_release(toep);
+ }
+#if 0
+ log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
+#endif
+}
+
+static void
+install_offload_ops(struct socket *so)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ KASSERT(tp->t_toe != NULL, ("toepcb not set"));
+
+ t3_install_socket_ops(so);
+ tp->t_flags |= TF_TOE;
+ tp->t_tu = &cxgb_toe_usrreqs;
+}
+
+/*
+ * Determine the receive window scaling factor given a target max
+ * receive window.
+ */
+static __inline int
+select_rcv_wscale(int space)
+{
+ int wscale = 0;
+
+ if (space > MAX_RCV_WND)
+ space = MAX_RCV_WND;
+
+ if (tcp_do_rfc1323)
+ for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
+ return wscale;
+}
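+
+/*
+ * Example: with RFC 1323 enabled, a 1MB window (1048576 bytes) needs five
+ * halvings before it fits in 16 bits (1048576 >> 5 == 32768), so the
+ * function returns a window scale of 5.
+ */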
+
+/*
+ * Determine the receive window size for a socket.
+ */
+static unsigned int
+select_rcv_wnd(struct socket *so)
+{
+ struct toedev *dev = TOE_DEV(so);
+ struct tom_data *d = TOM_DATA(dev);
+ unsigned int wnd;
+ unsigned int max_rcv_wnd;
+
+ if (tcp_do_autorcvbuf)
+ wnd = tcp_autorcvbuf_max;
+ else
+ wnd = sbspace(&so->so_rcv);
+
+ /* XXX
+ * For receive coalescing to work effectively we need a receive window
+	 * that can accommodate a coalesced segment.
+ */
+ if (wnd < MIN_RCV_WND)
+ wnd = MIN_RCV_WND;
+
+ /* PR 5138 */
+ max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
+ (uint32_t)d->rx_page_size * 23 :
+ MAX_RCV_WND);
+
+ return min(wnd, max_rcv_wnd);
+}
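+
+/*
+ * Example: assuming 4KB receive pages, the T3B clamp above works out to
+ * 23 * 4096 = 94208 bytes; on other revisions the window may grow up to
+ * MAX_RCV_WND.
+ */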
+
+/*
+ * Assign offload parameters to some socket fields. This code is used by
+ * both active and passive opens.
+ */
+static inline void
+init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
+ struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
+
+ SOCK_LOCK_ASSERT(so);
+
+ printf("initializing offload socket\n");
+#ifdef notyet
+ /*
+ * We either need to fix push frames to work with sbcompress
+ * or we need to add this
+ */
+ so->so_rcv.sb_flags |= SB_TOE;
+ so->so_snd.sb_flags |= SB_TOE;
+#endif
+ tp->t_toe = toep;
+ toep->tp_tp = tp;
+ toep->tp_toedev = dev;
+
+ toep->tp_tid = tid;
+ toep->tp_l2t = e;
+ toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
+ toep->tp_wr_unacked = 0;
+ toep->tp_delack_mode = 0;
+
+ toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
+ /*
+ * XXX broken
+ *
+ */
+ tp->rcv_wnd = select_rcv_wnd(so);
+ toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
+ tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
+ toep->tp_qset_idx = 0;
+
+ reset_wr_list(toep);
+ DPRINTF("initialization done\n");
+}
+
+/*
+ * The next two functions calculate the option 0 value for a socket.
+ */
+static inline unsigned int
+calc_opt0h(struct socket *so, int mtu_idx)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ int wscale = select_rcv_wscale(tp->rcv_wnd);
+
+ return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
+ V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
+ V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
+}
+
+static inline unsigned int
+calc_opt0l(struct socket *so, int ulp_mode)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ unsigned int val;
+
+ val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
+ V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
+
+ DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
+ return (val);
+}
+
+static inline unsigned int
+calc_opt2(const struct socket *so, struct toedev *dev)
+{
+ int flv_valid;
+
+ flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
+
+ return V_FLAVORS_VALID(flv_valid) |
+ V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
+}
+#if 0
+(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
+#endif
+
+static void
+mk_act_open_req(struct socket *so, struct mbuf *m,
+ unsigned int atid, const struct l2t_entry *e)
+{
+ struct cpl_act_open_req *req;
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ struct toepcb *toep = tp->t_toe;
+ struct toedev *tdev = TOE_DEV(so);
+
+ m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));
+
+ req = mtod(m, struct cpl_act_open_req *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
+ req->local_port = inp->inp_lport;
+ req->peer_port = inp->inp_fport;
+ memcpy(&req->local_ip, &inp->inp_laddr, 4);
+ memcpy(&req->peer_ip, &inp->inp_faddr, 4);
+ DPRINTF("connect smt_idx=%d\n", e->smt_idx);
+ req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
+ V_TX_CHANNEL(e->smt_idx));
+ req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
+ req->params = 0;
+ req->opt2 = htonl(calc_opt2(so, tdev));
+}
+
+
+/*
+ * Convert an ACT_OPEN_RPL status to an errno.
+ */
+static int
+act_open_rpl_status_to_errno(int status)
+{
+ switch (status) {
+ case CPL_ERR_CONN_RESET:
+ return (ECONNREFUSED);
+ case CPL_ERR_ARP_MISS:
+ return (EHOSTUNREACH);
+ case CPL_ERR_CONN_TIMEDOUT:
+ return (ETIMEDOUT);
+ case CPL_ERR_TCAM_FULL:
+ return (ENOMEM);
+ case CPL_ERR_CONN_EXIST:
+ log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
+ return (EADDRINUSE);
+ default:
+ return (EIO);
+ }
+}
+
+static void
+fail_act_open(struct toepcb *toep, int errno)
+{
+ struct tcpcb *tp = toep->tp_tp;
+
+ t3_release_offload_resources(toep);
+ if (tp) {
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ tcp_drop(tp, errno);
+ }
+
+#ifdef notyet
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+#endif
+}
+
+/*
+ * Handle active open failures.
+ */
+static void
+active_open_failed(struct toepcb *toep, struct mbuf *m)
+{
+ struct cpl_act_open_rpl *rpl = cplhdr(m);
+ struct inpcb *inp;
+
+ INP_INFO_WLOCK(&tcbinfo);
+ if (toep->tp_tp == NULL)
+ goto done;
+
+ inp = toep->tp_tp->t_inpcb;
+ INP_LOCK(inp);
+
+	/*
+	 * Don't handle connection retry for now
+	 */
+#ifdef notyet
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (rpl->status == CPL_ERR_CONN_EXIST &&
+ icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
+ icsk->icsk_retransmit_timer.function = act_open_retry_timer;
+ sk_reset_timer(so, &icsk->icsk_retransmit_timer,
+ jiffies + HZ / 2);
+ } else
+#endif
+ fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
+ INP_UNLOCK(inp);
+done:
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ m_free(m);
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int
+act_open_has_tid(int status)
+{
+ return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+ status != CPL_ERR_ARP_MISS;
+}
+
+/*
+ * Process an ACT_OPEN_RPL CPL message.
+ */
+static int
+do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct cpl_act_open_rpl *rpl = cplhdr(m);
+
+ if (cdev->type != T3A && act_open_has_tid(rpl->status))
+ cxgb_queue_tid_release(cdev, GET_TID(rpl));
+
+ active_open_failed(toep, m);
+ return (0);
+}
+
+/*
+ * Handle an ARP failure for an active open. XXX purge ofo queue
+ *
+ * XXX badly broken for crossed SYNs as the ATID is no longer valid.
+ * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
+ * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
+ * free the atid. Hmm.
+ */
+#ifdef notyet
+static void
+act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
+{
+ struct toepcb *toep = m_get_toep(m);
+ struct tcpcb *tp = toep->tp_tp;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = toeptoso(toep);
+
+ INP_LOCK(inp);
+ if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
+		fail_act_open(toep, EHOSTUNREACH);
+ printf("freeing %p\n", m);
+
+ m_free(m);
+ }
+ INP_UNLOCK(inp);
+}
+#endif
+/*
+ * Send an active open request.
+ */
+int
+t3_connect(struct toedev *tdev, struct socket *so,
+ struct rtentry *rt, struct sockaddr *nam)
+{
+ struct mbuf *m;
+ struct l2t_entry *e;
+ struct tom_data *d = TOM_DATA(tdev);
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ struct toepcb *toep; /* allocated by init_offload_socket */
+
+ int atid;
+
+ toep = toepcb_alloc();
+ if (toep == NULL)
+ goto out_err;
+
+ if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
+ goto out_err;
+
+ e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
+ if (!e)
+ goto free_tid;
+
+ INP_LOCK_ASSERT(inp);
+	m = m_gethdr(M_WAITOK, MT_DATA);
+
+#if 0
+ m->m_toe.mt_toepcb = tp->t_toe;
+ set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
+#endif
+ SOCK_LOCK(so);
+
+ init_offload_socket(so, tdev, atid, e, rt, toep);
+
+ install_offload_ops(so);
+
+ mk_act_open_req(so, m, atid, e);
+ SOCK_UNLOCK(so);
+
+ soisconnecting(so);
+ toep = tp->t_toe;
+ m_set_toep(m, tp->t_toe);
+
+ printf("sending off request\n");
+
+ toep->tp_state = TCPS_SYN_SENT;
+ l2t_send(d->cdev, (struct mbuf *)m, e);
+
+ if (toep->tp_ulp_mode)
+ t3_enable_ddp(so, 0);
+ return (0);
+
+free_tid:
+ printf("failing connect - free atid\n");
+
+ free_atid(d->cdev, atid);
+out_err:
+ printf("return ENOMEM\n");
+ return (ENOMEM);
+}
+
+/*
+ * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
+ * not send multiple ABORT_REQs for the same connection and also that we do
+ * not try to send a message after the connection has closed.
+ */
+static void
+t3_send_reset(struct toepcb *toep)
+{
+
+ struct cpl_abort_req *req;
+ unsigned int tid = toep->tp_tid;
+ int mode = CPL_ABORT_SEND_RST;
+ struct tcpcb *tp = toep->tp_tp;
+ struct toedev *tdev = toep->tp_toedev;
+ struct socket *so = NULL;
+ struct mbuf *m;
+
+ if (tp) {
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ so = toeptoso(toep);
+ }
+
+ if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
+ tdev == NULL))
+ return;
+ toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
+
+ /* Purge the send queue so we don't send anything after an abort. */
+ if (so)
+ sbflush(&so->so_snd);
+ if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
+ mode |= CPL_ABORT_POST_CLOSE_REQ;
+
+ m = m_gethdr_nofail(sizeof(*req));
+ m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
+ set_arp_failure_handler(m, abort_arp_failure);
+
+ req = mtod(m, struct cpl_abort_req *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+ req->wr.wr_lo = htonl(V_WR_TID(tid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
+ req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
+ req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
+ req->cmd = mode;
+ if (tp && (tp->t_state == TCPS_SYN_SENT))
+ mbufq_tail(&toep->out_of_order_queue, m); // defer
+ else
+ l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
+}
+
+static int
+t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct inpcb *inp;
+ int error, optval;
+
+ if (sopt->sopt_name == IP_OPTIONS)
+ return (ENOPROTOOPT);
+
+ if (sopt->sopt_name != IP_TOS)
+ return (EOPNOTSUPP);
+
+ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+
+ if (error)
+ return (error);
+
+	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
+ return (EPERM);
+
+ inp = sotoinpcb(so);
+ inp->inp_ip_tos = optval;
+
+ t3_set_tos(so);
+
+ return (0);
+}
+
+static int
+t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int err = 0;
+ size_t copied;
+
+ if (sopt->sopt_name != TCP_CONGESTION &&
+ sopt->sopt_name != TCP_NODELAY)
+ return (EOPNOTSUPP);
+
+ if (sopt->sopt_name == TCP_CONGESTION) {
+ char name[TCP_CA_NAME_MAX];
+ int optlen = sopt->sopt_valsize;
+ struct tcpcb *tp;
+
+ if (optlen < 1)
+ return (EINVAL);
+
+ err = copyinstr(sopt->sopt_val, name,
+ min(TCP_CA_NAME_MAX - 1, optlen), &copied);
+ if (err)
+ return (err);
+ if (copied < 1)
+ return (EINVAL);
+
+ tp = sototcpcb(so);
+ /*
+ * XXX I need to revisit this
+ */
+ if ((err = t3_set_cong_control(so, name)) == 0) {
+#ifdef notyet
+ tp->t_cong_control = strdup(name, M_CXGB);
+#endif
+ } else
+ return (err);
+ } else {
+ int optval, oldval;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ err = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+
+ if (err)
+ return (err);
+
+ inp = sotoinpcb(so);
+ tp = intotcpcb(inp);
+
+ INP_LOCK(inp);
+
+ oldval = tp->t_flags;
+ if (optval)
+ tp->t_flags |= TF_NODELAY;
+ else
+ tp->t_flags &= ~TF_NODELAY;
+ INP_UNLOCK(inp);
+
+ if (oldval != tp->t_flags)
+ t3_set_nagle(so);
+
+ }
+
+ return (0);
+}
+
+static int
+t3_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int err;
+
+ if (sopt->sopt_level != IPPROTO_TCP)
+ err = t3_ip_ctloutput(so, sopt);
+ else
+ err = t3_tcp_ctloutput(so, sopt);
+
+ if (err != EOPNOTSUPP)
+ return (err);
+
+ return tcp_ctloutput(so, sopt);
+}
+
+/*
+ * Process new data received for a connection.
+ */
+static void
+new_rx_data(struct toepcb *toep, struct mbuf *m)
+{
+ struct cpl_rx_data *hdr = cplhdr(m);
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so = toeptoso(toep);
+ int len = be16toh(hdr->len);
+
+ INP_LOCK(tp->t_inpcb);
+
+#ifdef notyet
+ if (__predict_false(sk_no_receive(sk))) {
+ handle_excess_rx(so, skb);
+ return;
+ }
+
+ if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
+ handle_ddp_data(so, skb);
+
+ TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
+ TCP_SKB_CB(skb)->flags = 0;
+ skb_ulp_mode(skb) = 0; /* for iSCSI */
+#endif
+#if VALIDATE_SEQ
+ if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
+ printk(KERN_ERR
+ "%s: TID %u: Bad sequence number %u, expected %u\n",
+ TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt);
+ __kfree_skb(skb);
+ return;
+ }
+#endif
+ m_adj(m, sizeof(*hdr));
+
+#ifdef notyet
+ /*
+ * We don't handle urgent data yet
+ */
+ if (__predict_false(hdr->urg))
+ handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
+ if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
+ tp->urg_seq - tp->rcv_nxt < skb->len))
+ tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
+ tp->rcv_nxt];
+#endif
+ if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
+ toep->tp_delack_mode = hdr->dack_mode;
+ toep->tp_delack_seq = tp->rcv_nxt;
+ }
+
+ DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);
+
+ if (len < m->m_pkthdr.len)
+ m->m_pkthdr.len = m->m_len = len;
+
+ tp->rcv_nxt += m->m_pkthdr.len;
+ tp->t_rcvtime = ticks;
+ toep->tp_enqueued_bytes += m->m_pkthdr.len;
+#ifdef T3_TRACE
+ T3_TRACE2(TIDTB(sk),
+ "new_rx_data: seq 0x%x len %u",
+ TCP_SKB_CB(skb)->seq, skb->len);
+#endif
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sb_notify(&so->so_rcv))
+ DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
+
+ sbappend_locked(&so->so_rcv, m);
+	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,
+	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
+	    so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
+
+ INP_UNLOCK(tp->t_inpcb);
+ DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
+ so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
+
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
+}
+
+/*
+ * Handler for RX_DATA CPL messages.
+ */
+static int
+do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
+
+ new_rx_data(toep, m);
+
+ return (0);
+}
+
+static void
+new_rx_data_ddp(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_data_ddp *hdr;
+ unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
+
+#ifdef notyet
+ if (unlikely(sk_no_receive(sk))) {
+ handle_excess_rx(so, m);
+ return;
+ }
+#endif
+ tp = sototcpcb(so);
+ q = &toep->tp_ddp_state;
+ hdr = cplhdr(m);
+ ddp_report = ntohl(hdr->u.ddp_report);
+ buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+ bsp = &q->buf_state[buf_idx];
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
+ "hdr seq 0x%x len %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
+ ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
+ T3_TRACE1(TIDTB(sk),
+ "new_rx_data_ddp: ddp_report 0x%x",
+ ddp_report);
+#endif
+
+ ddp_len = ntohs(hdr->len);
+ rcv_nxt = ntohl(hdr->seq) + ddp_len;
+
+ /*
+ * Overload to store old rcv_next
+ */
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+ tp->rcv_nxt = rcv_nxt;
+
+ /*
+	 * Store the length in m->m_len.  We are changing the meaning of
+	 * m->m_len here, so we need to be very careful that nothing from now
+	 * on interprets the length of this packet the usual way.
+ */
+ m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;
+
+ /*
+	 * Figure out where the new data was placed in the buffer and store it
+	 * in 'when'.  Assumes the buffer offset starts at 0; the consumer needs
+	 * to account for the page pod's pg_offset.
+ */
+ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
+#ifdef notyet
+ TCP_SKB_CB(skb)->when = end_offset - skb->len;
+
+ /*
+ * We store in mac.raw the address of the gather list where the
+ * placement happened.
+ */
+ skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+ bsp->cur_offset = end_offset;
+
+ /*
+ * Bit 0 of flags stores whether the DDP buffer is completed.
+ * Note that other parts of the code depend on this being in bit 0.
+ */
+ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
+#if 0
+ TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */
+#endif
+ panic("spurious ddp completion");
+ } else {
+ m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
+ if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1; /* flip buffers */
+ }
+
+ if (bsp->flags & DDP_BF_NOCOPY) {
+ m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ }
+
+ if (ddp_report & F_DDP_PSH)
+ m->m_pkthdr.csum_flags |= DDP_BF_PSH;
+
+ tp->t_rcvtime = ticks;
+ sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+#endif
+}
+
+#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
+ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
+ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
+ F_DDP_INVALID_PPOD)
+
+/*
+ * Handler for RX_DATA_DDP CPL messages.
+ */
+static int
+do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = ctx;
+ struct socket *so = toeptoso(toep);
+ const struct cpl_rx_data_ddp *hdr = cplhdr(m);
+
+ VALIDATE_SOCK(so);
+
+ if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
+ log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
+ GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
+ return CPL_RET_BUF_DONE;
+ }
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+ new_rx_data_ddp(so, m);
+ return (0);
+}
+
+static void
+process_ddp_complete(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_ddp_complete *hdr;
+ unsigned int ddp_report, buf_idx, when;
+
+#ifdef notyet
+ if (unlikely(sk_no_receive(sk))) {
+ handle_excess_rx(sk, skb);
+ return;
+ }
+#endif
+ q = &toep->tp_ddp_state;
+ hdr = cplhdr(m);
+ ddp_report = ntohl(hdr->ddp_report);
+ buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+ bsp = &q->buf_state[buf_idx];
+
+ when = bsp->cur_offset;
+ m->m_len = G_DDP_OFFSET(ddp_report) - when;
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report 0x%x offset %u, len %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report), skb->len);
+#endif
+
+ bsp->cur_offset += m->m_len;
+
+ if (!(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1; /* flip buffers */
+
+#ifdef T3_TRACE
+ T3_TRACE4(TIDTB(sk),
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report));
+#endif
+#if 0
+ skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+ m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
+ if (bsp->flags & DDP_BF_NOCOPY)
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+ tp->rcv_nxt += m->m_len;
+
+ tp->t_rcvtime = ticks;
+ sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+#endif
+}
+
+/*
+ * Handler for RX_DDP_COMPLETE CPL messages.
+ */
+static int
+do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+ process_ddp_complete(so, m);
+ return (0);
+}
+
+/*
+ * Move a socket to TIME_WAIT state. We need to make some adjustments to the
+ * socket state before calling tcp_time_wait to comply with its expectations.
+ */
+static void
+enter_timewait(struct socket *so)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Bump rcv_nxt for the peer FIN. We don't do this at the time we
+ * process peer_close because we don't want to carry the peer FIN in
+ * the socket's receive queue and if we increment rcv_nxt without
+ * having the FIN in the receive queue we'll confuse facilities such
+	 * as FIONREAD.
+ */
+ tp->rcv_nxt++;
+
+ tp->ts_recent_age = 0; /* defeat recycling */
+ tp->t_srtt = 0; /* defeat tcp_update_metrics */
+ tcp_twstart(tp);
+}
+
+/*
+ * Handle a peer FIN.
+ */
+static void
+do_peer_fin(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ int keep = 0, dead = (so->so_state & SS_NOFDREF);
+
+ DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);
+
+#ifdef T3_TRACE
+ T3_TRACE0(TIDTB(sk),"do_peer_fin:");
+#endif
+
+ if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
+ printf("abort_pending set\n");
+
+ goto out;
+ }
+
+#ifdef notyet
+ if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
+ keep = handle_peer_close_data(so, skb);
+ if (keep < 0)
+ return;
+ }
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(so, SOCK_DONE);
+#endif
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(tp->t_inpcb);
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
+ socantrcvmore(so);
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_CLOSE_WAIT;
+ break;
+ case TCPS_FIN_WAIT_1:
+ tp->t_state = TCPS_CLOSING;
+ break;
+ case TCPS_FIN_WAIT_2:
+ /*
+ * If we've sent an abort_req we must have sent it too late,
+ * HW will send us a reply telling us so, and this peer_close
+ * is really the last message for this connection and needs to
+ * be treated as an abort_rpl, i.e., transition the connection
+		 * to TCPS_CLOSED (note that the host stack does this at the
+ * time of generating the RST but we must wait for HW).
+ * Otherwise we enter TIME_WAIT.
+ */
+ t3_release_offload_resources(toep);
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ tp = tcp_close(tp);
+ } else
+ enter_timewait(so);
+ break;
+ default:
+ log(LOG_ERR,
+ "%s: TID %u received PEER_CLOSE in bad state %d\n",
+ TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+
+ if (!dead) {
+ DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
+
+ sorwakeup(so);
+ sowwakeup(so);
+ wakeup(&so->so_timeo);
+#ifdef notyet
+ sk->sk_state_change(sk);
+
+ /* Do not send POLL_HUP for half duplex close. */
+ if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(so, 1, POLL_HUP);
+ else
+ sk_wake_async(so, 1, POLL_IN);
+#endif
+ }
+out:
+ if (!keep)
+ m_free(m);
+}
+
+/*
+ * Handler for PEER_CLOSE CPL messages.
+ */
+static int
+do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+
+ do_peer_fin(so, m);
+ return (0);
+}
+
+static void
+process_close_con_rpl(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct cpl_close_con_rpl *rpl = cplhdr(m);
+ struct toepcb *toep = tp->t_toe;
+
+ tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
+
+ DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
+ !!(so->so_state & SS_NOFDREF));
+ if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
+ goto out;
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(tp->t_inpcb);
+ switch (tp->t_state) {
+ case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
+ t3_release_offload_resources(toep);
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ tp = tcp_close(tp);
+
+ } else
+ enter_timewait(so);
+ break;
+ case TCPS_LAST_ACK:
+ /*
+ * In this state we don't care about pending abort_rpl.
+ * If we've sent abort_req it was post-close and was sent too
+ * late, this close_con_rpl is the actual last message.
+ */
+ t3_release_offload_resources(toep);
+ tp = tcp_close(tp);
+ break;
+ case TCPS_FIN_WAIT_1:
+#ifdef notyet
+ dst_confirm(sk->sk_dst_cache);
+#endif
+ soisdisconnecting(so);
+
+ if ((so->so_state & SS_NOFDREF) == 0) {
+ /*
+ * Wake up lingering close
+ */
+ sowwakeup(so);
+ sorwakeup(so);
+ wakeup(&so->so_timeo);
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
+ (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
+ tp = tcp_drop(tp, 0);
+ }
+
+ break;
+ default:
+ log(LOG_ERR,
+ "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
+ TOE_DEV(so)->tod_name, toep->tp_tid,
+ tp->t_state);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+out:
+ m_free(m);
+}
+
+/*
+ * Handler for CLOSE_CON_RPL CPL messages.
+ */
+static int
+do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
+ void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+
+ process_close_con_rpl(so, m);
+ return (0);
+}
+
+/*
+ * Process abort replies. We only process these messages if we anticipate
+ * them as the coordination between SW and HW in this area is somewhat lacking
+ * and sometimes we get ABORT_RPLs after we are done with the connection that
+ * originated the ABORT_REQ.
+ */
+static void
+process_abort_rpl(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+#ifdef T3_TRACE
+ T3_TRACE1(TIDTB(sk),
+ "process_abort_rpl: GTS rpl pending %d",
+ sock_flag(sk, ABORT_RPL_PENDING));
+#endif
+ INP_LOCK(tp->t_inpcb);
+
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ /*
+ * XXX panic on tcpdrop
+ */
+ if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
+ toep->tp_flags |= TP_ABORT_RPL_RCVD;
+ else {
+ toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
+ if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
+ !is_t3a(TOE_DEV(so))) {
+ if (toep->tp_flags & TP_ABORT_REQ_RCVD)
+ panic("TP_ABORT_REQ_RCVD set");
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(tp->t_inpcb);
+ t3_release_offload_resources(toep);
+ tp = tcp_close(tp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ }
+ }
+ }
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+
+ m_free(m);
+}
+
+/*
+ * Handle an ABORT_RPL_RSS CPL message.
+ */
+static int
+do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct socket *so;
+ struct cpl_abort_rpl_rss *rpl = cplhdr(m);
+ struct toepcb *toep;
+
+ /*
+ * Ignore replies to post-close aborts indicating that the abort was
+ * requested too late. These connections are terminated when we get
+ * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
+ * arrives the TID is either no longer used or it has been recycled.
+ */
+ if (rpl->status == CPL_ERR_ABORT_FAILED) {
+discard:
+ m_free(m);
+ return (0);
+ }
+
+ toep = (struct toepcb *)ctx;
+
+ /*
+ * Sometimes we've already closed the socket, e.g., a post-close
+ * abort races with ABORT_REQ_RSS, the latter frees the socket
+ * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
+ * but FW turns the ABORT_REQ into a regular one and so we get
+ * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
+ */
+ if (!toep)
+ goto discard;
+
+ if (toep->tp_tp == NULL) {
+ printf("removing tid for abort\n");
+ cxgb_remove_tid(cdev, toep, toep->tp_tid);
+ if (toep->tp_l2t)
+ l2t_release(L2DATA(cdev), toep->tp_l2t);
+
+ toepcb_release(toep);
+ goto discard;
+ }
+
+ printf("toep=%p\n", toep);
+ printf("tp=%p\n", toep->tp_tp);
+
+ so = toeptoso(toep); /* <- XXX panic */
+ toepcb_hold(toep);
+ process_abort_rpl(so, m);
+ toepcb_release(toep);
+ return (0);
+}
+
+/*
+ * Convert the status code of an ABORT_REQ into an errno value.  Also
+ * indicate whether RST should be sent in response.
+ */
+static int
+abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ switch (abort_reason) {
+ case CPL_ERR_BAD_SYN:
+#if 0
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
+#endif
+ case CPL_ERR_CONN_RESET:
+ // XXX need to handle SYN_RECV due to crossed SYNs
+ return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
+ case CPL_ERR_XMIT_TIMEDOUT:
+ case CPL_ERR_PERSIST_TIMEDOUT:
+ case CPL_ERR_FINWAIT2_TIMEDOUT:
+ case CPL_ERR_KEEPALIVE_TIMEDOUT:
+#if 0
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
+#endif
+ return (ETIMEDOUT);
+ default:
+ return (EIO);
+ }
+}
+
+static inline void
+set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
+{
+ struct cpl_abort_rpl *rpl = cplhdr(m);
+
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+ rpl->wr.wr_lo = htonl(V_WR_TID(tid));
+ m->m_len = m->m_pkthdr.len = sizeof(*rpl);
+
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
+ rpl->cmd = cmd;
+}
+
+static void
+send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
+{
+ struct mbuf *reply_mbuf;
+ struct cpl_abort_req_rss *req = cplhdr(m);
+
+ reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
+	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
+ set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
+ m_free(m);
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is negative advice.
+ */
+static inline int
+is_neg_adv_abort(unsigned int status)
+{
+ return status == CPL_ERR_RTX_NEG_ADVICE ||
+ status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static void
+send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
+{
+ struct mbuf *reply_mbuf;
+ struct cpl_abort_req_rss *req = cplhdr(m);
+
+ reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (!reply_mbuf) {
+		/* Defer the reply.  Stick rst_status into req->status. */
+ req->status = rst_status;
+ t3_defer_reply(m, tdev, send_deferred_abort_rpl);
+ return;
+ }
+
+ m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
+ set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
+ m_free(m);
+
+ /*
+ * XXX need to sync with ARP as for SYN_RECV connections we can send
+ * these messages while ARP is pending. For other connection states
+ * it's not a problem.
+ */
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
+}
+
+#ifdef notyet
+static void
+cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
+{
+ UNIMPLEMENTED();
+#ifdef notyet
+ struct request_sock *req = child->sk_user_data;
+
+ inet_csk_reqsk_queue_removed(parent, req);
+ synq_remove(tcp_sk(child));
+ __reqsk_free(req);
+ child->sk_user_data = NULL;
+#endif
+}
+
+
+/*
+ * Performs the actual work to abort a SYN_RECV connection.
+ */
+static void
+do_abort_syn_rcv(struct socket *child, struct socket *parent)
+{
+ struct tcpcb *parenttp = sototcpcb(parent);
+ struct tcpcb *childtp = sototcpcb(child);
+
+ /*
+ * If the server is still open we clean up the child connection,
+ * otherwise the server already did the clean up as it was purging
+ * its SYN queue and the skb was just sitting in its backlog.
+ */
+ if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
+ cleanup_syn_rcv_conn(child, parent);
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(childtp->t_inpcb);
+ t3_release_offload_resources(childtp->t_toe);
+ childtp = tcp_close(childtp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ if (childtp)
+ INP_UNLOCK(childtp->t_inpcb);
+ }
+}
+#endif
+
+/*
+ * Handle abort requests for a SYN_RECV connection. These need extra work
+ * because the socket is on its parent's SYN queue.
+ */
+static int
+abort_syn_rcv(struct socket *so, struct mbuf *m)
+{
+ UNIMPLEMENTED();
+#ifdef notyet
+ struct socket *parent;
+ struct toedev *tdev = TOE_DEV(so);
+ struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
+ struct socket *oreq = so->so_incomp;
+ struct t3c_tid_entry *t3c_stid;
+ struct tid_info *t;
+
+ if (!oreq)
+ return -1; /* somehow we are not on the SYN queue */
+
+ t = &(T3C_DATA(cdev))->tid_maps;
+ t3c_stid = lookup_stid(t, oreq->ts_recent);
+ parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
+
+ SOCK_LOCK(parent);
+ do_abort_syn_rcv(so, parent);
+ send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
+ SOCK_UNLOCK(parent);
+#endif
+ return (0);
+}
+
+/*
+ * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
+ * request except that we need to reply to it.
+ */
+static void
+process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
+{
+ int rst_status = CPL_ABORT_NO_RST;
+ const struct cpl_abort_req_rss *req = cplhdr(m);
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ INP_LOCK(tp->t_inpcb);
+ if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
+ toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
+ m_free(m);
+ goto skip;
+ }
+
+ toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
+ /*
+ * Three cases to consider:
+ * a) We haven't sent an abort_req; close the connection.
+ * b) We have sent a post-close abort_req that will get to TP too late
+ * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
+ * be ignored and the connection should be closed now.
+ * c) We have sent a regular abort_req that will get to TP too late.
+ * That will generate an abort_rpl with status 0, wait for it.
+ */
+ if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
+ (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
+ so->so_error = abort_status_to_errno(so, req->status,
+ &rst_status);
+#if 0
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+#endif
+ /*
+ * SYN_RECV needs special processing. If abort_syn_rcv()
+ * returns 0 it has taken care of the abort.
+ */
+ if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
+ goto skip;
+
+ t3_release_offload_resources(toep);
+ tp = tcp_close(tp);
+ }
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+ send_abort_rpl(m, tdev, rst_status);
+ return;
+
+skip:
+ INP_UNLOCK(tp->t_inpcb);
+}
+
+/*
+ * Handle an ABORT_REQ_RSS CPL message.
+ */
+static int
+do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ const struct cpl_abort_req_rss *req = cplhdr(m);
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct socket *so;
+ struct inpcb *inp;
+
+ if (is_neg_adv_abort(req->status)) {
+ m_free(m);
+ return (0);
+ }
+
+ printf("aborting tid=%d\n", toep->tp_tid);
+
+ if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
+ cxgb_remove_tid(cdev, toep, toep->tp_tid);
+ toep->tp_flags |= TP_ABORT_REQ_RCVD;
+ printf("sending abort rpl\n");
+
+ send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
+ printf("sent\n");
+ if (toep->tp_l2t)
+ l2t_release(L2DATA(cdev), toep->tp_l2t);
+
+ /*
+ * Unhook
+ */
+ toep->tp_tp->t_toe = NULL;
+ toep->tp_tp->t_flags &= ~TF_TOE;
+ toep->tp_tp = NULL;
+ /*
+ * XXX need to call syncache_chkrst - but we don't
+ * have a way of doing that yet
+ */
+ toepcb_release(toep);
+ printf("abort for unestablished connection :-(\n");
+ return (0);
+ }
+ if (toep->tp_tp == NULL) {
+ printf("disconnected toepcb\n");
+ /* should be freed momentarily */
+ return (0);
+ }
+
+ so = toeptoso(toep);
+ inp = sotoinpcb(so);
+
+ VALIDATE_SOCK(so);
+ toepcb_hold(toep);
+ INP_INFO_WLOCK(&tcbinfo);
+ process_abort_req(so, m, TOE_DEV(so));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ toepcb_release(toep);
+ return (0);
+}
+#ifdef notyet
+static void
+pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
+{
+ struct toedev *tdev = TOE_DEV(parent);
+
+ do_abort_syn_rcv(child, parent);
+ if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
+ struct cpl_pass_accept_rpl *rpl = cplhdr(m);
+
+ rpl->opt0h = htonl(F_TCAM_BYPASS);
+ rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+ } else
+ m_free(m);
+}
+#endif
+static void
+handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
+{
+ UNIMPLEMENTED();
+
+#ifdef notyet
+ struct t3cdev *cdev;
+ struct socket *parent;
+ struct socket *oreq;
+ struct t3c_tid_entry *t3c_stid;
+ struct tid_info *t;
+ struct tcpcb *otp, *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ /*
+ * If the connection is being aborted due to the parent listening
+ * socket going away there's nothing to do, the ABORT_REQ will close
+ * the connection.
+ */
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ m_free(m);
+ return;
+ }
+
+ oreq = so->so_incomp;
+ otp = sototcpcb(oreq);
+
+ cdev = T3C_DEV(so);
+ t = &(T3C_DATA(cdev))->tid_maps;
+ t3c_stid = lookup_stid(t, otp->ts_recent);
+ parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
+
+ SOCK_LOCK(parent);
+ pass_open_abort(so, parent, m);
+ SOCK_UNLOCK(parent);
+#endif
+}
+
+/*
+ * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
+ * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
+ * connection.
+ */
+static void
+pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
+{
+
+#ifdef notyet
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+ BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
+#endif
+ handle_pass_open_arp_failure(m_get_socket(m), m);
+}
+
+/*
+ * Populate a reject CPL_PASS_ACCEPT_RPL WR.
+ */
+static void
+mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
+{
+ struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
+ struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
+ unsigned int tid = GET_TID(req);
+
+ m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
+ rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
+ rpl->opt0h = htonl(F_TCAM_BYPASS);
+ rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+ rpl->opt2 = 0;
+ rpl->rsvd = rpl->opt2; /* workaround for HW bug */
+}
+
+/*
+ * Send a deferred reject to an accept request.
+ */
+static void
+reject_pass_request(struct toedev *tdev, struct mbuf *m)
+{
+ struct mbuf *reply_mbuf;
+
+ reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
+ mk_pass_accept_rpl(reply_mbuf, m);
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
+ m_free(m);
+}
+
+static void
+handle_syncache_event(int event, void *arg)
+{
+ struct toepcb *toep = arg;
+
+ switch (event) {
+ case SC_ENTRY_PRESENT:
+ /*
+ * entry already exists - free toepcb
+ * and l2t
+ */
+ printf("syncache entry present\n");
+ toepcb_release(toep);
+ break;
+ case SC_DROP:
+ /*
+ * The syncache has given up on this entry
+ * either it timed out, or it was evicted
+ * we need to explicitly release the tid
+ */
+ printf("syncache entry dropped\n");
+ toepcb_release(toep);
+ break;
+ default:
+ log(LOG_ERR, "unknown syncache event %d\n", event);
+ break;
+ }
+}
+
+static void
+syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
+{
+ struct in_conninfo inc;
+ struct tcpopt to;
+ struct tcphdr th;
+ struct inpcb *inp;
+ int mss, wsf, sack, ts;
+
+ bzero(&to, sizeof(struct tcpopt));
+ inp = sotoinpcb(lso);
+
+ /*
+ * Fill out information for entering us into the syncache
+ */
+ inc.inc_fport = th.th_sport = req->peer_port;
+ inc.inc_lport = th.th_dport = req->local_port;
+ toep->tp_iss = th.th_seq = req->rcv_isn;
+ th.th_flags = TH_SYN;
+
+ toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
+
+ inc.inc_isipv6 = 0;
+ inc.inc_len = 0;
+ inc.inc_faddr.s_addr = req->peer_ip;
+ inc.inc_laddr.s_addr = req->local_ip;
+
+ DPRINTF("syncache add of %d:%d %d:%d\n",
+ ntohl(req->local_ip), ntohs(req->local_port),
+ ntohl(req->peer_ip), ntohs(req->peer_port));
+
+ mss = req->tcp_options.mss;
+ wsf = req->tcp_options.wsf;
+ ts = req->tcp_options.tstamp;
+ sack = req->tcp_options.sack;
+ to.to_mss = mss;
+ to.to_wscale = wsf;
+ to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
+}
+
+
+/*
+ * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
+ * lock held. Note that the sock here is a listening socket that is not owned
+ * by the TOE.
+ */
+static void
+process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
+ struct listen_ctx *lctx)
+{
+ int rt_flags;
+ struct l2t_entry *e;
+ struct iff_mac tim;
+ struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
+ struct cpl_pass_accept_rpl *rpl;
+ struct cpl_pass_accept_req *req = cplhdr(m);
+ unsigned int tid = GET_TID(req);
+ struct tom_data *d = TOM_DATA(tdev);
+ struct t3cdev *cdev = d->cdev;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *newtoep;
+ struct rtentry *dst;
+ struct sockaddr_in nam;
+ struct t3c_data *td = T3C_DATA(cdev);
+
+ reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+ if (__predict_false(reply_mbuf == NULL)) {
+ if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
+ t3_defer_reply(m, tdev, reject_pass_request);
+ else {
+ cxgb_queue_tid_release(cdev, tid);
+ m_free(m);
+ }
+ DPRINTF("failed to get reply_mbuf\n");
+
+ goto out;
+ }
+
+ if (tp->t_state != TCPS_LISTEN) {
+ DPRINTF("socket not in listen state\n");
+
+ goto reject;
+ }
+
+ tim.mac_addr = req->dst_mac;
+ tim.vlan_tag = ntohs(req->vlan_tag);
+ if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
+ DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
+ goto reject;
+ }
+
+#ifdef notyet
+ /*
+ * XXX do route lookup to confirm that we're still listening on this
+ * address
+ */
+ if (ip_route_input(skb, req->local_ip, req->peer_ip,
+ G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
+ goto reject;
+ rt_flags = ((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
+ dst_release(skb->dst); // done with the input route, release it
+ skb->dst = NULL;
+
+ if ((rt_flags & RTF_LOCAL) == 0)
+ goto reject;
+#endif
+ /*
+ * XXX
+ */
+ rt_flags = RTF_LOCAL;
+ if ((rt_flags & RTF_LOCAL) == 0)
+ goto reject;
+
+ /*
+ * Calculate values and add to syncache
+ */
+
+ newtoep = toepcb_alloc();
+ if (newtoep == NULL)
+ goto reject;
+
+ bzero(&nam, sizeof(struct sockaddr_in));
+
+ nam.sin_len = sizeof(struct sockaddr_in);
+ nam.sin_family = AF_INET;
+ nam.sin_addr.s_addr = req->peer_ip;
+ dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
+
+ if (dst == NULL) {
+ printf("failed to find route\n");
+ goto reject;
+ }
+ e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
+ (struct sockaddr *)&nam);
+ if (e == NULL) {
+ DPRINTF("failed to get l2t\n");
+ goto reject;
+ }
+ /*
+ * Point to our listen socket until accept
+ */
+ newtoep->tp_tp = tp;
+ newtoep->tp_flags = TP_SYN_RCVD;
+ newtoep->tp_tid = tid;
+ newtoep->tp_toedev = tdev;
+
+ printf("inserting tid=%d\n", tid);
+ cxgb_insert_tid(cdev, d->client, newtoep, tid);
+ SOCK_LOCK(so);
+ LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
+ SOCK_UNLOCK(so);
+
+
+ if (lctx->ulp_mode) {
+ ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (!ddp_mbuf)
+ newtoep->tp_ulp_mode = 0;
+ else
+ newtoep->tp_ulp_mode = lctx->ulp_mode;
+ }
+
+ set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
+
+ DPRINTF("adding request to syn cache\n");
+
+ /*
+ * XXX workaround for lack of syncache drop
+ */
+ toepcb_hold(newtoep);
+ syncache_add_accept_req(req, so, newtoep);
+
+ rpl = cplhdr(reply_mbuf);
+ reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ rpl->wr.wr_lo = 0;
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
+ rpl->opt2 = htonl(calc_opt2(so, tdev));
+ rpl->rsvd = rpl->opt2; /* workaround for HW bug */
+ rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
+
+ DPRINTF("accept smt_idx=%d\n", e->smt_idx);
+
+ rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
+ V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
+ rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
+ CPL_PASS_OPEN_ACCEPT);
+
+ DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
+
+ m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
+
+#ifdef DEBUG_PRINT
+ {
+ int i;
+
+ DPRINTF("rpl:\n");
+ uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
+
+ for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
+ DPRINTF("[%d] %08x\n", i, rplbuf[i]);
+ }
+#endif
+
+
+ l2t_send(cdev, reply_mbuf, e);
+ m_free(m);
+#ifdef notyet
+ /*
+ * XXX this call path has to be converted to not depend on sockets
+ */
+ if (newtoep->tp_ulp_mode)
+ __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_MASK,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_VAL, 1);
+
+#endif
+ return;
+reject:
+ if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
+ mk_pass_accept_rpl(reply_mbuf, m);
+ else
+ mk_tid_release(reply_mbuf, NULL, tid);
+ cxgb_ofld_send(cdev, reply_mbuf);
+ m_free(m);
+out:
+#if 0
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+#else
+ return;
+#endif
+}
+
+/*
+ * Handle a CPL_PASS_ACCEPT_REQ message.
+ */
+static int
+do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+ struct socket *lso = listen_ctx->lso;
+ struct tom_data *d = listen_ctx->tom_data;
+
+#if VALIDATE_TID
+ struct cpl_pass_accept_req *req = cplhdr(m);
+ unsigned int tid = GET_TID(req);
+ struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
+
+ if (unlikely(!lsk)) {
+ printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
+ cdev->name,
+ (unsigned long)((union listen_entry *)ctx -
+ t->stid_tab));
+ return CPL_RET_BUF_DONE;
+ }
+ if (unlikely(tid >= t->ntids)) {
+ printk(KERN_ERR "%s: passive open TID %u too large\n",
+ cdev->name, tid);
+ return CPL_RET_BUF_DONE;
+ }
+ /*
+ * For T3A the current user of the TID may have closed but its last
+ * message(s) may have been backlogged so the TID appears to be still
+ * in use. Just take the TID away, the connection can close at its
+ * own leisure. For T3B this situation is a bug.
+ */
+ if (!valid_new_tid(t, tid) &&
+ cdev->type != T3A) {
+ printk(KERN_ERR "%s: passive open uses existing TID %u\n",
+ cdev->name, tid);
+ return CPL_RET_BUF_DONE;
+ }
+#endif
+
+ process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
+ return (0);
+}
+
+/*
+ * Called when a connection is established to translate the TCP options
+ * reported by HW to the host stack's native format.
+ */
+static void
+assign_rxopt(struct socket *so, unsigned int opt)
+{
+ const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
+ tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
+ tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
+ tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
+ if (tp->t_flags & TF_RCVD_SCALE)
+ tp->rcv_scale = 0;
+}
+
+/*
+ * Completes some final bits of initialization for just established connections
+ * and changes their state to TCP_ESTABLISHED.
+ *
+ * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
+ */
+static void
+make_established(struct socket *so, u32 snd_isn, unsigned int opt)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
+ assign_rxopt(so, opt);
+ so->so_proto->pr_ctloutput = t3_ctloutput;
+
+#if 0
+ inet_sk(sk)->id = tp->write_seq ^ jiffies;
+#endif
+
+
+ /*
+ * XXX not clear what rcv_wup maps to
+ */
+ /*
+ * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
+ * pass through opt0.
+ */
+ if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
+ toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
+
+ dump_toepcb(toep);
+
+#ifdef notyet
+/*
+ * no clean interface for marking ARP up to date
+ */
+ dst_confirm(sk->sk_dst_cache);
+#endif
+ tp->t_state = TCPS_ESTABLISHED;
+}
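+
+/*
+ * Example of the Rx-credit adjustment above, with hypothetical numbers:
+ * if the chosen receive window is 256KB but opt0 can only advertise
+ * (M_RCV_BUFSIZ << 10) bytes, tp_rcv_wup is backed up by the difference
+ * so that the first RX_DATA_ACK returns the remaining window as Rx
+ * credits.
+ */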
+
+static int
+syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
+{
+
+ struct in_conninfo inc;
+ struct tcpopt to;
+ struct tcphdr th;
+ int mss, wsf, sack, ts;
+ struct mbuf *m = NULL;
+ const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
+ unsigned int opt;
+
+#ifdef MAC
+#error "no MAC support"
+#endif
+
+ opt = ntohs(req->tcp_opt);
+
+ bzero(&to, sizeof(struct tcpopt));
+
+ /*
+ * Fill out information for entering us into the syncache
+ */
+ inc.inc_fport = th.th_sport = req->peer_port;
+ inc.inc_lport = th.th_dport = req->local_port;
+ th.th_seq = req->rcv_isn;
+ th.th_flags = TH_ACK;
+
+ inc.inc_isipv6 = 0;
+ inc.inc_len = 0;
+ inc.inc_faddr.s_addr = req->peer_ip;
+ inc.inc_laddr.s_addr = req->local_ip;
+
+ mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
+ wsf = G_TCPOPT_WSCALE_OK(opt);
+ ts = G_TCPOPT_TSTAMP(opt);
+ sack = G_TCPOPT_SACK(opt);
+
+ to.to_mss = mss;
+ to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
+ to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
+
+ DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
+ ntohl(req->local_ip), ntohs(req->local_port),
+ ntohl(req->peer_ip), ntohs(req->peer_port),
+ mss, wsf, ts, sack);
+ return (syncache_expand(&inc, &to, &th, so, m));
+}
+
+
+/*
+ * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
+ * if we are in TCP_SYN_RECV due to crossed SYNs
+ */
+static int
+do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_pass_establish *req = cplhdr(m);
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct tcpcb *tp;
+ struct socket *so, *lso;
+ struct t3c_data *td = T3C_DATA(cdev);
+ struct toedev *tdev;
+
+ /* Complete socket initialization now that we have the SND_ISN. */
+
+ so = lso = toeptoso(toep);
+ tdev = toep->tp_toedev;
+
+ SOCK_LOCK(so);
+ LIST_REMOVE(toep, synq_entry);
+ SOCK_UNLOCK(so);
+
+ INP_INFO_WLOCK(&tcbinfo);
+ if (!syncache_expand_establish_req(req, &so, toep)) {
+ /*
+ * No entry
+ */
+ UNIMPLEMENTED();
+ }
+ if (so == NULL) {
+ /*
+ * Couldn't create the socket
+ */
+ UNIMPLEMENTED();
+ }
+
+ /*
+ * XXX workaround for lack of syncache drop
+ */
+ toepcb_release(toep);
+
+ tp = sototcpcb(so);
+ INP_LOCK(tp->t_inpcb);
+#ifdef notyet
+ so->so_snd.sb_flags |= SB_TOE;
+ so->so_rcv.sb_flags |= SB_TOE;
+#endif
+ toep->tp_tp = tp;
+ toep->tp_flags = 0;
+ tp->t_toe = toep;
+ reset_wr_list(toep);
+ tp->rcv_wnd = select_rcv_wnd(so);
+ DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
+ install_offload_ops(so);
+
+ toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
+ toep->tp_wr_unacked = 0;
+ toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
+ toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
+ tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
+ toep->tp_qset_idx = 0;
+ toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
+
+ /*
+ * XXX Cancel any keep alive timer
+ */
+
+ make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ INP_UNLOCK(tp->t_inpcb);
+ soisconnected(so);
+
+#ifdef notyet
+ /*
+ * XXX not sure how these checks map to us
+ */
+ if (unlikely(sk->sk_socket)) { // simultaneous opens only
+ sk->sk_state_change(sk);
+ sk_wake_async(so, 0, POLL_OUT);
+ }
+ /*
+ * The state for the new connection is now up to date.
+ * Next check if we should add the connection to the parent's
+ * accept queue. When the parent closes it resets connections
+ * on its SYN queue, so check if we are being reset. If so we
+ * don't need to do anything more, the coming ABORT_RPL will
+ * destroy this socket. Otherwise move the connection to the
+ * accept queue.
+ *
+ * Note that we reset the synq before closing the server so if
+ * we are not being reset the stid is still open.
+ */
+ if (unlikely(!tp->forward_skb_hint)) { // removed from synq
+ __kfree_skb(skb);
+ goto unlock;
+ }
+#endif
+ m_free(m);
+
+ return (0);
+}
+
+/*
+ * Fill in the right TID for CPL messages waiting in the out-of-order queue
+ * and send them to the TOE.
+ */
+static void
+fixup_and_send_ofo(struct socket *so)
+{
+ struct mbuf *m;
+ struct toedev *tdev = TOE_DEV(so);
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ unsigned int tid = toep->tp_tid;
+
+ printf("fixup_and_send_ofo\n");
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
+ /*
+ * A variety of messages can be waiting but the fields we'll
+ * be touching are common to all so any message type will do.
+ */
+ struct cpl_close_con_req *p = cplhdr(m);
+
+ p->wr.wr_lo = htonl(V_WR_TID(tid));
+ OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+ }
+}
+
+/*
+ * Updates socket state from an active establish CPL message. Runs with the
+ * socket lock held.
+ */
+static void
+socket_act_establish(struct socket *so, struct mbuf *m)
+{
+ struct cpl_act_establish *req = cplhdr(m);
+ u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ if (__predict_false(tp->t_state != TCPS_SYN_SENT))
+ log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
+ toep->tp_tid, tp->t_state);
+
+ tp->ts_recent_age = ticks;
+ tp->irs = tp->rcv_nxt = rcv_isn;
+ toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
+
+ make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+
+ /*
+ * Now that we finally have a TID send any CPL messages that we had to
+ * defer for lack of a TID.
+ */
+ if (mbufq_len(&toep->out_of_order_queue))
+ fixup_and_send_ofo(so);
+
+ if (__predict_false(so->so_state & SS_NOFDREF)) {
+#ifdef notyet
+ /*
+ * XXX not clear what should be done here
+ * appears to correspond to sorwakeup_locked
+ */
+ sk->sk_state_change(sk);
+ sk_wake_async(so, 0, POLL_OUT);
+#endif
+ }
+ m_free(m);
+#ifdef notyet
+/*
+ * XXX assume no write requests permitted while socket connection is
+ * incomplete
+ */
+ /*
+ * Currently the send queue must be empty at this point because the
+ * socket layer does not send anything before a connection is
+ * established. To be future proof though we handle the possibility
+ * that there are pending buffers to send (either TX_DATA or
+ * CLOSE_CON_REQ). First we need to adjust the sequence number of the
+ * buffers according to the just learned write_seq, and then we send
+ * them on their way.
+ */
+ fixup_pending_writeq_buffers(sk);
+ if (t3_push_frames(so, 1))
+ sk->sk_write_space(sk);
+#endif
+
+ soisconnected(so);
+ toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
+ tcpstat.tcps_connects++;
+
+}
+
+/*
+ * Process a CPL_ACT_ESTABLISH message.
+ */
+static int
+do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_act_establish *req = cplhdr(m);
+ unsigned int tid = GET_TID(req);
+ unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so;
+ struct toedev *tdev;
+ struct tom_data *d;
+
+ if (tp == NULL) {
+ free_atid(cdev, atid);
+ return (0);
+ }
+
+ so = toeptoso(toep);
+ tdev = TOE_DEV(so); /* blow up here if link was down */
+ d = TOM_DATA(tdev);
+
+ INP_LOCK(tp->t_inpcb);
+
+ /*
+ * It's OK if the TID is currently in use, the owning socket may have
+ * backlogged its last CPL message(s). Just take it away.
+ */
+ toep->tp_tid = tid;
+ toep->tp_tp = tp;
+ so_insert_tid(d, so, tid);
+ free_atid(cdev, atid);
+ toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
+
+ socket_act_establish(so, m);
+ INP_UNLOCK(tp->t_inpcb);
+ return (0);
+}
+
+/*
+ * Process an acknowledgment of WR completion. Advance snd_una and send the
+ * next batch of work requests from the write queue.
+ */
+static void
+wr_ack(struct toepcb *toep, struct mbuf *m)
+{
+ struct tcpcb *tp = toep->tp_tp;
+ struct cpl_wr_ack *hdr = cplhdr(m);
+ struct socket *so = toeptoso(toep);
+ unsigned int credits = ntohs(hdr->credits);
+ u32 snd_una = ntohl(hdr->snd_una);
+ int bytes = 0;
+
+ DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);
+
+ INP_LOCK(tp->t_inpcb);
+
+ toep->tp_wr_avail += credits;
+ if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
+ toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
+
+ while (credits) {
+ struct mbuf *p = peek_wr(toep);
+ DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ;
+
+ if (__predict_false(!p)) {
+ log(LOG_ERR, "%u WR_ACK credits for TID %u with "
+ "nothing pending, state %u\n",
+ credits, toep->tp_tid, tp->t_state);
+ break;
+ }
+ if (__predict_false(credits < p->m_pkthdr.csum_data)) {
+#if DEBUG_WR > 1
+ struct tx_data_wr *w = cplhdr(p);
+#ifdef notyet
+ log(LOG_ERR,
+ "TID %u got %u WR credits, need %u, len %u, "
+ "main body %u, frags %u, seq # %u, ACK una %u,"
+ " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
+ toep->tp_tid, credits, p->csum, p->len,
+ p->len - p->data_len, skb_shinfo(p)->nr_frags,
+ ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
+ WR_AVAIL(tp), count_pending_wrs(tp) - credits);
+#endif
+#endif
+ p->m_pkthdr.csum_data -= credits;
+ break;
+ } else {
+ dequeue_wr(toep);
+ credits -= p->m_pkthdr.csum_data;
+ bytes += p->m_pkthdr.len;
+ DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
+
+ m_free(p);
+ }
+ }
+
+#if DEBUG_WR
+ check_wr_invariants(tp);
+#endif
+
+ if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
+#if VALIDATE_SEQ
+ struct tom_data *d = TOM_DATA(TOE_DEV(so));
+
+ log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
+ "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
+ toep->tp_tid, tp->snd_una);
+#endif
+ goto out_free;
+ }
+
+ if (tp->snd_una != snd_una) {
+ tp->snd_una = snd_una;
+ tp->ts_recent_age = ticks;
+#ifdef notyet
+ /*
+ * Keep ARP entry "minty fresh"
+ */
+ dst_confirm(sk->sk_dst_cache);
+#endif
+ if (tp->snd_una == tp->snd_nxt)
+ toep->tp_flags &= ~TP_TX_WAIT_IDLE;
+ }
+ if (bytes) {
+ DPRINTF("sbdrop(%d)\n", bytes);
+ SOCKBUF_LOCK(&so->so_snd);
+ sbdrop_locked(&so->so_snd, bytes);
+ sowwakeup_locked(so);
+ }
+
+ if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
+ t3_push_frames(so, 0);
+
+out_free:
+ INP_UNLOCK(tp->t_inpcb);
+ m_free(m);
+}
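+
+#if 0 /* illustrative sketch, not part of the driver */
+/*
+ * Model of the credit accounting in wr_ack() above, using a plain array
+ * in place of the toepcb write queue.  wr_credits[] holds the credits
+ * each pending WR consumed (m_pkthdr.csum_data in the real code); the
+ * return value is how many WRs a WR_ACK carrying "credits" retires.
+ * E.g. pending {3, 2} and credits 4: the first WR is retired, then
+ * 1 < 2 so the second WR's outstanding count drops to 1 and we stop.
+ */
+static int
+wr_ack_model(unsigned int *wr_credits, int nwrs, unsigned int credits)
+{
+ int done = 0;
+
+ while (credits && done < nwrs) {
+ if (credits < wr_credits[done]) {
+ wr_credits[done] -= credits; /* partial ack */
+ break;
+ }
+ credits -= wr_credits[done++]; /* WR fully acked */
+ }
+ return (done);
+}
+#endif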
+
+/*
+ * Handler for CPL_TX_DMA_ACK CPL messages.
+ */
+static int
+do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ DPRINTF("do_wr_ack\n");
+ dump_toepcb(toep);
+
+ VALIDATE_SOCK(so);
+
+ wr_ack(toep, m);
+ return (0);
+}
+
+
+/*
+ * Reset a connection that is on a listener's SYN queue or accept queue,
+ * i.e., one that has not had a struct socket associated with it.
+ * Must be called from process context.
+ *
+ * Modeled after code in inet_csk_listen_stop().
+ */
+static void
+t3_reset_listen_child(struct socket *child)
+{
+ struct tcpcb *tp = sototcpcb(child);
+
+ t3_send_reset(tp->t_toe);
+}
+
+/*
+ * Disconnect offloaded established but not yet accepted connections sitting
+ * on a server's accept_queue. We just send an ABORT_REQ at this point and
+ * finish off the disconnect later as we may need to wait for the ABORT_RPL.
+ */
+void
+t3_disconnect_acceptq(struct socket *listen_so)
+{
+ struct socket *so;
+ struct tcpcb *tp;
+
+ TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
+ tp = sototcpcb(so);
+
+ if (tp->t_flags & TF_TOE) {
+ INP_LOCK(tp->t_inpcb);
+ t3_reset_listen_child(so);
+ INP_UNLOCK(tp->t_inpcb);
+ }
+
+ }
+}
+
+/*
+ * Reset offloaded connections sitting on a server's syn queue. As above
+ * we send ABORT_REQ and finish off when we get ABORT_RPL.
+ */
+
+void
+t3_reset_synq(struct listen_ctx *lctx)
+{
+ struct toepcb *toep;
+
+ SOCK_LOCK(lctx->lso);
+ while (!LIST_EMPTY(&lctx->synq_head)) {
+ toep = LIST_FIRST(&lctx->synq_head);
+ LIST_REMOVE(toep, synq_entry);
+ toep->tp_tp = NULL;
+ t3_send_reset(toep);
+ cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
+ toepcb_release(toep);
+ }
+ SOCK_UNLOCK(lctx->lso);
+}
+
+void
+t3_init_wr_tab(unsigned int wr_len)
+{
+ int i;
+
+ if (mbuf_wrs[1]) /* already initialized */
+ return;
+
+ for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
+ int sgl_len = (3 * i) / 2 + (i & 1);
+
+ sgl_len += 3;
+ mbuf_wrs[i] = sgl_len <= wr_len ?
+ 1 : 1 + (sgl_len - 2) / (wr_len - 1);
+ }
+
+ wrlen = wr_len * 8;
+}
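+
+#if 0 /* illustrative sketch, not part of the driver */
+/*
+ * Worked example of the sizing math above, for a hypothetical wr_len of
+ * 9 flits: an i-fragment mbuf needs sgl_len = (3 * i) / 2 + (i & 1) + 3
+ * flits, so i = 4 gives 9 flits (a single WR) while i = 8 gives 15
+ * flits and therefore 1 + (15 - 2) / (9 - 1) = 2 WRs.
+ */
+static unsigned int
+wrs_for_frags(unsigned int nfrags, unsigned int wr_len)
+{
+ unsigned int sgl_len = (3 * nfrags) / 2 + (nfrags & 1) + 3;
+
+ return (sgl_len <= wr_len ? 1 : 1 + (sgl_len - 2) / (wr_len - 1));
+}
+#endif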
+
+int
+t3_init_cpl_io(void)
+{
+#ifdef notyet
+ tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
+ if (!tcphdr_skb) {
+ log(LOG_ERR,
+ "Chelsio TCP offload: can't allocate sk_buff\n");
+ return -1;
+ }
+ skb_put(tcphdr_skb, sizeof(struct tcphdr));
+ tcphdr_skb->h.raw = tcphdr_skb->data;
+ memset(tcphdr_skb->data, 0, tcphdr_skb->len);
+#endif
+
+
+ t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
+ t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
+ t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
+ t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
+ t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
+ t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
+ t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
+ t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
+ t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
+ t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
+ t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
+ t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
+#ifdef notyet
+ t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
+ t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
+ t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
+#endif
+ return (0);
+}
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
new file mode 100644
index 0000000..8cb42e1
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -0,0 +1,560 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+
+#include <machine/bus.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_ofld.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+
+#include <dev/cxgb/sys/mvec.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
+ struct uio *uio, struct mbuf *top, struct mbuf *control,
+ int flags, struct thread *td);
+
+static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
+ int *flagsp);
+
+#ifdef notyet
+#define VM_HOLD_WRITEABLE 0x1
+static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
+ int *count, int flags);
+#endif
+static void vm_fault_unhold_pages(vm_page_t *m, int count);
+
+
+
+#define TMP_IOV_MAX 16
+
+void
+t3_init_socket_ops(void)
+{
+ struct protosw *prp;
+
+ prp = pffindtype(AF_INET, SOCK_STREAM);
+ pru_sosend = prp->pr_usrreqs->pru_sosend;
+ pru_soreceive = prp->pr_usrreqs->pru_soreceive;
+}
+
+
+struct cxgb_dma_info {
+ size_t cdi_mapped;
+ int cdi_nsegs;
+ bus_dma_segment_t *cdi_segs;
+
+};
+
+static void
+cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
+ bus_size_t mapsize, int error)
+{
+ struct cxgb_dma_info *cdi = arg;
+
+ cdi->cdi_mapped = mapsize;
+ cdi->cdi_nsegs = nsegs;
+ cdi->cdi_segs = segs;
+}
+
+static void
+iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
+{
+ struct iovec *iovtmp;
+ int iovcnttmp;
+ caddr_t ptmp;
+
+ if (count > 0) {
+ iovtmp = *iov;
+ iovcnttmp = *iovcnt;
+ while (count > 0) {
+ if (count < iovtmp->iov_len) {
+ ptmp = iovtmp->iov_base;
+ ptmp += count;
+ iovtmp->iov_base = ptmp;
+ iovtmp->iov_len -= count;
+ break;
+ } else
+ count -= iovtmp->iov_len;
+ iovtmp++;
+ iovcnttmp--;
+ }
+ *iov = iovtmp;
+ *iovcnt = iovcnttmp;
+ } else if (count < 0) {
+ iovtmp = &(*iov)[*iovcnt - 1];
+ iovcnttmp = *iovcnt;
+ while (count < 0) {
+ if (-count < iovtmp->iov_len) {
+ iovtmp->iov_len += count;
+ break;
+ } else
+ count += iovtmp->iov_len;
+ iovtmp--;
+ iovcnttmp--;
+ }
+ *iovcnt = iovcnttmp;
+ }
+}
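+
+#if 0 /* illustrative sketch, not part of the driver */
+/*
+ * Example of how iov_adj() behaves: a positive count consumes bytes from
+ * the front of the iovec array (advancing *iov and shrinking *iovcnt as
+ * entries are exhausted), while a negative count trims bytes off the
+ * tail.
+ */
+static void
+iov_adj_example(struct iovec *iov2) /* two 512-byte iovecs */
+{
+ struct iovec *iov = iov2;
+ int iovcnt = 2;
+
+ iov_adj(&iov, &iovcnt, 600); /* iov -> iov2[1], 424 bytes, cnt 1 */
+ iov_adj(&iov, &iovcnt, -100); /* iov2[1].iov_len becomes 324 */
+}
+#endif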
+
+
+static void
+cxgb_zero_copy_free(void *cl, void *arg)
+{
+}
+
+static int
+cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
+{
+
+ return (EINVAL);
+}
+
+static void
+cxgb_wait_dma_completion(struct toepcb *tp)
+{
+
+}
+
+static int
+cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
+{
+ int i, seg_count, err, type;
+ struct mbuf *m0;
+ struct cxgb_dma_info cdi;
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *mi;
+ bus_dma_segment_t *segs;
+
+ err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
+ cxgb_dma_callback, &cdi, 0);
+
+ if (err)
+ return (err);
+ seg_count = cdi.cdi_nsegs;
+ if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
+ bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
+ return (ENOMEM);
+ }
+ segs = cdi.cdi_segs;
+ m0->m_type = type;
+ m0->m_flags = (M_EXT|M_NOFREE);
+ m0->m_ext.ext_type = EXT_EXTREF;
+ m0->m_ext.ext_free = cxgb_zero_copy_free;
+ m0->m_ext.ext_args = NULL;
+
+ mv = mtomv(m0);
+ mv->mv_count = seg_count;
+ mv->mv_first = 0;
+ for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
+ mi_collapse_sge(mi, segs);
+
+ *m = m0;
+
+ if (cdi.cdi_mapped < uio->uio_resid) {
+ uio->uio_resid -= cdi.cdi_mapped;
+ } else
+ uio->uio_resid = 0;
+
+ return (0);
+}
+
+static int
+t3_sosend(struct socket *so, struct uio *uio)
+{
+ int rv, count, hold_resid, sent, iovcnt;
+ struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m;
+ struct uio uiotmp;
+
+ /*
+ * Events requiring iteration:
+ * - number of pages exceeds max hold pages for process or system
+ * - number of pages exceeds maximum sg entries for a single WR
+ *
+ * We're limited to holding 128 pages at once - and we're limited to
+ * 34 SG entries per work request, but each SG entry can be any number
+ * of contiguous pages
+ *
+ */
+
+ uiotmp = *uio;
+ iovcnt = uio->uio_iovcnt;
+ iov = uio->uio_iov;
+ sent = 0;
+sendmore:
+ /*
+ * Make sure we don't exceed the socket buffer
+ */
+ count = min(toep->tp_page_count, (sbspace(&so->so_snd) + 2*PAGE_SIZE) >> PAGE_SHIFT);
+ rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
+ hold_resid = uiotmp.uio_resid;
+ if (rv)
+ return (rv);
+
+ /*
+ * Bump past sent and shave off the unheld amount
+ */
+ if (hold_resid > 0) {
+ iovtmpp = iovtmp;
+ memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+ if (sent)
+ iov_adj(&iovtmpp, &iovcnt, sent);
+ iov_adj(&iovtmpp, &iovcnt, -hold_resid);
+ uiotmp.uio_iov = iovtmpp;
+ uiotmp.uio_iovcnt = iovcnt;
+
+ }
+ uiotmp.uio_resid = uio->uio_resid - hold_resid;
+
+ /*
+ * Push off all held pages
+ *
+ */
+ while (uiotmp.uio_resid > 0) {
+ rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
+ if (rv) {
+ vm_fault_unhold_pages(toep->tp_pages, count);
+ return (rv);
+ }
+ uio->uio_resid -= m->m_pkthdr.len;
+ sent += m->m_pkthdr.len;
+ sbappend_locked(&so->so_snd, m);
+ t3_push_frames(so, TRUE);
+ iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
+ }
+ /*
+ * Wait for pending I/O to be DMA'd to the card
+ *
+ */
+ cxgb_wait_dma_completion(toep);
+ vm_fault_unhold_pages(toep->tp_pages, count);
+ /*
+ * If there is more data to send adjust local copy of iov
+ * to point to teh start
+ */
+ if (hold_resid) {
+ iovtmpp = iovtmp;
+ memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+ iov_adj(&iovtmpp, &iovcnt, sent);
+ uiotmp = *uio;
+ uiotmp.uio_iov = iovtmpp;
+ uiotmp.uio_iovcnt = iovcnt;
+ goto sendmore;
+ }
+
+ return (0);
+}
+
+static int
+cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toedev *tdev;
+ int zcopy_thres, zcopy_enabled, rv;
+
+ /*
+ * In order to use DMA direct from userspace the following
+ * conditions must be met:
+ * - the connection is currently offloaded
+ * - ddp is enabled
+ * - the number of bytes to be transferred exceeds the threshold
+ * - the number of bytes currently in flight won't exceed the in-flight
+ * threshold XXX TODO
+ * - vm_fault_hold_user_pages succeeds
+ * - blocking socket XXX for now
+ *
+ */
+ if (tp->t_flags & TF_TOE) {
+ tdev = TOE_DEV(so);
+ zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
+ zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);
+
+ if ((uio->uio_resid > zcopy_thres) &&
+ (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0)
+ && zcopy_enabled) {
+ rv = t3_sosend(so, uio);
+ if (rv != EAGAIN)
+ return (rv);
+ }
+ }
+ return pru_sosend(so, addr, uio, top, control, flags, td);
+}
+
+
+static int
+t3_soreceive(struct socket *so, struct uio *uio)
+{
+#ifdef notyet
+ int i, rv, count, hold_resid, sent, iovcnt;
+ struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m;
+ struct uio uiotmp;
+
+ /*
+ * Events requiring iteration:
+ * - number of pages exceeds max hold pages for process or system
+ * - number of pages exceeds maximum sg entries for a single WR
+ *
+ * We're limited to holding 128 pages at once - and we're limited to
+ * 34 SG entries per work request, but each SG entry can be any number
+ * of contiguous pages
+ *
+ */
+
+ uiotmp = *uio;
+ iovcnt = uio->uio_iovcnt;
+ iov = uio->uio_iov;
+ sent = 0;
+ /* XXX incomplete */
+#endif
+ return (0);
+}
+
+static int
+cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct toedev *tdev;
+ int rv, zcopy_thres, zcopy_enabled;
+ struct tcpcb *tp = sototcpcb(so);
+
+ /*
+ * In order to use DMA direct from userspace the following
+ * conditions must be met:
+ * - the connection is currently offloaded
+ * - ddp is enabled
+ * - the number of bytes to be transferred exceeds the threshold
+ * - the number of bytes currently in flight won't exceed the in-flight
+ * threshold XXX TODO
+ * - vm_fault_hold_user_pages succeeds
+ * - blocking socket XXX for now
+ * - iovcnt is 1
+ *
+ */
+ if (tp->t_flags & TF_TOE) {
+ tdev = TOE_DEV(so);
+ zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
+ zcopy_enabled = TOM_TUNABLE(tdev, ddp);
+ if ((uio->uio_resid > zcopy_thres) &&
+ (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0)
+ && zcopy_enabled) {
+ rv = t3_soreceive(so, uio);
+ if (rv != EAGAIN)
+ return (rv);
+ }
+ }
+
+ return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
+}
+
+
+void
+t3_install_socket_ops(struct socket *so)
+{
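+ /*
+ * Note: pr_usrreqs is shared by all sockets of this protocol, so
+ * this replaces sosend/soreceive for every TCP socket, not just
+ * offloaded ones; the cxgb_* wrappers check TF_TOE and fall back
+ * to the saved pru_sosend/pru_soreceive for non-offloaded sockets.
+ */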
+ so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
+ so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
+}
+
+/*
+ * This routine takes a user address range and does the following:
+ * - validate that the user has access to those pages (flags indicates read or write) - if not fail
+ * - validate that count is enough to hold range number of pages - if not fail
+ * - fault in any non-resident pages
+ * - if the user is doing a read force a write fault for any COWed pages
+ * - if the user is doing a read mark all pages as dirty
+ * - hold all pages
+ * - return number of pages in count
+ */
+#ifdef notyet
+static int
+vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags)
+{
+
+ vm_offset_t start, va;
+ vm_paddr_t pa;
+ int pageslen, faults, rv;
+
+ struct thread *td;
+ vm_map_t map;
+ pmap_t pmap;
+ vm_page_t m, *pages;
+ vm_prot_t prot;
+
+ start = addr & ~PAGE_MASK;
+ pageslen = roundup2(addr + len, PAGE_SIZE);
+ if (*count < ((pageslen - start) >> PAGE_SHIFT))
+ return (EFBIG);
+
+ *count = (pageslen - start) >> PAGE_SHIFT;
+ /*
+ * Check that the virtual address range is legal.
+ * This check is somewhat bogus as on some architectures kernel
+ * and user do not share VA space - however, it appears that all
+ * FreeBSD architectures define VM_MAXUSER_ADDRESS.
+ */
+ if (addr + len > VM_MAXUSER_ADDRESS)
+ return (EFAULT);
+
+ td = curthread;
+ map = &td->td_proc->p_vmspace->vm_map;
+ pmap = &td->td_proc->p_vmspace->vm_pmap;
+ pages = mp;
+
+ prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
+ bzero(pages, sizeof(vm_page_t *) * (*count));
+retry:
+
+ /*
+ * First optimistically assume that all pages are resident (and R/W if
+ * for write); if so, just mark the pages as held (and dirty if for
+ * write) and return.
+ */
+ vm_page_lock_queues();
+ for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
+ /*
+ * Ensure that we only hold each page once
+ */
+ if (*pages == NULL) {
+ /*
+ * The page queue mutex is recursable so this is OK.
+ * It would be really nice if we had an unlocked version of this so
+ * we were only acquiring the pmap lock once as opposed to potentially
+ * many dozens of times.
+ */
+ m = pmap_extract_and_hold(pmap, va, prot);
+ if (m == NULL) {
+ faults++;
+ continue;
+ }
+ *pages = m;
+ if (flags & VM_HOLD_WRITEABLE)
+ vm_page_dirty(m);
+ }
+ }
+ vm_page_unlock_queues();
+
+ if (faults == 0)
+ return (0);
+ /*
+ * Pages either have insufficient permissions or are not present;
+ * trigger a fault where necessary.
+ */
+ for (va = start; va < pageslen; va += PAGE_SIZE) {
+ m = NULL;
+ pa = pmap_extract(pmap, va);
+ rv = 0;
+ if (pa)
+ m = PHYS_TO_VM_PAGE(pa);
+ if (flags & VM_HOLD_WRITEABLE) {
+ if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
+ rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
+ } else if (m == NULL)
+ rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
+ if (rv)
+ goto error;
+ }
+ goto retry;
+
+error:
+ vm_page_lock_queues();
+ for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
+ if (*pages)
+ vm_page_unhold(*pages);
+ vm_page_unlock_queues();
+ return (EFAULT);
+}
+#endif
+
+static void
+vm_fault_unhold_pages(vm_page_t *mp, int count)
+{
+
+ KASSERT(count >= 0, ("negative count %d", count));
+ vm_page_lock_queues();
+ while (count--) {
+ vm_page_unhold(*mp);
+ mp++;
+ }
+ vm_page_unlock_queues();
+}
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h
new file mode 100644
index 0000000..9077295
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h
@@ -0,0 +1,79 @@
+
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_DEFS_H_
+#define CXGB_DEFS_H_
+
+#define VALIDATE_TID 0
+
+#define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe))
+#define TOE_DEV(so) (TOEPCB((so))->tp_toedev)
+#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket)
+#define sototoep(so) (sototcpcb((so))->t_toe)
+
+struct listen_ctx;
+
+typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m);
+
+void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h);
+void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
+void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
+int t3_push_frames(struct socket *so, int req_completion);
+int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt,
+ struct sockaddr *nam);
+void t3_init_listen_cpl_handlers(void);
+int t3_init_cpl_io(void);
+void t3_init_wr_tab(unsigned int wr_len);
+uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail);
+void t3_cleanup_rbuf(struct tcpcb *tp);
+
+void t3_init_socket_ops(void);
+void t3_install_socket_ops(struct socket *so);
+
+
+void t3_disconnect_acceptq(struct socket *listen_so);
+void t3_reset_synq(struct listen_ctx *ctx);
+void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler);
+
+struct toepcb *toepcb_alloc(void);
+void toepcb_hold(struct toepcb *);
+void toepcb_release(struct toepcb *);
+void toepcb_init(struct toepcb *);
+
+void t3_set_rcv_coalesce_enable(struct socket *so, int on_off);
+void t3_set_keepalive(struct socket *so, int on_off);
+void t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag);
+void t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+ unsigned int len);
+int t3_get_tcb(struct socket *so);
+
+#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
new file mode 100644
index 0000000..e785790
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -0,0 +1,345 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+
+#include <netinet/tcp_ofld.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+
+static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid);
+static int listen_hash_del(struct tom_data *d, struct socket *so);
+
+/*
+ * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release
+ * the STID.
+ */
+static int
+do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_close_listserv_rpl *rpl = cplhdr(m);
+ unsigned int stid = GET_TID(rpl);
+
+ if (rpl->status != CPL_ERR_NONE)
+ log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for "
+ "STID %u\n", rpl->status, stid);
+ else {
+ struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+
+ cxgb_free_stid(cdev, stid);
+ free(listen_ctx, M_CXGB);
+ }
+
+ return (CPL_RET_BUF_DONE);
+}
+
+/*
+ * Process a CPL_PASS_OPEN_RPL message. Remove the socket from the listen hash
+ * table and free the STID if there was any error, otherwise nothing to do.
+ */
+static int
+do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_pass_open_rpl *rpl = cplhdr(m);
+
+ if (rpl->status != CPL_ERR_NONE) {
+ int stid = GET_TID(rpl);
+ struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+ struct tom_data *d = listen_ctx->tom_data;
+ struct socket *lso = listen_ctx->lso;
+
+#if VALIDATE_TID
+ if (!lso)
+ return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE);
+#endif
+ /*
+ * Note: It is safe to unconditionally call listen_hash_del()
+ * at this point without risking unhashing a reincarnation of
+ * an already closed socket (i.e., there is no listen, close,
+ * listen-again race in which the sock for the second listen is
+ * freed while we are still processing a message for the first)
+ * because we are still holding a reference on the socket. It is
+ * possible that the unhash
+ * will fail because the socket is already closed, but we can't
+ * unhash the wrong socket because it is impossible for the
+ * socket to which this message refers to have reincarnated.
+ */
+ listen_hash_del(d, lso);
+ cxgb_free_stid(cdev, stid);
+#ifdef notyet
+ /*
+ * XXX need to unreference the inpcb
+ * but we have no way of knowing that other TOMs aren't referencing it
+ */
+ sock_put(lso);
+#endif
+ free(listen_ctx, M_CXGB);
+ }
+ return CPL_RET_BUF_DONE;
+}
+
+void
+t3_init_listen_cpl_handlers(void)
+{
+ t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+ t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
+}
+
+static inline int
+listen_hashfn(const struct socket *so)
+{
+ return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1);
+}
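+
+/*
+ * Example, with a hypothetical socket pointer and assuming
+ * LISTEN_INFO_HASH_SIZE is 32: a listening socket at 0xffffff0023a45c00
+ * hashes to bucket (0xffffff0023a45c00 >> 10) & 31 == 23.  The low 10
+ * bits are shifted out because heap-allocated sockets tend to share
+ * their low address bits, so they carry little entropy.
+ */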
+
+/*
+ * Create and add a listen_info entry to the listen hash table. This and the
+ * listen hash table functions below cannot be called from softirqs.
+ */
+static struct listen_info *
+listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid)
+{
+ struct listen_info *p;
+
+ p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO);
+ if (p) {
+ int bucket = listen_hashfn(so);
+
+ p->so = so; /* just a key, no need to take a reference */
+ p->stid = stid;
+ mtx_lock(&d->listen_lock);
+ p->next = d->listen_hash_tab[bucket];
+ d->listen_hash_tab[bucket] = p;
+ mtx_unlock(&d->listen_lock);
+ }
+ return p;
+}
+
+#if 0
+/*
+ * Given a pointer to a listening socket return its server TID by consulting
+ * the socket->stid map. Returns -1 if the socket is not in the map.
+ */
+static int
+listen_hash_find(struct tom_data *d, struct socket *so)
+{
+ int stid = -1, bucket = listen_hashfn(so);
+ struct listen_info *p;
+
+ spin_lock(&d->listen_lock);
+ for (p = d->listen_hash_tab[bucket]; p; p = p->next)
+ if (p->sk == sk) {
+ stid = p->stid;
+ break;
+ }
+ spin_unlock(&d->listen_lock);
+ return stid;
+}
+#endif
+
+/*
+ * Delete the listen_info structure for a listening socket. Returns the server
+ * TID for the socket if it is present in the socket->stid map, or -1.
+ */
+static int
+listen_hash_del(struct tom_data *d, struct socket *so)
+{
+ int bucket, stid = -1;
+ struct listen_info *p, **prev;
+
+ bucket = listen_hashfn(so);
+ prev = &d->listen_hash_tab[bucket];
+
+ mtx_lock(&d->listen_lock);
+ for (p = *prev; p; prev = &p->next, p = p->next)
+ if (p->so == so) {
+ stid = p->stid;
+ *prev = p->next;
+ free(p, M_CXGB);
+ break;
+ }
+ mtx_unlock(&d->listen_lock);
+
+ return (stid);
+}
+
+/*
+ * Start a listening server by sending a passive open request to HW.
+ */
+void
+t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+{
+ int stid;
+ struct mbuf *m;
+ struct cpl_pass_open_req *req;
+ struct tom_data *d = TOM_DATA(dev);
+ struct inpcb *inp = sotoinpcb(so);
+ struct listen_ctx *ctx;
+
+ if (!TOM_TUNABLE(dev, activated))
+ return;
+
+ printf("start listen\n");
+
+ ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT);
+
+ if (!ctx)
+ return;
+
+ ctx->tom_data = d;
+ ctx->lso = so;
+ ctx->ulp_mode = 0; /* no DDP by default */
+ LIST_INIT(&ctx->synq_head);
+
+ stid = cxgb_alloc_stid(d->cdev, d->client, ctx);
+ if (stid < 0)
+ goto free_ctx;
+
+#ifdef notyet
+ /*
+ * XXX need to mark inpcb as referenced
+ */
+ sock_hold(sk);
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ goto free_stid;
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+
+ if (!listen_hash_add(d, so, stid))
+ goto free_all;
+
+ req = mtod(m, struct cpl_pass_open_req *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid));
+ req->local_port = inp->inp_lport;
+ memcpy(&req->local_ip, &inp->inp_laddr, 4);
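+ /* Zero peer port/address/netmask so this server matches any remote peer. */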
+ req->peer_port = 0;
+ req->peer_ip = 0;
+ req->peer_netmask = 0;
+ req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
+ req->opt0l = htonl(V_RCV_BUFSIZ(16));
+ req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
+
+ m_set_priority(m, CPL_PRIORITY_LISTEN);
+ cxgb_ofld_send(cdev, m);
+ return;
+
+free_all:
+ m_free(m);
+free_stid:
+ cxgb_free_stid(cdev, stid);
+#if 0
+ sock_put(sk);
+#endif
+free_ctx:
+ free(ctx, M_CXGB);
+}
+
+/*
+ * Stop a listening server by sending a close_listsvr request to HW.
+ * The server TID is freed when we get the reply.
+ */
+void
+t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+{
+ struct mbuf *m;
+ struct cpl_close_listserv_req *req;
+ struct listen_ctx *lctx;
+ int stid = listen_hash_del(TOM_DATA(dev), so);
+
+ if (stid < 0)
+ return;
+
+ lctx = cxgb_get_lctx(cdev, stid);
+ /*
+ * Do this early so embryonic connections are marked as being aborted
+ * while the stid is still open. This ensures pass_establish messages
+ * that arrive while we are closing the server will be able to locate
+ * the listening socket.
+ */
+ t3_reset_synq(lctx);
+
+ /* Send the close ASAP to stop further passive opens */
+ m = m_gethdr_nofail(sizeof(*req)); /* panics until a lowmem cache exists */
+
+ req = mtod(m, struct cpl_close_listserv_req *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid));
+ req->cpu_idx = 0;
+ m_set_priority(m, CPL_PRIORITY_LISTEN);
+ cxgb_ofld_send(cdev, m);
+
+ t3_disconnect_acceptq(so);
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
new file mode 100644
index 0000000..9fa42b5
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
@@ -0,0 +1,185 @@
+
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+
+#ifndef T3_DDP_H
+#define T3_DDP_H
+
+/* Should be 1 or 2 indicating single or double kernel buffers. */
+#define NUM_DDP_KBUF 2
+
+/* min receive window for a connection to be considered for DDP */
+#define MIN_DDP_RCV_WIN (48 << 10)
+
+/* amount of Rx window not available to DDP to avoid window exhaustion */
+#define DDP_RSVD_WIN (16 << 10)
+
+/* # of sentinel invalid page pods at the end of a group of valid page pods */
+#define NUM_SENTINEL_PPODS 0
+
+/* # of pages a pagepod can hold without needing another pagepod */
+#define PPOD_PAGES 4
+
+/* page pods are allocated in groups of this size (must be power of 2) */
+#define PPOD_CLUSTER_SIZE 16
+
+/* for each TID we reserve this many page pods up front */
+#define RSVD_PPODS_PER_TID 1
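+
+/*
+ * Rough sizing sketch (an illustration, not a hardware rule): a gather
+ * list of npages pages needs about howmany(npages, PPOD_PAGES) page pods
+ * plus NUM_SENTINEL_PPODS sentinels, carved from PPOD_CLUSTER_SIZE groups.
+ */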
+
+struct pagepod {
+ uint32_t pp_vld_tid;
+ uint32_t pp_pgsz_tag_color;
+ uint32_t pp_max_offset;
+ uint32_t pp_page_offset;
+ uint64_t pp_rsvd;
+ uint64_t pp_addr[5];
+};
+
+#define PPOD_SIZE sizeof(struct pagepod)
+
+#define S_PPOD_TID 0
+#define M_PPOD_TID 0xFFFFFF
+#define V_PPOD_TID(x) ((x) << S_PPOD_TID)
+
+#define S_PPOD_VALID 24
+#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID)
+#define F_PPOD_VALID V_PPOD_VALID(1U)
+
+#define S_PPOD_COLOR 0
+#define M_PPOD_COLOR 0x3F
+#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR)
+
+#define S_PPOD_TAG 6
+#define M_PPOD_TAG 0xFFFFFF
+#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
+
+#define S_PPOD_PGSZ 30
+#define M_PPOD_PGSZ 0x3
+#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
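+
+/*
+ * Illustrative packing of the fields above (tid, tag, and color are
+ * hypothetical values; the byte order shown is an assumption):
+ *
+ *	pp->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
+ *	pp->pp_pgsz_tag_color = htonl(V_PPOD_PGSZ(0) | V_PPOD_TAG(tag) |
+ *	    V_PPOD_COLOR(color));
+ */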
+
+struct pci_dev;
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <machine/bus.h>
+
+/* DDP gather lists can specify an offset only for the first page. */
+struct ddp_gather_list {
+ unsigned int dgl_length;
+ unsigned int dgl_offset;
+ unsigned int dgl_nelem;
+ vm_page_t *dgl_pages;
+ bus_addr_t dgl_phys_addr[0];
+};
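+/* dgl_phys_addr[] is a C89-style variable-length tail, sized at allocation. */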
+
+struct ddp_buf_state {
+ unsigned int cur_offset; /* offset of latest DDP notification */
+ unsigned int flags;
+ struct ddp_gather_list *gl;
+};
+
+struct ddp_state {
+ struct pci_dev *pdev;
+ struct ddp_buf_state buf_state[2]; /* per buffer state */
+ int cur_buf;
+ unsigned short kbuf_noinval;
+ unsigned short kbuf_idx; /* which HW buffer is used for kbuf */
+ struct ddp_gather_list *ubuf;
+ unsigned int ubuf_nppods; /* # of page pods for buffer 1 */
+ unsigned int ubuf_tag;
+ unsigned int ubuf_ddp_ready;
+ int get_tcb_count;
+ unsigned int kbuf_posted;
+ int cancel_ubuf;
+ unsigned int kbuf_nppods[NUM_DDP_KBUF];
+ unsigned int kbuf_tag[NUM_DDP_KBUF];
+ struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */
+};
+
+/* buf_state flags */
+enum {
+ DDP_BF_NOINVAL = 1 << 0, /* buffer is set to NO_INVALIDATE */
+ DDP_BF_NOCOPY = 1 << 1, /* DDP to final dest, no copy needed */
+ DDP_BF_NOFLIP = 1 << 2, /* buffer flips after GET_TCB_RPL */
+ DDP_BF_PSH = 1 << 3, /* set in the mbuf's flags if a DDP was
+ completed by a segment with the
+ PSH flag set */
+};
+
+#ifdef notyet
+/*
+ * Returns 1 if a UBUF DMA buffer might be active.
+ */
+static inline int
+t3_ddp_ubuf_pending(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct ddp_state *p = DDP_STATE(tp);
+
+ /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP,
+ * but DDP_STATE() is only valid if the connection actually enabled
+ * DDP.
+ */
+ if (!p)
+ return 0;
+
+ return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) ||
+ (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY));
+}
+#endif
+
+int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
+ unsigned int nppods, unsigned int tag, unsigned int maxoff,
+ unsigned int pg_off, unsigned int color);
+int t3_alloc_ppods(struct tom_data *td, unsigned int n);
+void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
+void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl);
+int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len,
+ struct ddp_gather_list **newgl,
+ const struct ddp_gather_list *gl);
+int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to,
+ int len);
+/* void t3_repost_kbuf(struct socket *so, int modulate, int activate); */
+void t3_post_kbuf(struct socket *so, int modulate);
+int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+ int rcv_flags, int modulate, int post_kbuf);
+void t3_cancel_ubuf(struct socket *so);
+int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+ int rcv_flags, int modulate, int post_kbuf);
+int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
+void t3_cleanup_ddp(struct socket *so);
+void t3_release_ddp_resources(struct toepcb *toep);
+void t3_cancel_ddpbuf(struct socket *so, unsigned int bufidx);
+void t3_overlay_ddpbuf(struct socket *so, unsigned int bufidx, unsigned int tag0,
+ unsigned int tag1, unsigned int len);
+void t3_setup_ddpbufs(struct socket *so, unsigned int len0, unsigned int offset0,
+ unsigned int len1, unsigned int offset1,
+ uint64_t ddp_flags, uint64_t flag_mask, int modulate);
+#endif /* T3_DDP_H */
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
new file mode 100644
index 0000000..a078bee
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
@@ -0,0 +1,112 @@
+
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef CXGB_TOEPCB_H_
+#define CXGB_TOEPCB_H_
+#include <sys/bus.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+struct toepcb {
+ struct toedev *tp_toedev;
+ struct l2t_entry *tp_l2t;
+ pr_ctloutput_t *tp_ctloutput;
+ unsigned int tp_tid;
+ int tp_wr_max;
+ int tp_wr_avail;
+ int tp_wr_unacked;
+ int tp_delack_mode;
+ int tp_mtu_idx;
+ int tp_ulp_mode;
+ int tp_qset_idx;
+ int tp_mss_clamp;
+ int tp_qset;
+ int tp_flags;
+ int tp_enqueued_bytes;
+ int tp_page_count;
+ int tp_state;
+
+ tcp_seq tp_iss;
+ tcp_seq tp_delack_seq;
+ tcp_seq tp_rcv_wup;
+ tcp_seq tp_copied_seq;
+ uint64_t tp_write_seq;
+
+ volatile int tp_refcount;
+ vm_page_t *tp_pages;
+
+ struct tcpcb *tp_tp;
+ struct mbuf *tp_m_last;
+ bus_dma_tag_t tp_tx_dmat;
+ bus_dmamap_t tp_dmamap;
+
+ LIST_ENTRY(toepcb) synq_entry;
+ struct mbuf_head wr_list;
+ struct mbuf_head out_of_order_queue;
+ struct ddp_state tp_ddp_state;
+};
+
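+/*
+ * The helpers below wrap the generic mbuf queue primitives; wr_list holds
+ * the mbufs of work requests that the hardware has not yet acknowledged.
+ */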
+static inline void
+reset_wr_list(struct toepcb *toep)
+{
+
+ mbufq_init(&toep->wr_list);
+}
+
+static inline void
+purge_wr_queue(struct toepcb *toep)
+{
+ struct mbuf *m;
+
+ while ((m = mbufq_dequeue(&toep->wr_list)) != NULL)
+ m_freem(m);
+}
+
+static inline void
+enqueue_wr(struct toepcb *toep, struct mbuf *m)
+{
+
+ mbufq_tail(&toep->wr_list, m);
+}
+
+static inline struct mbuf *
+peek_wr(struct toepcb *toep)
+{
+
+ return (mbufq_peek(&toep->wr_list));
+}
+
+static inline struct mbuf *
+dequeue_wr(struct toepcb *toep)
+{
+
+ return (mbufq_dequeue(&toep->wr_list));
+}
+
+#endif
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
new file mode 100644
index 0000000..2dc6150
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
@@ -0,0 +1,500 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/eventhandler.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/taskqueue.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_fsm.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/cxgb_include.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+static int activated = 1;
+TUNABLE_INT("hw.t3toe.activated", &activated);
+SYSCTL_NODE(_hw, OID_AUTO, t3toe, CTLFLAG_RD, 0, "T3 toe driver parameters");
+SYSCTL_UINT(_hw_t3toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0,
+ "enable TOE at init time");
+
+static TAILQ_HEAD(, tom_data) cxgb_list;
+static struct mtx cxgb_list_lock;
+
+static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry);
+/*
+ * Handlers for each CPL opcode
+ */
+static cxgb_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS];
+
+static eventhandler_tag listen_tag;
+
+static struct offload_id t3_toe_id_tab[] = {
+ { TOE_ID_CHELSIO_T3, 0 },
+ { TOE_ID_CHELSIO_T3B, 0 },
+ { 0 }
+};
+
+static struct tom_info t3_tom_info = {
+ .ti_attach = t3_toe_attach,
+ .ti_id_table = t3_toe_id_tab,
+ .ti_name = "Chelsio-T3"
+};
+
+struct cxgb_client t3c_tom_client = {
+ .name = "tom_cxgb3",
+ .remove = NULL,
+ .handlers = tom_cpl_handlers,
+ .redirect = NULL
+};
+
+/*
+ * Add an skb to the deferred skb queue for processing from process context.
+ */
+void
+t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler)
+{
+ struct tom_data *td = TOM_DATA(dev);
+
+ m_set_handler(m, handler);
+ mtx_lock(&td->deferq.lock);
+
+ mbufq_tail(&td->deferq, m);
+ if (mbufq_len(&td->deferq) == 1)
+ taskqueue_enqueue(td->tq, &td->deferq_task);
+ mtx_unlock(&td->deferq.lock);
+}
+
+struct toepcb *
+toepcb_alloc(void)
+{
+ struct toepcb *toep;
+
+ toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT);
+
+ if (toep == NULL)
+ return (NULL);
+
+ toepcb_init(toep);
+ return (toep);
+}
+
+void
+toepcb_init(struct toepcb *toep)
+{
+ bzero(toep, sizeof(*toep));
+ toep->tp_refcount = 1;
+}
+
+void
+toepcb_hold(struct toepcb *toep)
+{
+ atomic_add_acq_int(&toep->tp_refcount, 1);
+}
+
+void
+toepcb_release(struct toepcb *toep)
+{
+ /* Drop our reference atomically; free on the 1 -> 0 transition. */
+ if (atomic_fetchadd_int(&toep->tp_refcount, -1) == 1) {
+ printf("doing final toepcb free\n");
+ free(toep, M_DEVBUF);
+ }
+}
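+
+/*
+ * toepcb_hold() and toepcb_release() bracket any reference to a toepcb that
+ * may outlive the caller's context; the final release frees the structure.
+ */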
+
+/*
+ * Add a T3 offload device to the list of devices we are managing.
+ */
+static void
+t3cdev_add(struct tom_data *t)
+{
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_INSERT_TAIL(&cxgb_list, t, entry);
+ mtx_unlock(&cxgb_list_lock);
+}
+
+/*
+ * Allocate a TOM data structure, initialize its cpl_handlers, and register
+ * it as a T3C client.
+ */
+static void
+t3c_tom_add(struct t3cdev *cdev)
+{
+ int i;
+ unsigned int wr_len;
+ struct tom_data *t;
+ struct toedev *tdev;
+ struct adap_ports *port_info;
+
+ t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
+
+ if (!t)
+ return;
+
+ if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0)
+ goto out_free_tom;
+
+ port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO);
+ if (!port_info)
+ goto out_free_tom;
+
+ if (cdev->ctl(cdev, GET_PORTS, port_info) < 0)
+ goto out_free_all;
+
+ t3_init_wr_tab(wr_len);
+ t->cdev = cdev;
+ t->client = &t3c_tom_client;
+
+ /* Register TCP offload device */
+ tdev = &t->tdev;
+ tdev->tod_ttid = (cdev->type == T3A ?
+ TOE_ID_CHELSIO_T3 : TOE_ID_CHELSIO_T3B);
+ tdev->tod_lldev = cdev->lldev;
+
+ if (register_toedev(tdev, "toe%d")) {
+ printf("unable to register offload device");
+ goto out_free_all;
+ }
+ TOM_DATA(tdev) = t;
+
+ for (i = 0; i < port_info->nports; i++) {
+ struct ifnet *ifp = port_info->lldevs[i];
+ TOEDEV(ifp) = tdev;
+
+ ifp->if_capabilities |= IFCAP_TOE;
+ }
+ t->ports = port_info;
+
+ /* Add device to the list of offload devices */
+ t3cdev_add(t);
+
+ /* Activate TCP offload device */
+ activate_offload(tdev);
+ return;
+
+out_free_all:
+ free(port_info, M_CXGB);
+out_free_tom:
+ free(t, M_CXGB);
+ return;
+}
+
+/*
+ * Process a received packet with an unknown/unexpected CPL opcode.
+ */
+static int
+do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name,
+ *mtod(m, unsigned int *));
+
+ return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG);
+}
+
+
+/*
+ * Add a new handler to the CPL dispatch table. A NULL handler may be supplied
+ * to unregister an existing handler.
+ */
+void
+t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h)
+{
+ if (opcode < NUM_CPL_CMDS)
+ tom_cpl_handlers[opcode] = h ? h : do_bad_cpl;
+ else
+ log(LOG_ERR, "Chelsio T3 TOM: handler registration for "
+ "opcode %u failed\n", opcode);
+}
+
+/*
+ * Make a preliminary determination if a connection can be offloaded. It's OK
+ * to fail the offload later if we say we can offload here. For now this
+ * always accepts the offload request unless there are IP options.
+ */
+static int
+can_offload(struct toedev *dev, struct socket *so)
+{
+ struct tom_data *tomd = TOM_DATA(dev);
+ struct t3cdev *cdev = T3CDEV(dev->tod_lldev);
+ struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
+
+ return sotoinpcb(so)->inp_depend4.inp4_options == NULL &&
+ tomd->conf.activated &&
+ (tomd->conf.max_conn < 0 ||
+ atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn);
+}
+
+
+static int
+tom_ctl(struct toedev *dev, unsigned int req, void *data)
+{
+ struct tom_data *t = TOM_DATA(dev);
+ struct t3cdev *cdev = t->cdev;
+
+ if (cdev->ctl)
+ return cdev->ctl(cdev, req, data);
+
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Initialize the CPL dispatch table.
+ */
+static void
+init_cpl_handlers(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_CPL_CMDS; ++i)
+ tom_cpl_handlers[i] = do_bad_cpl;
+
+ t3_init_listen_cpl_handlers();
+}
+
+static int
+t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
+{
+ struct tom_data *t = TOM_DATA(dev);
+ struct t3cdev *cdev = t->cdev;
+ struct ddp_params ddp;
+ struct ofld_page_info rx_page_info;
+ int err;
+
+#if 0
+ skb_queue_head_init(&t->deferq);
+ T3_INIT_WORK(&t->deferq_task, process_deferq, t);
+ spin_lock_init(&t->listen_lock);
+#endif
+ t3_init_tunables(t);
+ mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF);
+
+ /* Adjust TOE activation for this module */
+ t->conf.activated = activated;
+
+ dev->tod_can_offload = can_offload;
+ dev->tod_connect = t3_connect;
+ dev->tod_ctl = tom_ctl;
+#if 0
+#ifndef NETEVENT
+ dev->tod_neigh_update = tom_neigh_update;
+#endif
+ dev->tod_failover = t3_failover;
+#endif
+ err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp);
+ if (err)
+ return err;
+
+ err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info);
+ if (err)
+ return err;
+
+ t->ddp_llimit = ddp.llimit;
+ t->ddp_ulimit = ddp.ulimit;
+ t->pdev = ddp.pdev;
+ t->rx_page_size = rx_page_info.page_size;
+#ifdef notyet
+ /* OK if this fails, we just can't do DDP */
+ t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE;
+ t->ppod_map = t3_alloc_mem(t->nppods);
+#endif
+
+#if 0
+ spin_lock_init(&t->ppod_map_lock);
+ tom_proc_init(dev);
+#ifdef CONFIG_SYSCTL
+ t->sysctl = t3_sysctl_register(dev, &t->conf);
+#endif
+#endif
+ return (0);
+}
+
+static void
+cxgb_toe_listen(void *unused, int event, struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ struct tom_data *p;
+
+ switch (event) {
+ case OFLD_LISTEN_OPEN:
+ case OFLD_LISTEN_CLOSE:
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_FOREACH(p, &cxgb_list, entry) {
+ if (event == OFLD_LISTEN_OPEN)
+ t3_listen_start(&p->tdev, so, p->cdev);
+ else if (tp->t_state == TCPS_LISTEN) {
+ printf("stopping listen on port=%d\n",
+ ntohs(tp->t_inpcb->inp_lport));
+
+ t3_listen_stop(&p->tdev, so, p->cdev);
+ }
+
+ }
+ mtx_unlock(&cxgb_list_lock);
+ break;
+ default:
+ log(LOG_ERR, "unrecognized listen event %d\n", event);
+ break;
+ }
+}
+
+static void
+cxgb_register_listeners(void)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ INP_INFO_RLOCK(&tcbinfo);
+ LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
+ tp = intotcpcb(inp);
+
+ if (tp->t_state == TCPS_LISTEN)
+ cxgb_toe_listen(NULL, OFLD_LISTEN_OPEN, tp);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+}
+
+static int
+t3_tom_init(void)
+{
+
+#if 0
+ struct socket *sock;
+ err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ printk(KERN_ERR "Could not create TCP socket, error %d\n", err);
+ return err;
+ }
+
+ t3_def_state_change = sock->sk->sk_state_change;
+ t3_def_data_ready = sock->sk->sk_data_ready;
+ t3_def_error_report = sock->sk->sk_error_report;
+ sock_release(sock);
+#endif
+ init_cpl_handlers();
+ if (t3_init_cpl_io() < 0)
+ return -1;
+ t3_init_socket_ops();
+
+ /* Register with the TOE device layer. */
+
+ if (register_tom(&t3_tom_info) != 0) {
+ log(LOG_ERR,
+ "Unable to register Chelsio T3 TCP offload module.\n");
+ return -1;
+ }
+
+ mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF);
+ listen_tag = EVENTHANDLER_REGISTER(ofld_listen, cxgb_toe_listen, NULL, EVENTHANDLER_PRI_ANY);
+ TAILQ_INIT(&cxgb_list);
+
+ /* Register to offloading devices */
+ t3c_tom_client.add = t3c_tom_add;
+ cxgb_register_client(&t3c_tom_client);
+ cxgb_register_listeners();
+ return (0);
+}
+
+static int
+t3_tom_load(module_t mod, int cmd, void *arg)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ printf("wheeeeee ...\n");
+
+ t3_tom_init();
+ break;
+ case MOD_QUIESCE:
+ break;
+ case MOD_UNLOAD:
+ printf("uhm, ... unloading isn't really supported for toe\n");
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+
+ return (err);
+}
+
+static moduledata_t mod_data = {
+ "t3_tom",
+ t3_tom_load,
+ 0
+};
+MODULE_VERSION(t3_tom, 1);
+MODULE_DEPEND(t3_tom, toecore, 1, 1, 1);
+MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1);
+DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h
new file mode 100644
index 0000000..8d60bbd
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h
@@ -0,0 +1,157 @@
+
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_TOM_H_
+#define CXGB_TOM_H_
+#include <sys/protosw.h>
+
+#define LISTEN_INFO_HASH_SIZE 32
+
+struct listen_info {
+ struct listen_info *next; /* Link to next entry */
+ struct socket *so; /* The listening socket */
+ unsigned int stid; /* The server TID */
+};
+
+
+/*
+ * TOM tunable parameters.  They can be manipulated through sysctl(2).
+ */
+struct tom_tunables {
+ int max_host_sndbuf; /* max host RAM consumed by a sndbuf */
+ int tx_hold_thres; /* push/pull threshold for non-full TX mbufs */
+ int max_wrs; /* max # of outstanding WRs per connection */
+ int rx_credit_thres; /* min # of RX credits needed for RX_DATA_ACK */
+ int cong_alg; /* congestion control algorithm */
+ int mss; /* max TX_DATA WR payload size */
+ int delack; /* delayed ACK control */
+ int max_conn; /* maximum number of offloaded connections */
+ int soft_backlog_limit; /* whether the listen backlog limit is soft */
+ int ddp; /* whether to put new connections in DDP mode */
+ int ddp_thres; /* min recvmsg size before activating DDP */
+ int ddp_copy_limit; /* capacity of kernel DDP buffer */
+ int ddp_push_wait; /* whether blocking DDP waits for PSH flag */
+ int ddp_rcvcoalesce; /* whether receive coalescing is enabled */
+ int zcopy_sosend_enabled; /* whether zero-copy sosend is enabled */
+ int zcopy_sosend_partial_thres; /* below this size is never zcopied */
+ int zcopy_sosend_partial_copy; /* bytes copied in partial zcopy */
+ int zcopy_sosend_thres; /* at or above this size is mostly zcopied */
+ int zcopy_sosend_copy; /* bytes copied in zcopy */
+ int zcopy_sosend_ret_pending_dma; /* potentially return while DMA pending */
+ int activated; /* TOE engine activation state */
+};
+
+struct tom_data {
+ TAILQ_ENTRY(tom_data) entry;
+
+ struct t3cdev *cdev;
+ struct pci_dev *pdev;
+ struct toedev tdev;
+
+ struct cxgb_client *client;
+ struct tom_tunables conf;
+ struct tom_sysctl_table *sysctl;
+
+ /*
+ * The next three locks listen_lock, deferq.lock, and tid_release_lock
+ * are used rarely so we let them potentially share a cacheline.
+ */
+
+ struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE];
+ struct mtx listen_lock;
+
+ struct mbuf_head deferq;
+ struct task deferq_task;
+
+ struct socket **tid_release_list;
+ struct mtx tid_release_lock;
+ struct task tid_release_task;
+
+ volatile int tx_dma_pending;
+
+ unsigned int ddp_llimit;
+ unsigned int ddp_ulimit;
+
+ unsigned int rx_page_size;
+
+ u8 *ppod_map;
+ unsigned int nppods;
+ struct mtx ppod_map_lock;
+
+ struct adap_ports *ports;
+ struct taskqueue *tq;
+};
+
+
+struct listen_ctx {
+ struct socket *lso;
+ struct tom_data *tom_data;
+ int ulp_mode;
+ LIST_HEAD(, toepcb) synq_head;
+};
+
+#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt)
+#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev)
+#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev)
+#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param)
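+
+/* TOM_DATA() reuses the toedev's tod_l4opt slot to carry the tom_data pointer. */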
+
+#define TP_DATASENT (1 << 0)
+#define TP_TX_WAIT_IDLE (1 << 1)
+#define TP_FIN_SENT (1 << 2)
+#define TP_ABORT_RPL_PENDING (1 << 3)
+#define TP_ABORT_SHUTDOWN (1 << 4)
+#define TP_ABORT_RPL_RCVD (1 << 5)
+#define TP_ABORT_REQ_RCVD (1 << 6)
+#define TP_CLOSE_CON_REQUESTED (1 << 7)
+#define TP_SYN_RCVD (1 << 8)
+#define TP_ESTABLISHED (1 << 9)
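+/* The TP_* values above are bit flags for struct toepcb's tp_flags field. */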
+
+void t3_init_tunables(struct tom_data *t);
+
+static __inline struct mbuf *
+m_gethdr_nofail(int len)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ panic("implement lowmem cache\n");
+ }
+
+ KASSERT(len < MHLEN, ("requested header size too large for mbuf"));
+ m->m_pkthdr.len = m->m_len = len;
+ return (m);
+}
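+
+/*
+ * Typical use (sketch): callers that must not fail allocation, e.g. when
+ * building a fixed-size CPL request:
+ *
+ *	struct cpl_close_listserv_req *req;
+ *	struct mbuf *m = m_gethdr_nofail(sizeof(*req));
+ *	req = mtod(m, struct cpl_close_listserv_req *);
+ */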
+
+
+#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
new file mode 100644
index 0000000..7219922
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
@@ -0,0 +1,106 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+
+static struct tom_tunables default_tunable_vals = {
+ .max_host_sndbuf = 32 * 1024,
+ .tx_hold_thres = 0,
+ .max_wrs = 15,
+ .rx_credit_thres = 15 * 1024,
+ .cong_alg = -1,
+ .mss = 16384,
+ .delack = 1,
+ .max_conn = -1,
+ .soft_backlog_limit = 0,
+ .ddp = 0,
+ .ddp_thres = 14 * 4096,
+ .ddp_copy_limit = 13 * 4096,
+ .ddp_push_wait = 1,
+ .ddp_rcvcoalesce = 0,
+ .zcopy_sosend_enabled = 0,
+ .zcopy_sosend_partial_thres = 40960,
+ .zcopy_sosend_partial_copy = 4096 * 3,
+ .zcopy_sosend_thres = 128 * 1024,
+ .zcopy_sosend_copy = 4096 * 2,
+ .zcopy_sosend_ret_pending_dma = 1,
+ .activated = 1,
+};
+
+void
+t3_init_tunables(struct tom_data *t)
+{
+ t->conf = default_tunable_vals;
+
+ /* Now apply device specific fixups. */
+ t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk;
+ t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs;
+}