diff options
author | np <np@FreeBSD.org> | 2012-06-19 07:34:13 +0000 |
---|---|---|
committer | np <np@FreeBSD.org> | 2012-06-19 07:34:13 +0000 |
commit | 67d5f1a727273d8e141e96c429114dff9fb06ec3 (patch) | |
tree | 9255a545bbd49a0458ed8850371b4fe6ed2cd01f /sys/dev/cxgbe | |
parent | 27063437e23a5e5e7debf9144ee974d21b6a6774 (diff) | |
download | FreeBSD-src-67d5f1a727273d8e141e96c429114dff9fb06ec3.zip FreeBSD-src-67d5f1a727273d8e141e96c429114dff9fb06ec3.tar.gz |
- Updated TOE support in the kernel.
- Stateful TCP offload drivers for Terminator 3 and 4 (T3 and T4) ASICs.
These are available as t3_tom and t4_tom modules that augment cxgb(4)
and cxgbe(4) respectively. The cxgb/cxgbe drivers continue to work as
usual with or without these extra features.
- iWARP driver for Terminator 3 ASIC (kernel verbs). T4 iWARP is in the
works and will follow soon.
Build-tested with make universe.
30-second overview
============
What interfaces support TCP offload? Look for TOE4 and/or TOE6 in the
capabilities of an interface:
# ifconfig -m | grep TOE
Enable/disable TCP offload on an interface (just like any other ifnet
capability):
# ifconfig cxgbe0 toe
# ifconfig cxgbe0 -toe
Which connections are offloaded? Look for toe4 and/or toe6 in the
output of netstat and sockstat:
# netstat -np tcp | grep toe
# sockstat -46c | grep toe
Reviewed by: bz, gnn
Sponsored by: Chelsio Communications.
MFC after: ~3 months (after 9.1, and after ensuring MFC is feasible)
Diffstat (limited to 'sys/dev/cxgbe')
-rw-r--r-- | sys/dev/cxgbe/adapter.h | 103 | ||||
-rw-r--r-- | sys/dev/cxgbe/common/t4_hw.c | 2 | ||||
-rw-r--r-- | sys/dev/cxgbe/offload.h | 19 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_l2t.c | 563 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_l2t.h | 55 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_main.c | 213 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_sge.c | 128 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_connect.c | 377 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_cpl_io.c | 1276 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_listen.c | 1362 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom.c | 755 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom.h | 248 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom_l2t.c | 405 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom_l2t.h | 53 |
14 files changed, 4840 insertions, 719 deletions
diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 6be75bc..ba5335a 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -157,6 +157,7 @@ enum { INTR_DIRECT = (1 << 2), /* direct interrupts for everything */ MASTER_PF = (1 << 3), ADAP_SYSCTL_CTX = (1 << 4), + TOM_INIT_DONE = (1 << 5), CXGBE_BUSY = (1 << 9), @@ -199,7 +200,7 @@ struct port_info { int first_txq; /* index of first tx queue */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ @@ -213,6 +214,8 @@ struct port_info { struct link_config link_cfg; struct port_stats stats; + eventhandler_tag vlan_c; + struct callout tick; struct sysctl_ctx_list ctx; /* from ifconfig up to driver detach */ @@ -296,7 +299,7 @@ struct sge_iq { enum { EQ_CTRL = 1, EQ_ETH = 2, -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD EQ_OFLD = 3, #endif @@ -422,14 +425,36 @@ struct sge_rxq { } __aligned(CACHE_LINE_SIZE); -#ifndef TCP_OFFLOAD_DISABLE +static inline struct sge_rxq * +iq_to_rxq(struct sge_iq *iq) +{ + + return (member2struct(sge_rxq, iq, iq)); +} + + +#ifdef TCP_OFFLOAD /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ } __aligned(CACHE_LINE_SIZE); + +static inline struct sge_ofld_rxq * +iq_to_ofld_rxq(struct sge_iq *iq) +{ + + return (member2struct(sge_ofld_rxq, iq, iq)); +} #endif +struct wrqe { + STAILQ_ENTRY(wrqe) link; + struct sge_wrq *wrq; + int wr_len; + uint64_t wr[] __aligned(16); +}; + /* * wrq: SGE egress queue that is given prebuilt work requests. Both the control * and offload tx queues are of this type. 
@@ -438,8 +463,9 @@ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; - struct mbuf *head; /* held up due to lack of descriptors */ - struct mbuf *tail; /* valid only if head is valid */ + + /* List of WRs held up due to lack of tx descriptors */ + STAILQ_HEAD(, wrqe) wr_list; /* stats for common events first */ @@ -457,7 +483,7 @@ struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx tx queues */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ #endif @@ -469,7 +495,7 @@ struct sge { struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_wrq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ #endif @@ -483,6 +509,7 @@ struct sge { struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); +typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); struct adapter { SLIST_ENTRY(adapter) link; @@ -519,15 +546,15 @@ struct adapter { uint8_t chan_map[NCHAN]; uint32_t filter_mode; -#ifndef TCP_OFFLOAD_DISABLE - struct uld_softc tom; +#ifdef TCP_OFFLOAD + void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; #endif struct l2t_data *l2t; /* L2 table */ struct tid_info tids; int open_device_map; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int offload_map; #endif int flags; @@ -554,7 +581,8 @@ struct adapter { TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; - cpl_handler_t cpl_handler[256] __aligned(CACHE_LINE_SIZE); + an_handler_t an_handler __aligned(CACHE_LINE_SIZE); + cpl_handler_t cpl_handler[256]; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) @@ -609,82 +637,96 @@ struct adapter { static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { + return 
bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { + bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { + return t4_bus_space_read_8(sc->bt, sc->bh, reg); } static inline void t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { + t4_bus_space_write_8(sc->bt, sc->bh, reg, val); } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { + *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { + pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { + *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { + pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { + *val = pci_read_config(sc->dev, reg, 4); } static inline void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { + pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { + return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct adapter *sc, int idx, uint8_t hw_addr[]) { + bcopy(hw_addr, sc->port[idx]->hw_addr, ETHER_ADDR_LEN); } static inline bool is_10G_port(const struct port_info *pi) { + return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0); } static inline int tx_resume_threshold(struct sge_eq *eq) { + return (eq->qsize / 4); } @@ -698,6 +740,7 @@ void t4_os_portmod_changed(const struct adapter *, int); void t4_os_link_changed(struct adapter *, int, int); void t4_iterate(void (*)(struct adapter *, void *), void *); int t4_register_cpl_handler(struct adapter *, int, cpl_handler_t); +int t4_register_an_handler(struct adapter *, an_handler_t); /* 
t4_sge.c */ void t4_sge_modload(void); @@ -714,21 +757,45 @@ void t4_intr_all(void *); void t4_intr(void *); void t4_intr_err(void *); void t4_intr_evt(void *); -int t4_mgmt_tx(struct adapter *, struct mbuf *); -int t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct mbuf *); +void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *); void t4_update_fl_bufsize(struct ifnet *); int can_resume_tx(struct sge_eq *); -static inline int t4_wrq_tx(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m) +static inline struct wrqe * +alloc_wrqe(int wr_len, struct sge_wrq *wrq) { - int rc; + int len = offsetof(struct wrqe, wr) + wr_len; + struct wrqe *wr; + + wr = malloc(len, M_CXGBE, M_NOWAIT); + if (__predict_false(wr == NULL)) + return (NULL); + wr->wr_len = wr_len; + wr->wrq = wrq; + return (wr); +} + +static inline void * +wrtod(struct wrqe *wr) +{ + return (&wr->wr[0]); +} + +static inline void +free_wrqe(struct wrqe *wr) +{ + free(wr, M_CXGBE); +} + +static inline void +t4_wrq_tx(struct adapter *sc, struct wrqe *wr) +{ + struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); - rc = t4_wrq_tx_locked(sc, wrq, m); + t4_wrq_tx_locked(sc, wrq, wr); TXQ_UNLOCK(wrq); - return (rc); } - #endif diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index 6f4dd8d..f629cbe 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -27,6 +27,8 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include "common.h" #include "t4_regs.h" #include "t4_regs_values.h" diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index f6ada9d..1ae9f1f 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -31,12 +31,6 @@ #ifndef __T4_OFFLOAD_H__ #define __T4_OFFLOAD_H__ -/* XXX: flagrant misuse of mbuf fields (during tx by TOM) */ -#define MBUF_EQ(m) (*((void **)(&(m)->m_pkthdr.rcvif))) -/* These have to work for !M_PKTHDR so we use 
a field from m_hdr. */ -#define MBUF_TX_CREDITS(m) ((m)->m_hdr.pad[0]) -#define MBUF_DMA_MAPPED(m) ((m)->m_hdr.pad[1]) - #define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \ (w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \ (w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ @@ -119,7 +113,7 @@ struct t4_virt_res { /* virtualized HW resources */ struct t4_range ocq; }; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD enum { ULD_TOM = 1, }; @@ -130,13 +124,8 @@ struct uld_info { SLIST_ENTRY(uld_info) link; int refcount; int uld_id; - int (*attach)(struct adapter *, void **); - int (*detach)(void *); -}; - -struct uld_softc { - struct uld_info *uld; - void *softc; + int (*activate)(struct adapter *); + int (*deactivate)(struct adapter *); }; struct tom_tunables { @@ -148,6 +137,8 @@ struct tom_tunables { int t4_register_uld(struct uld_info *); int t4_unregister_uld(struct uld_info *); +int t4_activate_uld(struct adapter *, int); +int t4_deactivate_uld(struct adapter *, int); #endif #endif diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 55491cd..8373c32 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2011 Chelsio Communications, Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,16 +38,7 @@ __FBSDID("$FreeBSD$"); #include <sys/rwlock.h> #include <sys/socket.h> #include <sys/sbuf.h> -#include <net/if.h> -#include <net/if_types.h> -#include <net/ethernet.h> -#include <net/if_vlan_var.h> -#include <net/if_dl.h> -#include <net/if_llatbl.h> -#include <net/route.h> #include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/if_ether.h> #include "common/common.h" #include "common/jhash.h" @@ -72,42 +63,11 @@ __FBSDID("$FreeBSD$"); * lifetime of an L2T entry is fully contained in the lifetime of the TOE. 
*/ -/* identifies sync vs async L2T_WRITE_REQs */ -#define S_SYNC_WR 12 -#define V_SYNC_WR(x) ((x) << S_SYNC_WR) -#define F_SYNC_WR V_SYNC_WR(1) - -enum { - L2T_STATE_VALID, /* entry is up to date */ - L2T_STATE_STALE, /* entry may be used but needs revalidation */ - L2T_STATE_RESOLVING, /* entry needs address resolution */ - L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ - - /* when state is one of the below the entry is not hashed */ - L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ - L2T_STATE_UNUSED /* entry not in use */ -}; - -struct l2t_data { - struct rwlock lock; - volatile int nfree; /* number of free entries */ - struct l2t_entry *rover;/* starting point for next allocation */ - struct l2t_entry l2tab[L2T_SIZE]; -}; - -static int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, - struct mbuf *); - -#define VLAN_NONE 0xfff -#define SA(x) ((struct sockaddr *)(x)) -#define SIN(x) ((struct sockaddr_in *)(x)) -#define SINADDR(x) (SIN(x)->sin_addr.s_addr) - /* * Allocate a free L2T entry. Must be called with l2t_data.lock held. */ -static struct l2t_entry * -alloc_l2e(struct l2t_data *d) +struct l2t_entry * +t4_alloc_l2e(struct l2t_data *d) { struct l2t_entry *end, *e, **p; @@ -121,7 +81,8 @@ alloc_l2e(struct l2t_data *d) if (atomic_load_acq_int(&e->refcnt) == 0) goto found; - for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) ; + for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) + continue; found: d->rover = e + 1; atomic_subtract_int(&d->nfree, 1); @@ -148,19 +109,18 @@ found: * Write an L2T entry. Must be called with the entry locked. * The write may be synchronous or asynchronous. 
*/ -static int -write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) +int +t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) { - struct mbuf *m; + struct wrqe *wr; struct cpl_l2t_write_req *req; mtx_assert(&e->lock, MA_OWNED); - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - - req = mtod(m, struct cpl_l2t_write_req *); - m->m_pkthdr.len = m->m_len = sizeof(*req); + req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx | @@ -170,7 +130,7 @@ write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) req->vlan = htons(e->vlan); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); if (sync && e->state != L2T_STATE_SWITCHING) e->state = L2T_STATE_SYNC_WRITE; @@ -189,7 +149,7 @@ t4_l2t_alloc_switching(struct l2t_data *d) struct l2t_entry *e; rw_rlock(&d->lock); - e = alloc_l2e(d); + e = t4_alloc_l2e(d); if (e) { mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ e->state = L2T_STATE_SWITCHING; @@ -214,7 +174,7 @@ t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan, e->lport = port; memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); mtx_lock(&e->lock); - rc = write_l2e(sc, e, 0); + rc = t4_write_l2e(sc, e, 0); mtx_unlock(&e->lock); return (rc); } @@ -234,10 +194,13 @@ t4_init_l2t(struct adapter *sc, int flags) rw_init(&d->lock, "L2T"); for (i = 0; i < L2T_SIZE; i++) { - d->l2tab[i].idx = i; - d->l2tab[i].state = L2T_STATE_UNUSED; - mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF); - atomic_store_rel_int(&d->l2tab[i].refcnt, 0); + struct l2t_entry *e = &d->l2tab[i]; + + e->idx = i; + e->state = L2T_STATE_UNUSED; + mtx_init(&e->lock, "L2T_E", NULL, MTX_DEF); + STAILQ_INIT(&e->wr_list); + atomic_store_rel_int(&e->refcnt, 0); } sc->l2t = d; @@ -259,6 +222,24 @@ t4_free_l2t(struct l2t_data *d) return (0); } +int +do_l2t_write_rpl(struct sge_iq *iq, const 
struct rss_header *rss, + struct mbuf *m) +{ + const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + + if (__predict_false(rpl->status != CPL_ERR_NONE)) { + log(LOG_ERR, + "Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, idx); + return (EINVAL); + } + + return (0); +} + #ifdef SBUF_DRAIN static inline unsigned int vlan_prio(const struct l2t_entry *e) @@ -273,7 +254,7 @@ l2e_state(const struct l2t_entry *e) case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */ case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */ case L2T_STATE_SYNC_WRITE: return 'W'; - case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R'; + case L2T_STATE_RESOLVING: return STAILQ_EMPTY(&e->wr_list) ? 'R' : 'A'; case L2T_STATE_SWITCHING: return 'X'; default: return 'U'; } @@ -311,20 +292,20 @@ sysctl_l2t(SYSCTL_HANDLER_ARGS) "Ethernet address VLAN/P LP State Users Port"); header = 1; } - if (e->state == L2T_STATE_SWITCHING || e->v6) + if (e->state == L2T_STATE_SWITCHING) ip[0] = 0; else snprintf(ip, sizeof(ip), "%s", - inet_ntoa(*(struct in_addr *)&e->addr[0])); + inet_ntoa(*(struct in_addr *)&e->addr)); - /* XXX: accessing lle probably not safe? */ + /* XXX: e->ifp may not be around */ sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d" " %u %2u %c %5u %s", e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5], e->vlan & 0xfff, vlan_prio(e), e->lport, l2e_state(e), atomic_load_acq_int(&e->refcnt), - e->lle ? 
e->lle->lle_tbl->llt_ifp->if_xname : ""); + e->ifp->if_xname); skip: mtx_unlock(&e->lock); } @@ -335,459 +316,3 @@ skip: return (rc); } #endif - -#ifndef TCP_OFFLOAD_DISABLE -static inline void -l2t_hold(struct l2t_data *d, struct l2t_entry *e) -{ - if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */ - atomic_subtract_int(&d->nfree, 1); -} - -/* - * To avoid having to check address families we do not allow v4 and v6 - * neighbors to be on the same hash chain. We keep v4 entries in the first - * half of available hash buckets and v6 in the second. - */ -enum { - L2T_SZ_HALF = L2T_SIZE / 2, - L2T_HASH_MASK = L2T_SZ_HALF - 1 -}; - -static inline unsigned int -arp_hash(const uint32_t *key, int ifindex) -{ - return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK; -} - -static inline unsigned int -ipv6_hash(const uint32_t *key, int ifindex) -{ - uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3]; - - return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK); -} - -static inline unsigned int -addr_hash(const uint32_t *addr, int addr_len, int ifindex) -{ - return addr_len == 4 ? arp_hash(addr, ifindex) : - ipv6_hash(addr, ifindex); -} - -/* - * Checks if an L2T entry is for the given IP/IPv6 address. It does not check - * whether the L2T entry and the address are of the same address family. - * Callers ensure an address is only checked against L2T entries of the same - * family, something made trivial by the separation of IP and IPv6 hash chains - * mentioned above. Returns 0 if there's a match, - */ -static inline int -addreq(const struct l2t_entry *e, const uint32_t *addr) -{ - if (e->v6) - return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) | - (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]); - return e->addr[0] ^ addr[0]; -} - -/* - * Add a packet to an L2T entry's queue of packets awaiting resolution. - * Must be called with the entry's lock held. 
- */ -static inline void -arpq_enqueue(struct l2t_entry *e, struct mbuf *m) -{ - mtx_assert(&e->lock, MA_OWNED); - - KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt not NULL", __func__)); - if (e->arpq_head) - e->arpq_tail->m_nextpkt = m; - else - e->arpq_head = m; - e->arpq_tail = m; -} - -static inline void -send_pending(struct adapter *sc, struct l2t_entry *e) -{ - struct mbuf *m, *next; - - mtx_assert(&e->lock, MA_OWNED); - - for (m = e->arpq_head; m; m = next) { - next = m->m_nextpkt; - m->m_nextpkt = NULL; - t4_wrq_tx(sc, MBUF_EQ(m), m); - } - e->arpq_head = e->arpq_tail = NULL; -} - -#ifdef INET -/* - * Looks up and fills up an l2t_entry's lle. We grab all the locks that we need - * ourself, and update e->state at the end if e->lle was successfully filled. - * - * The lle passed in comes from arpresolve and is ignored as it does not appear - * to be of much use. - */ -static int -l2t_fill_lle(struct adapter *sc, struct l2t_entry *e, struct llentry *unused) -{ - int rc = 0; - struct sockaddr_in sin; - struct ifnet *ifp = e->ifp; - struct llentry *lle; - - bzero(&sin, sizeof(struct sockaddr_in)); - if (e->v6) - panic("%s: IPv6 L2 resolution not supported yet.", __func__); - - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in)); - - mtx_assert(&e->lock, MA_NOTOWNED); - KASSERT(e->addr && ifp, ("%s: bad prep before call", __func__)); - - IF_AFDATA_LOCK(ifp); - lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, SA(&sin)); - IF_AFDATA_UNLOCK(ifp); - if (!LLE_IS_VALID(lle)) - return (ENOMEM); - if (!(lle->la_flags & LLE_VALID)) { - rc = EINVAL; - goto done; - } - - LLE_ADDREF(lle); - - mtx_lock(&e->lock); - if (e->state == L2T_STATE_RESOLVING) { - KASSERT(e->lle == NULL, ("%s: lle already valid", __func__)); - e->lle = lle; - memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN); - write_l2e(sc, e, 1); - } else { - KASSERT(e->lle == lle, ("%s: lle changed", __func__)); - LLE_REMREF(lle); - } - 
mtx_unlock(&e->lock); -done: - LLE_WUNLOCK(lle); - return (rc); -} -#endif - -int -t4_l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e) -{ -#ifndef INET - return (EINVAL); -#else - struct llentry *lle = NULL; - struct sockaddr_in sin; - struct ifnet *ifp = e->ifp; - - if (e->v6) - panic("%s: IPv6 L2 resolution not supported yet.", __func__); - - bzero(&sin, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in)); - -again: - switch (e->state) { - case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - if (arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0) - l2t_fill_lle(sc, e, lle); - - /* Fall through */ - - case L2T_STATE_VALID: /* fast-path, send the packet on */ - return t4_wrq_tx(sc, MBUF_EQ(m), m); - - case L2T_STATE_RESOLVING: - case L2T_STATE_SYNC_WRITE: - mtx_lock(&e->lock); - if (e->state != L2T_STATE_SYNC_WRITE && - e->state != L2T_STATE_RESOLVING) { - /* state changed by the time we got here */ - mtx_unlock(&e->lock); - goto again; - } - arpq_enqueue(e, m); - mtx_unlock(&e->lock); - - if (e->state == L2T_STATE_RESOLVING && - arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0) - l2t_fill_lle(sc, e, lle); - } - - return (0); -#endif -} - -/* - * Called when an L2T entry has no more users. The entry is left in the hash - * table since it is likely to be reused but we also bump nfree to indicate - * that the entry can be reallocated for a different neighbor. We also drop - * the existing neighbor reference in case the neighbor is going away and is - * waiting on our reference. - * - * Because entries can be reallocated to other neighbors once their ref count - * drops to 0 we need to take the entry's lock to avoid races with a new - * incarnation. 
- */ -static void -t4_l2e_free(struct l2t_entry *e) -{ - struct llentry *lle = NULL; - struct l2t_data *d; - - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ - lle = e->lle; - e->lle = NULL; - /* - * Don't need to worry about the arpq, an L2T entry can't be - * released if any packets are waiting for resolution as we - * need to be able to communicate with the device to close a - * connection. - */ - } - mtx_unlock(&e->lock); - - d = container_of(e, struct l2t_data, l2tab[e->idx]); - atomic_add_int(&d->nfree, 1); - - if (lle) - LLE_FREE(lle); -} - -void -t4_l2t_release(struct l2t_entry *e) -{ - if (atomic_fetchadd_int(&e->refcnt, -1) == 1) - t4_l2e_free(e); -} - -static int -do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, - struct mbuf *m) -{ - struct adapter *sc = iq->adapter; - const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); - unsigned int tid = GET_TID(rpl); - unsigned int idx = tid & (L2T_SIZE - 1); - - if (__predict_false(rpl->status != CPL_ERR_NONE)) { - log(LOG_ERR, - "Unexpected L2T_WRITE_RPL status %u for entry %u\n", - rpl->status, idx); - return (EINVAL); - } - - if (tid & F_SYNC_WR) { - struct l2t_entry *e = &sc->l2t->l2tab[idx]; - - mtx_lock(&e->lock); - if (e->state != L2T_STATE_SWITCHING) { - send_pending(sc, e); - e->state = L2T_STATE_VALID; - } - mtx_unlock(&e->lock); - } - - return (0); -} - -/* - * Reuse an L2T entry that was previously used for the same next hop. 
- */ -static void -reuse_entry(struct l2t_entry *e) -{ - struct llentry *lle; - - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - lle = e->lle; - if (lle) { - KASSERT(lle->la_flags & LLE_VALID, - ("%s: invalid lle stored in l2t_entry", __func__)); - - if (lle->la_expire >= time_uptime) - e->state = L2T_STATE_STALE; - else - e->state = L2T_STATE_VALID; - } else - e->state = L2T_STATE_RESOLVING; - mtx_unlock(&e->lock); -} - -/* - * The TOE wants an L2 table entry that it can use to reach the next hop over - * the specified port. Produce such an entry - create one if needed. - * - * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on - * top of the real cxgbe interface. - */ -struct l2t_entry * -t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) -{ - struct l2t_entry *e; - struct l2t_data *d = pi->adapter->l2t; - int addr_len; - uint32_t *addr; - int hash; - struct sockaddr_in6 *sin6; - unsigned int smt_idx = pi->port_id; - - if (sa->sa_family == AF_INET) { - addr = (uint32_t *)&SINADDR(sa); - addr_len = sizeof(SINADDR(sa)); - } else if (sa->sa_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)sa; - addr = (uint32_t *)&sin6->sin6_addr.s6_addr; - addr_len = sizeof(sin6->sin6_addr.s6_addr); - } else - return (NULL); - -#ifndef VLAN_TAG - if (ifp->if_type == IFT_L2VLAN) - return (NULL); -#endif - - hash = addr_hash(addr, addr_len, ifp->if_index); - - rw_wlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) { - if (!addreq(e, addr) && e->ifp == ifp && e->smt_idx == smt_idx){ - l2t_hold(d, e); - if (atomic_load_acq_int(&e->refcnt) == 1) - reuse_entry(e); - goto done; - } - } - - /* Need to allocate a new entry */ - e = alloc_l2e(d); - if (e) { - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - e->state = L2T_STATE_RESOLVING; - memcpy(e->addr, addr, addr_len); - e->ifindex = ifp->if_index; - e->smt_idx = smt_idx; - e->ifp = ifp; - e->hash = hash; - e->lport = pi->lport; - e->v6 = (addr_len == 16); 
- e->lle = NULL; - atomic_store_rel_int(&e->refcnt, 1); -#ifdef VLAN_TAG - if (ifp->if_type == IFT_L2VLAN) - VLAN_TAG(ifp, &e->vlan); - else - e->vlan = VLAN_NONE; -#endif - e->next = d->l2tab[hash].first; - d->l2tab[hash].first = e; - mtx_unlock(&e->lock); - } -done: - rw_wunlock(&d->lock); - return e; -} - -/* - * Called when the host's neighbor layer makes a change to some entry that is - * loaded into the HW L2 table. - */ -void -t4_l2t_update(struct adapter *sc, struct llentry *lle) -{ - struct l2t_entry *e; - struct l2t_data *d = sc->l2t; - struct sockaddr *sa = L3_ADDR(lle); - struct llentry *old_lle = NULL; - uint32_t *addr = (uint32_t *)&SINADDR(sa); - struct ifnet *ifp = lle->lle_tbl->llt_ifp; - int hash = addr_hash(addr, sizeof(*addr), ifp->if_index); - - KASSERT(d != NULL, ("%s: no L2 table", __func__)); - LLE_WLOCK_ASSERT(lle); - KASSERT(lle->la_flags & LLE_VALID || lle->la_flags & LLE_DELETED, - ("%s: entry neither valid nor deleted.", __func__)); - - rw_rlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) { - if (!addreq(e, addr) && e->ifp == ifp) { - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt)) - goto found; - e->state = L2T_STATE_STALE; - mtx_unlock(&e->lock); - break; - } - } - rw_runlock(&d->lock); - - /* The TOE has no interest in this LLE */ - return; - - found: - rw_runlock(&d->lock); - - if (atomic_load_acq_int(&e->refcnt)) { - - /* Entry is referenced by at least 1 offloaded connection. 
*/ - - /* Handle deletes first */ - if (lle->la_flags & LLE_DELETED) { - if (lle == e->lle) { - e->lle = NULL; - e->state = L2T_STATE_RESOLVING; - LLE_REMREF(lle); - } - goto done; - } - - if (lle != e->lle) { - old_lle = e->lle; - LLE_ADDREF(lle); - e->lle = lle; - } - - if (e->state == L2T_STATE_RESOLVING || - memcmp(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN)) { - - /* unresolved -> resolved; or dmac changed */ - - memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN); - write_l2e(sc, e, 1); - } else { - - /* +ve reinforcement of a valid or stale entry */ - - } - - e->state = L2T_STATE_VALID; - - } else { - /* - * Entry was used previously but is unreferenced right now. - * e->lle has been released and NULL'd out by t4_l2t_free, or - * l2t_release is about to call t4_l2t_free and do that. - * - * Either way this is of no interest to us. - */ - } - -done: - mtx_unlock(&e->lock); - if (old_lle) - LLE_FREE(old_lle); -} - -#endif diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h index 5dfce83..0303885 100644 --- a/sys/dev/cxgbe/t4_l2t.h +++ b/sys/dev/cxgbe/t4_l2t.h @@ -30,8 +30,25 @@ #ifndef __T4_L2T_H #define __T4_L2T_H +/* identifies sync vs async L2T_WRITE_REQs */ +#define S_SYNC_WR 12 +#define V_SYNC_WR(x) ((x) << S_SYNC_WR) +#define F_SYNC_WR V_SYNC_WR(1) + enum { L2T_SIZE = 4096 }; /* # of L2T entries */ +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_FAILED, /* failed to resolve */ + L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ + + /* when state is one of the below the entry is not hashed */ + L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ + L2T_STATE_UNUSED /* entry not in use */ +}; + /* * Each L2T entry plays multiple roles. 
First of all, it keeps state for the * corresponding entry of the HW L2 table and maintains a queue of offload @@ -43,39 +60,49 @@ enum { L2T_SIZE = 4096 }; /* # of L2T entries */ struct l2t_entry { uint16_t state; /* entry state */ uint16_t idx; /* entry index */ - uint32_t addr[4]; /* next hop IP or IPv6 address */ + uint32_t addr; /* next hop IP address */ struct ifnet *ifp; /* outgoing interface */ uint16_t smt_idx; /* SMT index */ uint16_t vlan; /* VLAN TCI (id: 0-11, prio: 13-15) */ - int ifindex; /* interface index */ - struct llentry *lle; /* llentry for next hop */ struct l2t_entry *first; /* start of hash chain */ struct l2t_entry *next; /* next l2t_entry on chain */ - struct mbuf *arpq_head; /* list of mbufs awaiting resolution */ - struct mbuf *arpq_tail; + STAILQ_HEAD(, wrqe) wr_list; /* list of WRs awaiting resolution */ struct mtx lock; volatile int refcnt; /* entry reference count */ uint16_t hash; /* hash bucket the entry is on */ - uint8_t v6; /* whether entry is for IPv6 */ uint8_t lport; /* associated offload logical port */ uint8_t dmac[ETHER_ADDR_LEN]; /* next hop's MAC address */ }; +struct l2t_data { + struct rwlock lock; + volatile int nfree; /* number of free entries */ + struct l2t_entry *rover;/* starting point for next allocation */ + struct l2t_entry l2tab[L2T_SIZE]; +}; + + int t4_init_l2t(struct adapter *, int); int t4_free_l2t(struct l2t_data *); +struct l2t_entry *t4_alloc_l2e(struct l2t_data *); struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *); int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t, uint8_t, uint8_t *); -void t4_l2t_release(struct l2t_entry *); +int t4_write_l2e(struct adapter *, struct l2t_entry *, int); +int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); + +static inline void +t4_l2t_release(struct l2t_entry *e) +{ + struct l2t_data *d = container_of(e, struct l2t_data, l2tab[e->idx]); + + if (atomic_fetchadd_int(&e->refcnt, -1) == 1) + 
atomic_add_int(&d->nfree, 1); +} + + #ifdef SBUF_DRAIN int sysctl_l2t(SYSCTL_HANDLER_ARGS); #endif -#ifndef TCP_OFFLOAD_DISABLE -struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *, - struct sockaddr *); -int t4_l2t_send(struct adapter *, struct mbuf *, struct l2t_entry *); -void t4_l2t_update(struct adapter *, struct llentry *); -#endif - #endif /* __T4_L2T_H */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 874a6ad..a91363b 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -119,9 +119,13 @@ static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4 Ethernet driver and services"); +/* + * Correct lock order when you need to acquire multiple locks is t4_list_lock, + * then ADAPTER_LOCK, then t4_uld_list_lock. + */ static struct mtx t4_list_lock; static SLIST_HEAD(, adapter) t4_list; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static struct mtx t4_uld_list_lock; static SLIST_HEAD(, uld_info) t4_uld_list; #endif @@ -149,7 +153,7 @@ TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); @@ -237,7 +241,7 @@ struct intrs_and_queues { int nrxq10g; /* # of NIC rxq's for each 10G port */ int ntxq1g; /* # of NIC txq's for each 1G port */ int nrxq1g; /* # of NIC rxq's for each 1G port */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldtxq10g; /* # of TOE txq's for each 10G port */ int nofldrxq10g; /* # of TOE rxq's for each 10G port */ int nofldtxq1g; /* # of TOE txq's for each 1G port */ @@ -297,8 +301,10 @@ static void reg_block_dump(struct adapter *, uint8_t *, unsigned int, unsigned int); static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void cxgbe_tick(void *); +static void cxgbe_vlan_config(void *, 
struct ifnet *, uint16_t); static int cpl_not_handled(struct sge_iq *, const struct rss_header *, struct mbuf *); +static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *); static int t4_sysctls(struct adapter *); static int cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); @@ -342,10 +348,8 @@ static int filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int read_card_mem(struct adapter *, struct t4_mem_range *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int toe_capability(struct port_info *, int); -static int activate_uld(struct adapter *, int, struct uld_softc *); -static int deactivate_uld(struct uld_softc *); #endif static int t4_mod_event(module_t, int, void *); @@ -368,8 +372,12 @@ struct t4_pciids { {0x440a, 4, "Chelsio T404-BT"}, }; -#ifndef TCP_OFFLOAD_DISABLE -/* This is used in service_iq() to get to the fl associated with an iq. */ +#ifdef TCP_OFFLOAD +/* + * service_iq() has an iq and needs the fl. Offset of fl from the iq should be + * exactly the same for both rxq and ofld_rxq. 
+ */ +CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif @@ -401,7 +409,7 @@ t4_attach(device_t dev) int rc = 0, i, n10g, n1g, rqidx, tqidx; struct intrs_and_queues iaq; struct sge *s; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif @@ -436,6 +444,7 @@ t4_attach(device_t dev) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); + sc->an_handler = an_not_handled; for (i = 0; i < ARRAY_SIZE(sc->cpl_handler); i++) sc->cpl_handler[i] = cpl_not_handled; t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, filter_rpl); @@ -595,7 +604,7 @@ t4_attach(device_t dev) s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; @@ -631,7 +640,7 @@ t4_attach(device_t dev) * tx queues that each port should get. 
*/ rqidx = tqidx = 0; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif for_each_port(sc, i) { @@ -653,7 +662,7 @@ t4_attach(device_t dev) rqidx += pi->nrxq; tqidx += pi->ntxq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { pi->first_ofld_rxq = ofld_rqidx; pi->first_ofld_txq = ofld_tqidx; @@ -761,7 +770,7 @@ t4_detach(device_t dev) if (sc->l2t) t4_free_l2t(sc->l2t); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif @@ -832,7 +841,7 @@ cxgbe_attach(device_t dev) ifp->if_qflush = cxgbe_qflush; ifp->if_capabilities = T4_CAP; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) ifp->if_capabilities |= IFCAP_TOE4; #endif @@ -844,9 +853,12 @@ cxgbe_attach(device_t dev) cxgbe_media_status); build_medialist(pi); + pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, + EVENTHANDLER_PRI_ANY); + ether_ifattach(ifp, pi->hw_addr); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { device_printf(dev, "%d txq, %d rxq (NIC); %d txq, %d rxq (TOE)\n", @@ -876,6 +888,9 @@ cxgbe_detach(device_t dev) SET_BUSY(sc); ADAPTER_UNLOCK(sc); + if (pi->vlan_c) + EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c); + PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&pi->tick); @@ -1042,7 +1057,7 @@ fail: } #endif } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; @@ -1292,7 +1307,7 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, iaq->ntxq1g = t4_ntxq1g; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; @@ -1364,7 +1379,7 @@ restart: n++; } iaq->nrxq10g = min(n, nrxq10g); -#ifndef TCP_OFFLOAD_DISABLE 
+#ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif } @@ -1379,7 +1394,7 @@ restart: n++; } iaq->nrxq1g = min(n, nrxq1g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif } @@ -1392,7 +1407,7 @@ restart: * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif @@ -2305,7 +2320,7 @@ adapter_full_init(struct adapter *sc) struct irq *irq; struct port_info *pi; struct sge_rxq *rxq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif @@ -2369,7 +2384,7 @@ adapter_full_init(struct adapter *sc) for_each_port(sc, p) { pi = sc->port[p]; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD /* * Skip over the NIC queues if they aren't taking direct * interrupts. @@ -2386,7 +2401,7 @@ adapter_full_init(struct adapter *sc) rid++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD /* * Skip over the offload queues if they aren't taking * direct interrupts. 
@@ -2494,7 +2509,7 @@ port_full_uninit(struct port_info *pi) int i; struct sge_rxq *rxq; struct sge_txq *txq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif @@ -2507,7 +2522,7 @@ port_full_uninit(struct port_info *pi) quiesce_eq(sc, &txq->eq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { quiesce_eq(sc, &ofld_txq->eq); } @@ -2518,7 +2533,7 @@ port_full_uninit(struct port_info *pi) quiesce_fl(sc, &rxq->fl); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); @@ -2892,14 +2907,27 @@ cxgbe_tick(void *arg) PORT_UNLOCK(pi); } +static void +cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) +{ + struct ifnet *vlan; + + if (arg != ifp) + return; + + vlan = VLAN_DEVAT(ifp, vid); + VLAN_SETCOOKIE(vlan, ifp); +} + static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { + #ifdef INVARIANTS - panic("%s: opcode %02x on iq %p with payload %p", + panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else - log(LOG_ERR, "%s: opcode %02x on iq %p with payload %p", + log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); m_freem(m); #endif @@ -2922,6 +2950,31 @@ t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) } static int +an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) +{ + +#ifdef INVARIANTS + panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); +#else + log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)", + __func__, iq, ctrl); +#endif + return (EDOOFUS); +} + +int +t4_register_an_handler(struct adapter *sc, an_handler_t h) +{ + uintptr_t *loc, new; + + new = h ? 
(uintptr_t)h : (uintptr_t)an_not_handled; + loc = (uintptr_t *) &sc->an_handler; + atomic_store_rel_ptr(loc, new); + + return (0); +} + +static int t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; @@ -3072,7 +3125,7 @@ t4_sysctls(struct adapter *sc) sysctl_tx_rate, "A", "Tx rate"); #endif -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { /* * dev.t4nex.X.toe. @@ -3125,7 +3178,7 @@ cxgbe_sysctls(struct port_info *pi) SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &pi->first_txq, 0, "index of first tx queue"); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &pi->nofldrxq, 0, @@ -4543,7 +4596,7 @@ set_filter_mode(struct adapter *sc, uint32_t mode) goto done; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->offload_map) { rc = EBUSY; goto done; @@ -4734,7 +4787,7 @@ static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct mbuf *m; + struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; @@ -4755,12 +4808,11 @@ set_filter_wr(struct adapter *sc, int fidx) ftid = sc->tids.ftid_base + fidx; - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) + wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - fwr = mtod(m, struct fw_filter_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*fwr); + fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); @@ -4830,7 +4882,7 @@ set_filter_wr(struct adapter *sc, int fidx) f->pending = 1; sc->tids.ftids_in_use++; - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); return (0); } @@ -4838,7 +4890,7 @@ static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct mbuf *m; + struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; @@ -4846,18 +4898,16 @@ del_filter_wr(struct adapter *sc, int fidx) ftid = 
sc->tids.ftid_base + fidx; - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) + wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - - fwr = mtod(m, struct fw_filter_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*fwr); + fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); return (0); } @@ -5215,7 +5265,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int toe_capability(struct port_info *pi, int enable) { @@ -5228,13 +5278,28 @@ toe_capability(struct port_info *pi, int enable) return (ENODEV); if (enable) { + if (!(sc->flags & FULL_INIT_DONE)) { + log(LOG_WARNING, + "You must enable a cxgbe interface first\n"); + return (EAGAIN); + } + if (isset(&sc->offload_map, pi->port_id)) return (0); - if (sc->offload_map == 0) { - rc = activate_uld(sc, ULD_TOM, &sc->tom); + if (!(sc->flags & TOM_INIT_DONE)) { + rc = t4_activate_uld(sc, ULD_TOM); + if (rc == EAGAIN) { + log(LOG_WARNING, + "You must kldload t4_tom.ko before trying " + "to enable TOE on a cxgbe interface.\n"); + } if (rc != 0) return (rc); + KASSERT(sc->tom_softc != NULL, + ("%s: TOM activated but softc NULL", __func__)); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM activated but flag not set", __func__)); } setbit(&sc->offload_map, pi->port_id); @@ -5242,15 +5307,9 @@ toe_capability(struct port_info *pi, int enable) if (!isset(&sc->offload_map, pi->port_id)) return (0); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); - - if (sc->offload_map == 0) { - rc = deactivate_uld(&sc->tom); - if (rc != 0) { - setbit(&sc->offload_map, pi->port_id); - return (rc); - } - } } return (0); @@ -5305,8 +5364,8 @@ done: return (rc); } -static int -activate_uld(struct adapter *sc, int id, struct uld_softc *usc) +int 
+t4_activate_uld(struct adapter *sc, int id) { int rc = EAGAIN; struct uld_info *ui; @@ -5315,13 +5374,9 @@ activate_uld(struct adapter *sc, int id, struct uld_softc *usc) SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { - rc = ui->attach(sc, &usc->softc); - if (rc == 0) { - KASSERT(usc->softc != NULL, - ("%s: ULD %d has no state", __func__, id)); + rc = ui->activate(sc); + if (rc == 0) ui->refcount++; - usc->uld = ui; - } goto done; } } @@ -5331,25 +5386,21 @@ done: return (rc); } -static int -deactivate_uld(struct uld_softc *usc) +int +t4_deactivate_uld(struct adapter *sc, int id) { - int rc; + int rc = EINVAL; + struct uld_info *ui; mtx_lock(&t4_uld_list_lock); - if (usc->uld == NULL || usc->softc == NULL) { - rc = EINVAL; - goto done; - } - - rc = usc->uld->detach(usc->softc); - if (rc == 0) { - KASSERT(usc->uld->refcount > 0, - ("%s: ULD has bad refcount", __func__)); - usc->uld->refcount--; - usc->uld = NULL; - usc->softc = NULL; + SLIST_FOREACH(ui, &t4_uld_list, link) { + if (ui->uld_id == id) { + rc = ui->deactivate(sc); + if (rc == 0) + ui->refcount--; + goto done; + } } done: mtx_unlock(&t4_uld_list_lock); @@ -5379,7 +5430,7 @@ tweak_tunables(void) if (t4_nrxq1g < 1) t4_nrxq1g = min(nc, NRXQ_1G); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); @@ -5426,7 +5477,7 @@ t4_mod_event(module_t mod, int cmd, void *arg) t4_sge_modload(); mtx_init(&t4_list_lock, "T4 adapters", 0, MTX_DEF); SLIST_INIT(&t4_list); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD mtx_init(&t4_uld_list_lock, "T4 ULDs", 0, MTX_DEF); SLIST_INIT(&t4_uld_list); #endif @@ -5434,7 +5485,7 @@ t4_mod_event(module_t mod, int cmd, void *arg) break; case MOD_UNLOAD: -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD mtx_lock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 8f39f10..92c9212 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ 
b/sys/dev/cxgbe/t4_sge.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/kernel.h> +#include <sys/kdb.h> #include <sys/malloc.h> #include <sys/queue.h> #include <sys/taskqueue.h> @@ -51,7 +52,6 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" -#include "t4_l2t.h" struct fl_buf_info { int size; @@ -115,14 +115,14 @@ static int free_mgmtq(struct adapter *); static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct port_info *, struct sge_rxq *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *); @@ -397,7 +397,7 @@ first_vector(struct port_info *pi) if (i == pi->port_id) break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) rc += pi->nrxq + pi->nofldrxq; else @@ -434,7 +434,7 @@ port_intr_iq(struct port_info *pi, int idx) if (sc->intr_count == 1) return (&sc->sge.fwq); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) { idx %= pi->nrxq + pi->nofldrxq; @@ -475,19 +475,20 @@ t4_setup_port_queues(struct port_info *pi) struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; + struct sysctl_oid *oid2 = NULL; #endif char name[16]; struct adapter *sc = pi->adapter; - struct sysctl_oid *oid = 
device_get_sysctl_tree(pi->dev), *oid2 = NULL; + struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, @@ -515,7 +516,7 @@ t4_setup_port_queues(struct port_info *pi) init_fl(&rxq->fl, pi->qsize_rxq / 8, pi->ifp->if_mtu, name); if (sc->flags & INTR_DIRECT -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD || (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq) #endif ) { @@ -527,7 +528,7 @@ t4_setup_port_queues(struct port_info *pi) } } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { snprintf(name, sizeof(name), "%s ofld_rxq%d-iq", @@ -567,7 +568,7 @@ t4_setup_port_queues(struct port_info *pi) j++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) continue; @@ -603,7 +604,7 @@ t4_setup_port_queues(struct port_info *pi) j++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(pi, i, ofld_txq) { @@ -655,7 +656,7 @@ t4_teardown_port_queues(struct port_info *pi) struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif @@ -677,7 +678,7 @@ t4_teardown_port_queues(struct port_info *pi) free_txq(pi, txq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { free_wrq(sc, ofld_txq); } @@ -693,7 +694,7 @@ t4_teardown_port_queues(struct port_info *pi) free_rxq(pi, rxq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if 
((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(pi, ofld_rxq); @@ -709,7 +710,7 @@ t4_teardown_port_queues(struct port_info *pi) free_rxq(pi, rxq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(pi, ofld_rxq); @@ -775,7 +776,7 @@ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; - struct sge_rxq *rxq = (void *)iq; /* Use iff iq is part of rxq */ + struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct rsp_ctrl *ctrl; @@ -862,7 +863,8 @@ service_iq(struct sge_iq *iq, int budget) break; default: - panic("%s: rsp_type %u", __func__, rsp_type); + sc->an_handler(iq, ctrl); + break; } iq_next(iq); @@ -1076,42 +1078,33 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) return (0); } -int -t4_mgmt_tx(struct adapter *sc, struct mbuf *m) -{ - return t4_wrq_tx(sc, &sc->sge.mgmtq, m); -} - /* * Doesn't fail. Holds on to work requests it can't send right away. 
*/ -int -t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) +void +t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { struct sge_eq *eq = &wrq->eq; int can_reclaim; caddr_t dst; - struct mbuf *wr, *next; TXQ_LOCK_ASSERT_OWNED(wrq); +#ifdef TCP_OFFLOAD KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD || (eq->flags & EQ_TYPEMASK) == EQ_CTRL, ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); +#else + KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL, + ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); +#endif - if (__predict_true(m0 != NULL)) { - if (wrq->head) - wrq->tail->m_nextpkt = m0; - else - wrq->head = m0; - while (m0->m_nextpkt) - m0 = m0->m_nextpkt; - wrq->tail = m0; - } + if (__predict_true(wr != NULL)) + STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); can_reclaim = reclaimable(eq); if (__predict_false(eq->flags & EQ_STALLED)) { if (can_reclaim < tx_resume_threshold(eq)) - return (0); + return; eq->flags &= ~EQ_STALLED; eq->unstalled++; } @@ -1120,39 +1113,34 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) if (__predict_false(eq->cidx >= eq->cap)) eq->cidx -= eq->cap; - for (wr = wrq->head; wr; wr = next) { + while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { int ndesc; - struct mbuf *m; - next = wr->m_nextpkt; - wr->m_nextpkt = NULL; + if (__predict_false(wr->wr_len < 0 || + wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) { - M_ASSERTPKTHDR(wr); - KASSERT(wr->m_pkthdr.len > 0 && (wr->m_pkthdr.len & 0x7) == 0, - ("%s: work request len %d.", __func__, wr->m_pkthdr.len)); - - if (wr->m_pkthdr.len > SGE_MAX_WR_LEN) { #ifdef INVARIANTS - panic("%s: oversized work request", __func__); -#else - log(LOG_ERR, "%s: %s work request too long (%d)", - device_get_nameunit(sc->dev), __func__, - wr->m_pkthdr.len); - m_freem(wr); - continue; + panic("%s: work request with length %d", __func__, + wr->wr_len); #endif +#ifdef KDB + kdb_backtrace(); +#endif + log(LOG_ERR, "%s: %s work request 
with length %d", + device_get_nameunit(sc->dev), __func__, wr->wr_len); + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); + continue; } - ndesc = howmany(wr->m_pkthdr.len, EQ_ESIZE); + ndesc = howmany(wr->wr_len, EQ_ESIZE); if (eq->avail < ndesc) { - wr->m_nextpkt = next; wrq->no_desc++; break; } dst = (void *)&eq->desc[eq->pidx]; - for (m = wr; m; m = m->m_next) - copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); + copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len); eq->pidx += ndesc; eq->avail -= ndesc; @@ -1164,7 +1152,8 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) ring_eq_db(sc, eq); wrq->tx_wrs++; - m_freem(wr); + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); if (eq->avail < 8) { can_reclaim = reclaimable(eq); @@ -1178,20 +1167,11 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) if (eq->pending) ring_eq_db(sc, eq); - if (wr == NULL) - wrq->head = wrq->tail = NULL; - else { - wrq->head = wr; - - KASSERT(wrq->tail->m_nextpkt == NULL, - ("%s: wrq->tail grew a tail of its own", __func__)); - + if (wr != NULL) { eq->flags |= EQ_STALLED; if (callout_pending(&eq->tx_callout) == 0) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); } - - return (0); } /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */ @@ -1792,6 +1772,7 @@ alloc_mgmtq(struct adapter *sc) static int free_mgmtq(struct adapter *sc) { + return free_wrq(sc, &sc->sge.mgmtq); } @@ -1885,7 +1866,7 @@ free_rxq(struct port_info *pi, struct sge_rxq *rxq) return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) @@ -2031,7 +2012,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { @@ -2103,7 +2084,7 
@@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) rc = eth_eq_alloc(sc, pi, eq); break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, pi, eq); break; @@ -2141,7 +2122,7 @@ free_eq(struct adapter *sc, struct sge_eq *eq) eq->cntxt_id); break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); @@ -2183,6 +2164,7 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, return (rc); wrq->adapter = sc; + STAILQ_INIT(&wrq->wr_list); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); @@ -3179,7 +3161,7 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { - if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) { + if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) { bcopy(from, *to, len); (*to) += len; } else { diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c new file mode 100644 index 0000000..bc59171 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/module.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#define TCPSTATES +#include <netinet/tcp_fsm.h> +#include <netinet/toecore.h> + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +/* atid services */ +static int alloc_atid(struct adapter *, void *); +static void *lookup_atid(struct adapter *, int); +static void free_atid(struct adapter *, int); + +static int +alloc_atid(struct adapter *sc, void *ctx) +{ + struct tid_info *t = &sc->tids; + int atid = -1; + + mtx_lock(&t->atid_lock); + if (t->afree) { + union aopen_entry *p = t->afree; + + atid = p - t->atid_tab; + t->afree = p->next; + p->data = ctx; 
+ t->atids_in_use++; + } + mtx_unlock(&t->atid_lock); + return (atid); +} + +static void * +lookup_atid(struct adapter *sc, int atid) +{ + struct tid_info *t = &sc->tids; + + return (t->atid_tab[atid].data); +} + +static void +free_atid(struct adapter *sc, int atid) +{ + struct tid_info *t = &sc->tids; + union aopen_entry *p = &t->atid_tab[atid]; + + mtx_lock(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + mtx_unlock(&t->atid_lock); +} + +/* + * Active open failed. + */ +static int +do_act_establish(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_act_establish *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + unsigned int atid = G_TID_TID(ntohl(cpl->tos_atid)); + struct toepcb *toep = lookup_atid(sc, atid); + struct inpcb *inp = toep->inp; + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); + free_atid(sc, atid); + + INP_WLOCK(inp); + toep->tid = tid; + insert_tid(sc, tid, toep); + if (inp->inp_flags & INP_DROPPED) { + + /* socket closed by the kernel before hw told us it connected */ + + send_flowc_wr(toep, NULL); + send_reset(sc, toep, be32toh(cpl->snd_isn)); + goto done; + } + + make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); +done: + INP_WUNLOCK(inp); + return (0); +} + +static inline int +act_open_has_tid(unsigned int status) +{ + + return (status != CPL_ERR_TCAM_FULL && + status != CPL_ERR_TCAM_PARITY && + status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS); +} + +/* + * Convert an ACT_OPEN_RPL status to an errno. 
+ */ +static inline int +act_open_rpl_status_to_errno(int status) +{ + + switch (status) { + case CPL_ERR_CONN_RESET: + return (ECONNREFUSED); + case CPL_ERR_ARP_MISS: + return (EHOSTUNREACH); + case CPL_ERR_CONN_TIMEDOUT: + return (ETIMEDOUT); + case CPL_ERR_TCAM_FULL: + return (ENOMEM); + case CPL_ERR_CONN_EXIST: + log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return (EADDRINUSE); + default: + return (EIO); + } +} + +static int +do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); + unsigned int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); + unsigned int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); + struct toepcb *toep = lookup_atid(sc, atid); + struct inpcb *inp = toep->inp; + struct tcpcb *tp = intotcpcb(inp); + struct toedev *tod = &toep->td->tod; + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); + + /* Ignore negative advice */ + if (status == CPL_ERR_RTX_NEG_ADVICE) + return (0); + + free_atid(sc, atid); + toep->tid = -1; + + if (status && act_open_has_tid(status)) + release_tid(sc, GET_TID(cpl), toep->ctrlq); + + if (status == CPL_ERR_TCAM_FULL) { + INP_WLOCK(inp); + toe_connect_failed(tod, tp, EAGAIN); + final_cpl_received(toep); /* unlocks inp */ + } else { + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(status)); + final_cpl_received(toep); /* unlocks inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); + } + + return (0); +} + +/* + * Options2 for active open. 
+ */ +static uint32_t +calc_opt2a(struct socket *so) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct port_info *pi = toep->port; + struct adapter *sc = pi->adapter; + uint32_t opt2 = 0; + + if (tp->t_flags & TF_SACK_PERMIT) + opt2 |= F_SACK_EN; + + if (tp->t_flags & TF_REQ_TSTMP) + opt2 |= F_TSTAMPS_EN; + + if (tp->t_flags & TF_REQ_SCALE) + opt2 |= F_WND_SCALE_EN; + + if (V_tcp_do_ecn) + opt2 |= F_CCTRL_ECN; + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); + opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); + + return (htobe32(opt2)); +} + + +void +t4_init_connect_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); + t4_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); +} + +/* + * active open (soconnect). + * + * State of affairs on entry: + * soisconnecting (so_state |= SS_ISCONNECTING) + * tcbinfo not locked (This has changed - used to be WLOCKed) + * inp WLOCKed + * tp->t_state = TCPS_SYN_SENT + * rtalloc1, RT_UNLOCK on rt. 
+ */ +int +t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, + struct sockaddr *nam) +{ + struct adapter *sc = tod->tod_softc; + struct toepcb *toep = NULL; + struct wrqe *wr = NULL; + struct cpl_act_open_req *cpl; + struct l2t_entry *e = NULL; + struct ifnet *rt_ifp = rt->rt_ifp; + struct port_info *pi; + int atid = -1, mtu_idx, rscale, qid_atid, rc = ENOMEM; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + + INP_WLOCK_ASSERT(inp); + + if (nam->sa_family != AF_INET) + CXGBE_UNIMPLEMENTED("IPv6 connect"); + + if (rt_ifp->if_type == IFT_ETHER) + pi = rt_ifp->if_softc; + else if (rt_ifp->if_type == IFT_L2VLAN) { + struct ifnet *ifp = VLAN_COOKIE(rt_ifp); + + pi = ifp->if_softc; + } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) + return (ENOSYS); /* XXX: implement lagg support */ + else + return (ENOTSUP); + + toep = alloc_toepcb(pi, -1, -1, M_NOWAIT); + if (toep == NULL) + goto failed; + + atid = alloc_atid(sc, toep); + if (atid < 0) + goto failed; + + e = t4_l2t_get(pi, rt_ifp, + rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam); + if (e == NULL) + goto failed; + + wr = alloc_wrqe(sizeof(*cpl), toep->ctrlq); + if (wr == NULL) + goto failed; + cpl = wrtod(wr); + + toep->tid = atid; + toep->l2te = e; + toep->ulp_mode = ULP_MODE_NONE; + SOCKBUF_LOCK(&so->so_rcv); + /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ + toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); + SOCKBUF_UNLOCK(&so->so_rcv); + + offload_socket(so, toep); + + /* + * The kernel sets request_r_scale based on sb_max whereas we need to + * take hardware's MAX_RCV_WND into account too. This is normally a + * no-op as MAX_RCV_WND is much larger than the default sb_max. 
+ */ + if (tp->t_flags & TF_REQ_SCALE) + rscale = tp->request_r_scale = select_rcv_wscale(); + else + rscale = 0; + mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); + qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | atid; + + INIT_TP_WR(cpl, 0); + OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); + inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, + &cpl->peer_port); + cpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, toep->rx_credits, + toep->ulp_mode); + cpl->params = select_ntuple(pi, e, sc->filter_mode); + cpl->opt2 = calc_opt2a(so); + + CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, + toep->tid, tcpstates[tp->t_state], toep, inp); + + rc = t4_l2t_send(sc, wr, e); + if (rc == 0) { + toepcb_set_flag(toep, TPF_CPL_PENDING); + return (0); + } + + undo_offload_socket(so); +failed: + CTR5(KTR_CXGBE, "%s: FAILED, atid %d, toep %p, l2te %p, wr %p", + __func__, atid, toep, e, wr); + + if (e) + t4_l2t_release(e); + if (wr) + free_wrqe(wr); + if (atid >= 0) + free_atid(sc, atid); + if (toep) + free_toepcb(toep); + + return (rc); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c new file mode 100644 index 0000000..161fc12 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -0,0 +1,1276 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/module.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sglist.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#define TCPSTATES +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/toecore.h> + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +VNET_DECLARE(int, tcp_do_autosndbuf); +#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) +VNET_DECLARE(int, tcp_autosndbuf_inc); +#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) +VNET_DECLARE(int, tcp_autosndbuf_max); +#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) +VNET_DECLARE(int, tcp_do_autorcvbuf); +#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) +VNET_DECLARE(int, tcp_autorcvbuf_inc); +#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) +VNET_DECLARE(int, 
tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

/*
 * Send a FLOWC work request to program the connection's flow context in the
 * hardware.  This is the first WR on a new tid (TPF_FLOWC_WR_SENT guards
 * against a repeat).  With ftxp the WR carries 8 parameters (adds snd_nxt,
 * rcv_nxt, send buffer size, MSS); without it only the 4 basic ones (pf/vf,
 * channel, port, rx queue id).  Consumes tx credits and one txsd slot.
 */
void
send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams = ftxp ? 8 : 4, flowclen;
	struct port_info *pi = toep->port;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
	if (ftxp) {
		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);

		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
		flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
		flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
		flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
		flowc->mnemval[6].val = htobe32(sndbuf);
		flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
		flowc->mnemval[7].val = htobe32(ftxp->mss);
	}

	/* Account for the credits and the txsd slot this WR consumes. */
	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toepcb_set_flag(toep, TPF_FLOWC_WR_SENT);
	t4_wrq_tx(sc, wr);
}

/*
 * Abort the connection with a CPL_ABORT_REQ (hardware sends an RST).  No-op
 * if an abort is already in progress; otherwise TPF_ABORT_SHUTDOWN is set
 * here.  snd_nxt is used only when the inp has been dropped (the tcpcb can no
 * longer be trusted); otherwise tp->snd_nxt is used.
 */
void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toepcb_flag(toep, TPF_ABORT_SHUTDOWN) ?
	    " (abort already in progress)" : "");

	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
		return;	/* abort already in progress */

	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);

	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !toepcb_flag(toep, TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(tp->t_inpcb);

	/* 40: presumably fixed IPv4 (20) + TCP (20) headers — TODO confirm */
	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40;

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from after the exchange of SYNs.  i.e., the true ISN + 1.
 */
void
make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
    uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	long bufsize;
	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(opt);
	struct flowc_tx_params ftxp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p",
	    __func__, toep->tid, toep, inp);

	tp->t_state = TCPS_ESTABLISHED;
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	/* Initialize the receive side from the true IRS. */
	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = toep->rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->rx_credits = bufsize - tp->rcv_wnd;

	/* Initialize the send side from the true ISS. */
	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);

	/* Pick the send buffer size advertised to the hardware. */
	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
		bufsize = V_tcp_autosndbuf_max;
	else
		bufsize = sbspace(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);

	ftxp.snd_nxt = tp->snd_nxt;
	ftxp.rcv_nxt = tp->rcv_nxt;
	ftxp.snd_space = bufsize;
	ftxp.mss = tp->t_maxseg;
	send_flowc_wr(toep, &ftxp);

	soisconnected(so);
}

/*
 * Return rx credits to the hardware with a CPL_RX_DATA_ACK.  Returns the
 * number of credits actually sent, or 0 if the work request could not be
 * allocated.
 */
static int
send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

/*
 * Called by the kernel when the application has read data out of the socket's
 * receive buffer.  Space freed in the buffer is converted to rx credits and
 * returned to the hardware once enough has accumulated.
 */
void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *so_rcv = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int must_send;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(so_rcv);
	KASSERT(toep->enqueued >= so_rcv->sb_cc,
	    ("%s: so_rcv->sb_cc > enqueued", __func__));
	toep->rx_credits += toep->enqueued - so_rcv->sb_cc;
	toep->enqueued = so_rcv->sb_cc;
	SOCKBUF_UNLOCK(so_rcv);

	/* Send an update if the window is nearly closed or enough is pending. */
	must_send = toep->rx_credits + 16384 >= tp->rcv_wnd;
	if (must_send || toep->rx_credits >= 15 * 1024) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->rx_credits);
		toep->rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
static int
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toepcb_flag(toep, TPF_FIN_SENT) ? ", IGNORED" : "");

	/* Only one FIN per connection. */
	if (toepcb_flag(toep, TPF_FIN_SENT))
		return (0);

	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toepcb_set_flag(toep, TPF_FIN_SENT);
	toepcb_clr_flag(toep, TPF_SEND_FIN);
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits)
{
	const int n = 2;	/* Use only up to 2 desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	/* Each additional 24B ulptx_sge_pair holds 2 more segments. */
	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

/*
 * Fill in the header of an offload tx data work request.  immdlen is the
 * number of immediate payload bytes that follow the header (0 for DSGL tx),
 * plen the total payload length, and credits the cost of the WR in 16B units.
 */
static inline void
write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
    unsigned int plen, uint8_t credits, int more_to_come)
{
	struct fw_ofld_tx_data_wr *txwr = dst;
	int shove = !more_to_come;
	int compl = 1;

	/*
	 * We always request completion notifications from the firmware.  The
	 * only exception is when we know we'll get more data to send shortly
	 * and that we'll have some tx credits remaining to transmit that data.
	 */
	if (more_to_come && toep->tx_credits - credits >= MIN_OFLD_TX_CREDITS)
		compl = 0;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
	    V_FW_WR_COMPL(compl) | V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->tunnel_to_proxy =
	    htobe32(V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode) |
		V_FW_OFLD_TX_DATA_WR_URGENT(0) |	/* XXX */
		V_FW_OFLD_TX_DATA_WR_SHOVE(shove));
	txwr->plen = htobe32(plen);
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	/*
	 * i indexes the flattened SGL: the first entry (i == -1 initially)
	 * lives in the ulptx_sgl itself, the rest in the trailing sge[] pairs.
	 */
	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);	/* zero the odd tail */
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have.  This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 */
static void
t4_push_frames(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	unsigned int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	INP_WLOCK_ASSERT(inp);
	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	if (toep->ulp_mode != ULP_MODE_NONE)
		CXGBE_UNIMPLEMENTED("ulp_mode");

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toepcb_flag(toep, TPF_TX_SUSPENDED)))
		return;

	/* Each loop iteration builds and sends one work request. */
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toepcb_set_flag(toep, TPF_TX_SUSPENDED);
					SOCKBUF_UNLOCK(sb);
					return;
				}
				break;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m; /* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		/* Grow the send buffer if autosizing is on and it is filling. */
		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbspace(sb) < sb->sb_hiwat / 8 * 7) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else {
				sowwakeup_locked(so);	/* room available */
				SOCKBUF_UNLOCK_ASSERT(sb);
				goto unlocked;
			}
		}
		SOCKBUF_UNLOCK(sb);
unlocked:

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL,
			    ("%s: nothing to send, but m != NULL", __func__));
			break;
		}

		if (__predict_false(toepcb_flag(toep, TPF_FIN_SENT)))
			panic("%s: excess tx.", __func__);

		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup(sizeof(*txwr) + plen, 16),
			    toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, plen, credits,
			    tp->t_flags & TF_MORETOCOME);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, plen, credits,
			    tp->t_flags & TF_MORETOCOME);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		toep->tx_credits -= credits;

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toepcb_set_flag(toep, TPF_TX_DATA_SENT);

		/* Record the WR in the tx descriptor ring for completions. */
		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toepcb_flag(toep, TPF_SEND_FIN))
		close_conn(sc, toep);
}

/*
 * TOE callback: transmit whatever is pending in the socket's send buffer.
 */
int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_frames(sc, toep);

	return (0);
}

/*
 * TOE callback: request a FIN.  The FIN itself goes out only after all
 * pending data has been transmitted (see t4_push_frames).
 */
int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toepcb_set_flag(toep, TPF_SEND_FIN);
	t4_push_frames(sc, toep);

	return (0);
}

/*
 * TOE callback: abort the connection (RST to the peer).
 */
int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_INFO_WLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);

	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
		goto done;

	so = inp->inp_socket;

	/* Account for the FIN and run the TCP state machine. */
	socantrcvmore(so);
	tp->rcv_nxt++;	/* FIN */
	KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
	    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
	    be32toh(cpl->rcv_nxt)));

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;

	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;

	case TCPS_FIN_WAIT_2:
		/* tcp_twstart() returns with the inp unlocked. */
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_WUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	return (0);
}

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_INFO_WLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		/* tcp_twstart() returns with the inp unlocked. */
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
		INP_INFO_WUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tp->t_state = TCPS_FIN_WAIT_2;
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	return (0);
}

/*
 * Acknowledge an abort request from the hardware with a CPL_ABORT_RPL.
 */
void
send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

/*
 * Map a hardware abort reason to the errno reported to the application.
 */
static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_wrq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toepcb_flag(toep, TPF_SYNQE))
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
	    cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	INP_INFO_WLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);
	so = inp->inp_socket;

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tcpstates[tp->t_state], toep->flags, inp->inp_flags,
	    cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);

	so_error_set(so, abort_status_to_errno(tp, cpl->status));
	tp = tcp_close(tp);
	if (tp == NULL)
		INP_WLOCK(inp);	/* re-acquire */

	final_cpl_received(toep);
done:
	INP_INFO_WUNLOCK(&V_tcbinfo);
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toepcb_flag(toep, TPF_SYNQE))
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	/* The abort we initiated has been acknowledged; finish teardown. */
	KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

/*
 * CPL_RX_DATA: new payload delivered by the hardware for an offloaded
 * connection.
 */
static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
struct socket *so; + struct sockbuf *so_rcv; + + if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { + /* + * do_pass_establish failed and must be attempting to abort the + * synqe's tid. Meanwhile, the T4 has sent us data for such a + * connection. + */ + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: synqe and tid isn't being aborted.", __func__)); + m_freem(m); + return (0); + } + + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + /* strip off CPL header */ + m_adj(m, sizeof(*cpl)); + + INP_WLOCK(inp); + if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { + CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", + __func__, tid, m->m_pkthdr.len, inp->inp_flags); + INP_WUNLOCK(inp); + m_freem(m); + return (0); + } + + tp = intotcpcb(inp); + +#ifdef INVARIANTS + if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", + __func__, be32toh(cpl->seq), toep->tid, tp->rcv_nxt); + } +#endif + + tp->rcv_nxt += m->m_pkthdr.len; + KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, + ("%s: negative window size", __func__)); + tp->rcv_wnd -= m->m_pkthdr.len; + tp->t_rcvtime = ticks; + + so = inp_inpcbtosocket(inp); + so_rcv = &so->so_rcv; + SOCKBUF_LOCK(so_rcv); + + if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { + CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", + __func__, tid, m->m_pkthdr.len); + m_freem(m); + SOCKBUF_UNLOCK(so_rcv); + INP_WUNLOCK(inp); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = tcp_drop(tp, ECONNRESET); + if (tp) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + return (0); + } + + /* receive buffer autosize */ + if (so_rcv->sb_flags & SB_AUTOSIZE && + V_tcp_do_autorcvbuf && + so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && + m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) { + unsigned int hiwat = so_rcv->sb_hiwat; + unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); + + if (!sbreserve_locked(so_rcv, newsize, 
so, NULL)) + so_rcv->sb_flags &= ~SB_AUTOSIZE; + else + toep->rx_credits += newsize - hiwat; + } + toep->enqueued += m->m_pkthdr.len; + sbappendstream_locked(so_rcv, m); + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(so_rcv); + + INP_WUNLOCK(inp); + return (0); +} + +#define S_CPL_FW4_ACK_OPCODE 24 +#define M_CPL_FW4_ACK_OPCODE 0xff +#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE) +#define G_CPL_FW4_ACK_OPCODE(x) \ + (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE) + +#define S_CPL_FW4_ACK_FLOWID 0 +#define M_CPL_FW4_ACK_FLOWID 0xffffff +#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) +#define G_CPL_FW4_ACK_FLOWID(x) \ + (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) + +#define S_CPL_FW4_ACK_CR 24 +#define M_CPL_FW4_ACK_CR 0xff +#define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR) +#define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR) + +#define S_CPL_FW4_ACK_SEQVAL 0 +#define M_CPL_FW4_ACK_SEQVAL 0x1 +#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL) +#define G_CPL_FW4_ACK_SEQVAL(x) \ + (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL) +#define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U) + +static int +do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); + unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp; + struct tcpcb *tp; + struct socket *so; + uint8_t credits = cpl->credits; + struct ofld_tx_sdesc *txsd; + int plen; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + /* + * Very unusual case: we'd sent a flowc + abort_req for a synq entry and + * now this comes back carrying the credits for the flowc. 
+ */ + if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: credits for a synq entry %p", __func__, toep)); + return (0); + } + + inp = toep->inp; + + KASSERT(opcode == CPL_FW4_ACK, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + INP_WLOCK(inp); + + if (__predict_false(toepcb_flag(toep, TPF_ABORT_SHUTDOWN))) { + INP_WUNLOCK(inp); + return (0); + } + + KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, + ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); + + tp = intotcpcb(inp); + + if (cpl->seq_vld) { + tcp_seq snd_una = be32toh(cpl->snd_una); + +#ifdef INVARIANTS + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, snd_una %x\n", + __func__, snd_una, toep->tid, tp->snd_una); + } +#endif + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->ts_recent_age = tcp_ts_getticks(); + } + } + + so = inp->inp_socket; + txsd = &toep->txsd[toep->txsd_cidx]; + plen = 0; + while (credits) { + KASSERT(credits >= txsd->tx_credits, + ("%s: too many (or partial) credits", __func__)); + credits -= txsd->tx_credits; + toep->tx_credits += txsd->tx_credits; + plen += txsd->plen; + txsd++; + toep->txsd_avail++; + KASSERT(toep->txsd_avail <= toep->txsd_total, + ("%s: txsd avail > total", __func__)); + if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { + txsd = &toep->txsd[0]; + toep->txsd_cidx = 0; + } + } + + if (plen > 0) { + struct sockbuf *sb = &so->so_snd; + + SOCKBUF_LOCK(sb); + sbdrop_locked(sb, plen); + sowwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + } + + /* XXX */ + if ((toepcb_flag(toep, TPF_TX_SUSPENDED) && + toep->tx_credits >= MIN_OFLD_TX_CREDITS) || + toep->tx_credits == toep->txsd_total * + howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) { + toepcb_clr_flag(toep, 
TPF_TX_SUSPENDED); + t4_push_frames(sc, toep); + } + INP_WUNLOCK(inp); + + return (0); +} + +void +t4_init_cpl_io_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); + t4_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); + t4_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); + t4_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); + t4_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); + t4_register_cpl_handler(sc, CPL_FW4_ACK, do_fw4_ack); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c new file mode 100644 index 0000000..895e57a --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -0,0 +1,1362 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/module.h> +#include <sys/protosw.h> +#include <sys/refcount.h> +#include <sys/domain.h> +#include <sys/fnv_hash.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#define TCPSTATES +#include <netinet/tcp_fsm.h> +#include <netinet/toecore.h> + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +/* stid services */ +static int alloc_stid(struct adapter *, void *); +static void *lookup_stid(struct adapter *, int); +static void free_stid(struct adapter *, int); + +/* lctx services */ +static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, + struct port_info *); +static int free_lctx(struct adapter *, struct listen_ctx *); +static void hold_lctx(struct listen_ctx *); +static void listen_hash_add(struct adapter *, struct listen_ctx *); +static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); +static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); +static 
struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); + +static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *); +static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); +static void send_reset_synqe(struct toedev *, struct synq_entry *); + +/* XXX: won't work for IPv6 */ +static int +alloc_stid(struct adapter *sc, void *ctx) +{ + struct tid_info *t = &sc->tids; + int stid = -1; + + mtx_lock(&t->stid_lock); + if (t->sfree) { + union serv_entry *p = t->sfree; + + stid = p - t->stid_tab; + stid += t->stid_base; + t->sfree = p->next; + p->data = ctx; + t->stids_in_use++; + } + mtx_unlock(&t->stid_lock); + return (stid); +} + +static void * +lookup_stid(struct adapter *sc, int stid) +{ + struct tid_info *t = &sc->tids; + + return (t->stid_tab[stid - t->stid_base].data); +} + +static void +free_stid(struct adapter *sc, int stid) +{ + struct tid_info *t = &sc->tids; + union serv_entry *p = &t->stid_tab[stid - t->stid_base]; + + mtx_lock(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + mtx_unlock(&t->stid_lock); +} + +static struct listen_ctx * +alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi) +{ + struct listen_ctx *lctx; + + INP_WLOCK_ASSERT(inp); + + lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); + if (lctx == NULL) + return (NULL); + + lctx->stid = alloc_stid(sc, lctx); + if (lctx->stid < 0) { + free(lctx, M_CXGBE); + return (NULL); + } + + lctx->ctrlq = &sc->sge.ctrlq[pi->port_id]; + lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq]; + refcount_init(&lctx->refcount, 1); + TAILQ_INIT(&lctx->synq); + + lctx->inp = inp; + in_pcbref(inp); + + return (lctx); +} + +/* Don't call this directly, use release_lctx instead */ +static int +free_lctx(struct adapter *sc, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + + INP_WLOCK_ASSERT(inp); + KASSERT(lctx->refcount == 0, + ("%s: refcount %d", __func__, lctx->refcount)); + 
KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); + + CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", + __func__, lctx->stid, lctx, lctx->inp); + + free_stid(sc, lctx->stid); + free(lctx, M_CXGBE); + + return (in_pcbrele_wlocked(inp)); +} + +static void +hold_lctx(struct listen_ctx *lctx) +{ + + refcount_acquire(&lctx->refcount); +} + +static inline uint32_t +listen_hashfn(void *key, u_long mask) +{ + + return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); +} + +/* + * Add a listen_ctx entry to the listen hash table. + */ +static void +listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(lctx->inp, td->listen_mask); + + mtx_lock(&td->lctx_hash_lock); + LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); + td->lctx_count++; + mtx_unlock(&td->lctx_hash_lock); +} + +/* + * Look for the listening socket's context entry in the hash and return it. + */ +static struct listen_ctx * +listen_hash_find(struct adapter *sc, struct inpcb *inp) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { + if (lctx->inp == inp) + break; + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Removes the listen_ctx structure for inp from the hash and returns it. 
+ */ +static struct listen_ctx * +listen_hash_del(struct adapter *sc, struct inpcb *inp) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx, *l; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { + if (lctx->inp == inp) { + LIST_REMOVE(lctx, link); + td->lctx_count--; + break; + } + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Releases a hold on the lctx. Must be called with the listening socket's inp + * locked. The inp may be freed by this function and it returns NULL to + * indicate this. + */ +static struct inpcb * +release_lctx(struct adapter *sc, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + int inp_freed = 0; + + INP_WLOCK_ASSERT(inp); + if (refcount_release(&lctx->refcount)) + inp_freed = free_lctx(sc, lctx); + + return (inp_freed ? NULL : inp); +} + +static void +send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) +{ + struct adapter *sc = tod->tod_softc; + struct mbuf *m = synqe->syn; + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct port_info *pi = ifp->if_softc; + struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; + struct wrqe *wr; + struct fw_flowc_wr *flowc; + struct cpl_abort_req *req; + int txqid, rxqid, flowclen; + struct sge_wrq *ofld_txq; + struct sge_ofld_rxq *ofld_rxq; + const int nparams = 4; + unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN; + + INP_WLOCK_ASSERT(synqe->lctx->inp); + + CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s", + __func__, synqe, synqe->tid, + synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ? 
+ " (abort already in progress)" : ""); + if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) + return; /* abort already in progress */ + synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN); + + get_qids_from_mbuf(m, &txqid, &rxqid); + ofld_txq = &sc->sge.ofld_txq[txqid]; + ofld_rxq = &sc->sge.ofld_rxq[rxqid]; + + /* The wrqe will have two WRs - a flowc followed by an abort_req */ + flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); + + wr = alloc_wrqe(roundup(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + flowc = wrtod(wr); + req = (void *)((caddr_t)flowc + roundup(flowclen, EQ_ESIZE)); + + /* First the flowc ... */ + memset(flowc, 0, wr->wr_len); + flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | + V_FW_FLOWC_WR_NPARAMS(nparams)); + flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | + V_FW_WR_FLOWID(synqe->tid)); + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = htobe32(pfvf); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = htobe32(pi->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = htobe32(pi->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); + synqe_set_flag(synqe, TPF_FLOWC_WR_SENT); + + /* ... 
then ABORT request */ + INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); + req->rsvd0 = 0; /* don't have a snd_nxt */ + req->rsvd1 = 1; /* no data sent yet */ + req->cmd = CPL_ABORT_SEND_RST; + + t4_l2t_send(sc, wr, e); +} + +static int +create_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct wrqe *wr; + struct cpl_pass_open_req *req; + struct in_conninfo *inc = &lctx->inp->inp_inc; + + wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); + if (wr == NULL) { + log(LOG_ERR, "%s: allocation failure", __func__); + return (ENOMEM); + } + req = wrtod(wr); + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); + req->local_port = inc->inc_lport; + req->peer_port = 0; + req->local_ip = inc->inc_laddr.s_addr; + req->peer_ip = 0; + req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); + req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | + F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); + + t4_wrq_tx(sc, wr); + return (0); +} + +static int +destroy_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct wrqe *wr; + struct cpl_close_listsvr_req *req; + + wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + req = wrtod(wr); + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, + lctx->stid)); + req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); + req->rsvd = htobe16(0); + + t4_wrq_tx(sc, wr); + return (0); +} + +/* + * Start a listening server by sending a passive open request to HW. + * + * Can't take adapter lock here and access to sc->flags, sc->open_device_map, + * sc->offload_map, if_capenable are all race prone. 
+ */ +int +t4_listen_start(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; + struct port_info *pi; + struct inpcb *inp = tp->t_inpcb; + struct listen_ctx *lctx; + int i; + + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_vflag & INP_IPV4) == 0) + return (0); + +#if 0 + ADAPTER_LOCK(sc); + if (IS_BUSY(sc)) { + log(LOG_ERR, "%s: listen request ignored, %s is busy", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM not initialized", __func__)); +#endif + + if ((sc->open_device_map & sc->offload_map) == 0) + goto done; /* no port that's UP with IFCAP_TOE enabled */ + + /* + * Find a running port with IFCAP_TOE4. We'll use the first such port's + * queues to send the passive open and receive the reply to it. + * + * XXX: need a way to mark a port in use by offload. if_cxgbe should + * then reject any attempt to bring down such a port (and maybe reject + * attempts to disable IFCAP_TOE on that port too?). + */ + for_each_port(sc, i) { + if (isset(&sc->open_device_map, i) && + sc->port[i]->ifp->if_capenable & IFCAP_TOE4) + break; + } + KASSERT(i < sc->params.nports, + ("%s: no running port with TOE capability enabled.", __func__)); + pi = sc->port[i]; + + if (listen_hash_find(sc, inp) != NULL) + goto done; /* already setup */ + + lctx = alloc_lctx(sc, inp, pi); + if (lctx == NULL) { + log(LOG_ERR, + "%s: listen request ignored, %s couldn't allocate lctx\n", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + listen_hash_add(sc, lctx); + + CTR5(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p", __func__, + lctx->stid, tcpstates[tp->t_state], lctx, inp); + + if (create_server(sc, lctx) != 0) { + log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__, + device_get_nameunit(sc->dev)); + (void) listen_hash_del(sc, inp); + inp = release_lctx(sc, lctx); + /* can't be freed, host stack has a reference */ + KASSERT(inp != NULL, ("%s: inp freed", __func__)); + goto done; + } + 
lctx->flags |= LCTX_RPL_PENDING; +done: +#if 0 + ADAPTER_UNLOCK(sc); +#endif + return (0); +} + +int +t4_listen_stop(struct toedev *tod, struct tcpcb *tp) +{ + struct listen_ctx *lctx; + struct adapter *sc = tod->tod_softc; + struct inpcb *inp = tp->t_inpcb; + struct synq_entry *synqe; + + INP_WLOCK_ASSERT(inp); + + lctx = listen_hash_del(sc, inp); + if (lctx == NULL) + return (ENOENT); /* no hardware listener for this inp */ + + CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, + lctx, lctx->flags); + + /* + * If the reply to the PASS_OPEN is still pending we'll wait for it to + * arrive and clean up when it does. + */ + if (lctx->flags & LCTX_RPL_PENDING) { + KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + return (EINPROGRESS); + } + + /* + * The host stack will abort all the connections on the listening + * socket's so_comp. It doesn't know about the connections on the synq + * so we need to take care of those. + */ + TAILQ_FOREACH(synqe, &lctx->synq, link) + send_reset_synqe(tod, synqe); + + destroy_server(sc, lctx); + return (0); +} + +static inline void +hold_synqe(struct synq_entry *synqe) +{ + + refcount_acquire(&synqe->refcnt); +} + +static inline void +release_synqe(struct synq_entry *synqe) +{ + + if (refcount_release(&synqe->refcnt)) { + int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE); + + m_freem(synqe->syn); + if (needfree) + free(synqe, M_CXGBE); + } +} + +void +t4_syncache_added(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + hold_synqe(synqe); +} + +void +t4_syncache_removed(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + release_synqe(synqe); +} + +/* XXX */ +extern void tcp_dooptions(struct tcpopt *, u_char *, int, int); + +int +t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; + struct wrqe *wr; + struct l2t_entry *e; + struct tcpopt to; 
+ struct ip *ip = mtod(m, struct ip *); + struct tcphdr *th = (void *)(ip + 1); + + wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr); + if (wr == NULL) + return (EALREADY); + + bzero(&to, sizeof(to)); + tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), + TO_SYN); + + /* save these for later */ + synqe->iss = be32toh(th->th_seq); + synqe->ts = to.to_tsval; + + e = &sc->l2t->l2tab[synqe->l2e_idx]; + t4_l2t_send(sc, wr, e); + + m_freem(m); /* don't need this any more */ + return (0); +} + +static int +do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); + int stid = GET_TID(cpl); + unsigned int status = cpl->status; + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_PASS_OPEN_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + INP_WLOCK(inp); + + CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", + __func__, stid, status, lctx->flags); + + lctx->flags &= ~LCTX_RPL_PENDING; + + if (status != CPL_ERR_NONE) + log(LOG_ERR, "listener with stid %u failed: %d", stid, status); + +#ifdef INVARIANTS + /* + * If the inp has been dropped (listening socket closed) then + * listen_stop must have run and taken the inp out of the hash. 
+ */ + if (inp->inp_flags & INP_DROPPED) { + KASSERT(listen_hash_del(sc, inp) == NULL, + ("%s: inp %p still in listen hash", __func__, inp)); + } +#endif + + if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { + if (release_lctx(sc, lctx) != NULL) + INP_WUNLOCK(inp); + return (status); + } + + /* + * Listening socket stopped listening earlier and now the chip tells us + * it has started the hardware listener. Stop it; the lctx will be + * released in do_close_server_rpl. + */ + if (inp->inp_flags & INP_DROPPED) { + destroy_server(sc, lctx); + INP_WUNLOCK(inp); + return (status); + } + + /* + * Failed to start hardware listener. Take inp out of the hash and + * release our reference on it. An error message has been logged + * already. + */ + if (status != CPL_ERR_NONE) { + listen_hash_del(sc, inp); + if (release_lctx(sc, lctx) != NULL) + INP_WUNLOCK(inp); + return (status); + } + + /* hardware listener open for business */ + + INP_WUNLOCK(inp); + return (status); +} + +static int +do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); + int stid = GET_TID(cpl); + unsigned int status = cpl->status; + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); + + if (status != CPL_ERR_NONE) { + log(LOG_ERR, "%s: failed (%u) to close listener for stid %u", + __func__, status, stid); + return (status); + } + + INP_WLOCK(inp); + inp = release_lctx(sc, lctx); + if (inp != NULL) + INP_WUNLOCK(inp); + + return 
(status); +} + +static void +done_with_synqe(struct adapter *sc, struct synq_entry *synqe) +{ + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc; + struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; + + INP_WLOCK_ASSERT(inp); + + TAILQ_REMOVE(&lctx->synq, synqe, link); + inp = release_lctx(sc, lctx); + if (inp) + INP_WUNLOCK(inp); + remove_tid(sc, synqe->tid); + release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]); + t4_l2t_release(e); + release_synqe(synqe); /* removed from synq list */ +} + +int +do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + int txqid; + struct sge_wrq *ofld_txq; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_REQ_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); + + if (cpl->status == CPL_ERR_RTX_NEG_ADVICE || + cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) + return (0); /* Ignore negative advice */ + + INP_WLOCK(inp); + + get_qids_from_mbuf(synqe->syn, &txqid, NULL); + ofld_txq = &sc->sge.ofld_txq[txqid]; + + /* + * If we'd initiated an abort earlier the reply to it is responsible for + * cleaning up resources. Otherwise we tear everything down right here + * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
+ */ + if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) { + INP_WUNLOCK(inp); + goto done; + } + + done_with_synqe(sc, synqe); + /* inp lock released by done_with_synqe */ +done: + send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); + return (0); +} + +int +do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_RPL_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); + + INP_WLOCK(inp); + KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN), + ("%s: wasn't expecting abort reply for synqe %p (0x%x)", + __func__, synqe, synqe->flags)); + + done_with_synqe(sc, synqe); + /* inp lock released by done_with_synqe */ + + return (0); +} + +void +t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; +#ifdef INVARIANTS + struct inpcb *inp = sotoinpcb(so); +#endif + struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); + struct toepcb *toep = *(struct toepcb **)(cpl + 1); + + INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_WLOCK_ASSERT(inp); + KASSERT(synqe_flag(synqe, TPF_SYNQE), + ("%s: %p not a synq_entry?", __func__, arg)); + + offload_socket(so, toep); + make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); + toepcb_set_flag(toep, TPF_CPL_PENDING); + update_tid(sc, 
synqe->tid, toep); +} + +static inline void +save_qids_in_mbuf(struct mbuf *m, struct port_info *pi) +{ + uint32_t txqid, rxqid; + + txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq; + rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq; + + m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); +} + +static inline void +get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid) +{ + + if (txqid) + *txqid = m->m_pkthdr.flowid >> 16; + if (rxqid) + *rxqid = m->m_pkthdr.flowid & 0xffff; +} + +/* + * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to + * store some state temporarily. + */ +static struct synq_entry * +mbuf_to_synqe(struct mbuf *m) +{ + int len = roundup(sizeof (struct synq_entry), 8); + int tspace = M_TRAILINGSPACE(m); + struct synq_entry *synqe = NULL; + + if (tspace < len) { + synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT); + if (synqe == NULL) + return (NULL); + } else + synqe = (void *)(m->m_data + m->m_len + tspace - sizeof(*synqe)); + + synqe->flags = 0; + synqe_set_flag(synqe, TPF_SYNQE); + if (tspace < len) + synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE); + + return (synqe); +} + +static void +t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) +{ + bzero(to, sizeof(*to)); + + if (t4opt->mss) { + to->to_flags |= TOF_MSS; + to->to_mss = be16toh(t4opt->mss); + } + + if (t4opt->wsf) { + to->to_flags |= TOF_SCALE; + to->to_wscale = t4opt->wsf; + } + + if (t4opt->tstamp) + to->to_flags |= TOF_TS; + + if (t4opt->sack) + to->to_flags |= TOF_SACKPERM; +} + +/* + * Options2 for passive open. 
+ */ +static uint32_t +calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, + const struct tcp_options *tcpopt, struct tcphdr *th) +{ + uint32_t opt2 = 0; + struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; + + if (V_tcp_do_rfc1323) { + if (tcpopt->tstamp) + opt2 |= F_TSTAMPS_EN; + if (tcpopt->sack) + opt2 |= F_SACK_EN; + if (tcpopt->wsf > 0) + opt2 |= F_WND_SCALE_EN; + } + + if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR)) + opt2 |= F_CCTRL_ECN; + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); + opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); + + return htobe32(opt2); +} + +/* XXX: duplication. */ +static inline void +tcp_fields_to_host(struct tcphdr *th) +{ + + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); +} + +static void +pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc, + struct tcphdr *th) +{ + const struct cpl_pass_accept_req *cpl = mtod(m, const void *); + const struct ether_header *eh; + unsigned int hlen = be32toh(cpl->hdr_len); + const struct ip *ip; + const struct tcphdr *tcp; + + eh = (const void *)(cpl + 1); + ip = (const void *)((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); + tcp = (const void *)((uintptr_t)ip + G_IP_HDR_LEN(hlen)); + + if (inc) { + bzero(inc, sizeof(*inc)); + inc->inc_faddr = ip->ip_src; + inc->inc_laddr = ip->ip_dst; + inc->inc_fport = tcp->th_sport; + inc->inc_lport = tcp->th_dport; + if (ip->ip_v == 6) + inc->inc_flags |= INC_ISIPV6; + } + + if (th) { + bcopy(tcp, th, sizeof(*th)); + tcp_fields_to_host(th); /* just like tcp_input */ + } +} + +#define REJECT_PASS_ACCEPT() do { \ + reject_reason = __LINE__; \ + goto reject; \ +} while (0) + +/* + * The context associated with a tid entry via insert_tid could be a synq_entry + * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 
 * The flags word must therefore be at the same offset in both structures.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));

/*
 * Incoming SYN on a listening socket.  Decide whether to offload it; if yes,
 * create a synq entry, add it to the kernel syncache, and send a
 * CPL_PASS_ACCEPT_RPL (the SYN|ACK).  If not, hand the SYN to the kernel's
 * normal input path via ifp->if_input.
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct ifnet *ifp, *ifp_vlan = NULL;
	struct l2t_entry *e = NULL;
	struct rtentry *rt;
	struct sockaddr_in nam;
	int rscale, mtu_idx, rx_credits, rxqid;
	struct synq_entry *synqe = NULL;
	int reject_reason;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
	ifp = pi->ifp;
	m->m_pkthdr.rcvif = ifp;
	tod = TOEDEV(ifp);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/* Don't offload IPv6 connections.  XXX: add IPv6 support */
	if (inc.inc_flags & INC_ISIPV6)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the SYN had a VLAN tag and the vid doesn't match
	 * anything on this interface.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp_vlan = VLAN_DEVAT(ifp, vid);
		if (ifp_vlan == NULL)
			REJECT_PASS_ACCEPT();
	}

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 * XXX: too restrictive.
	 */
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
	if (rt == NULL)
		REJECT_PASS_ACCEPT();
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
		    (struct sockaddr *)&nam;
		if (rt->rt_ifp == ifp ||
		    (ifp_vlan != NULL && rt->rt_ifp == ifp_vlan))
			e = t4_l2t_get(pi, rt->rt_ifp, nexthop);
		RTFREE(rt);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* Pre-allocate the reply WR so the send later on cannot fail. */
	wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, pi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
	    ULP_MODE_NONE);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th);

	/* The synqe now owns the SYN mbuf (don't touch m after this). */
	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	m = NULL;
	refcount_init(&synqe->refcnt, 1);	/* 1 so that it is held for the
						   duration of this function */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
	 * pcb locks.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 *
	 * The extra hold on the synqe makes sure that it is still around, even
	 * if the listener has been dropped and the synqe was aborted and the
	 * reply to the abort has removed and released the synqe from the synq
	 * list.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGBE,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			free(wr, M_CXGBE);
			release_synqe(synqe);	/* about to exit function */
			return (__LINE__);
		}

		/*
		 * synqe aborted before TOM replied to PASS_ACCEPT_REQ.  But
		 * that can only happen if the listener was closed and we just
		 * checked for that.
		 */
		KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		/* Yank the synqe out of the lctx synq. */
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_DONTWAIT);
		if (m)
			m->m_pkthdr.rcvif = ifp;

		release_synqe(synqe);	/* about to exit function */
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	release_synqe(synqe);	/* about to exit function */
	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);
	return (0);
reject:
	/* Not offloading: release hw resources and feed the SYN (with good
	 * checksums pre-marked -- the hw already verified them) to the normal
	 * kernel input path. */
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		ifp->if_input(ifp, m);
	}

	return (reject_reason);
}

/*
 * Reconstruct the protocol headers for the peer's final ACK of the 3-way
 * handshake from the saved SYN plus the CPL_PASS_ESTABLISH, so that the
 * result can be fed to syncache_expand.
 */
static void
synqe_to_protohdrs(struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}

/*
 * 3-way handshake for an offloaded connection completed: convert the synq
 * entry into a full toepcb and hand the connection to the kernel via
 * syncache_expand.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct port_info *pi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb
	    *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe_flag(synqe, TPF_SYNQE),
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));

		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	pi = ifp->if_softc;
	KASSERT(pi->adapter == sc,
	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/* The reply to this abort will perform final cleanup */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}

/* Register the CPL handlers for the passive-open (listen) side. */
void
t4_init_listen_cpl_handlers(struct adapter *sc)
{

	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c new file mode 100644 index
0000000..c6e9a1f --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -0,0 +1,755 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

/* Module ops */
static int t4_tom_mod_load(void);
static int t4_tom_mod_unload(void);
static int t4_tom_modevent(module_t, int, void *);

/* ULD ops and helpers */
static int t4_tom_activate(struct adapter *);
static int t4_tom_deactivate(struct adapter *);

static struct uld_info tom_uld_info = {
	.uld_id = ULD_TOM,
	.activate = t4_tom_activate,
	.deactivate = t4_tom_deactivate,
};

static void queue_tid_release(struct adapter *, int);
static void release_offload_resources(struct toepcb *);
static int alloc_tid_tabs(struct tid_info *);
static void free_tid_tabs(struct tid_info *);
static void free_tom_data(struct adapter *, struct tom_data *);

/*
 * Allocate and initialize a toepcb (the TOE's per-connection state).  A
 * negative txqid/rxqid means "pick a random offload queue on this port".
 * Returns NULL on allocation failure.  flags is passed to malloc (M_NOWAIT or
 * M_WAITOK).
 */
struct toepcb *
alloc_toepcb(struct port_info *pi, int txqid, int rxqid, int flags)
{
	struct adapter *sc = pi->adapter;
	struct toepcb *toep;
	int tx_credits, txsd_total, len;

	/*
	 * The firmware counts tx work request credits in units of 16 bytes
	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
	 * about tx credits if it wants to abort a connection.
	 */
	tx_credits = sc->params.ofldq_wr_cred;
	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);

	/*
	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
	 * immediate payload, and firmware counts tx work request credits in
	 * units of 16 byte.  Calculate the maximum work requests possible.
	 */
	txsd_total = tx_credits /
	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16);

	if (txqid < 0)
		txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
	KASSERT(txqid >= pi->first_ofld_txq &&
	    txqid < pi->first_ofld_txq + pi->nofldtxq,
	    ("%s: txqid %d for port %p (first %d, n %d)", __func__, txqid, pi,
	    pi->first_ofld_txq, pi->nofldtxq));

	if (rxqid < 0)
		rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
	KASSERT(rxqid >= pi->first_ofld_rxq &&
	    rxqid < pi->first_ofld_rxq + pi->nofldrxq,
	    ("%s: rxqid %d for port %p (first %d, n %d)", __func__, rxqid, pi,
	    pi->first_ofld_rxq, pi->nofldrxq));

	/* toepcb has a variable-length txsd array at its tail. */
	len = offsetof(struct toepcb, txsd) +
	    txsd_total * sizeof(struct ofld_tx_sdesc);

	toep = malloc(len, M_CXGBE, M_ZERO | flags);
	if (toep == NULL)
		return (NULL);

	toep->td = sc->tom_softc;
	toep->port = pi;
	toep->tx_credits = tx_credits;
	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
	toep->txsd_total = txsd_total;
	toep->txsd_avail = txsd_total;
	toep->txsd_pidx = 0;
	toep->txsd_cidx = 0;

	return (toep);
}

/* Free a toepcb; it must be fully detached (no inpcb, no pending CPLs). */
void
free_toepcb(struct toepcb *toep)
{

	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
	    ("%s: attached to an inpcb", __func__));
	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
	    ("%s: CPL pending", __func__));

	free(toep, M_CXGBE);
}

/*
 * Set up the socket for TCP offload.
 * Links the toepcb to the socket's tcpcb, takes a hold on the inpcb, and puts
 * the toepcb on the TOM's active list.  Called with the inpcb write-locked.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);

	/* Update TCP PCB */
	tp->tod = &td->tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->inp = inp;
	toepcb_set_flag(toep, TPF_ATTACHED);
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket.  Reverses everything
 * offload_socket did; used only to back out of a failed active open. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct tom_data *td = toep->td;
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->inp = NULL;
	toepcb_clr_flag(toep, TPF_ATTACHED);
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* Return the L2T entry, hw tid, and the toepcb itself to the system. */
static void
release_offload_resources(struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct adapter *sc = td_adapter(td);
	int tid = toep->tid;

	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
	    ("%s: %p has CPL pending.", __func__, toep));
	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
	    ("%s: %p is still attached.", __func__, toep));

	CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)",
	    __func__, toep, tid, toep->l2te);

	if (toep->l2te)
		t4_l2t_release(toep->l2te);

	if (tid >= 0) {
		remove_tid(sc, tid);
		release_tid(sc, tid, toep->ctrlq);
	}

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	free_toepcb(toep);
}

/*
 * The kernel is done with the TCP PCB and this is our opportunity to unhook the
 * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
 * pending CPL) then it is time to release all resources tied to the toepcb.
 *
 * Also gets called when an offloaded active open fails and the TOM wants the
 * kernel to take the TCP PCB back.
 */
static void
t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{
#if defined(KTR) || defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toepcb_flag(toep, TPF_ATTACHED),
	    ("%s: not attached", __func__));

#ifdef KTR
	if (tp->t_state == TCPS_SYN_SENT) {
		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
		    __func__, toep->tid, toep, toep->flags, inp,
		    inp->inp_flags);
	} else {
		CTR6(KTR_CXGBE,
		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
		    inp->inp_flags);
	}
#endif

	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toepcb_clr_flag(toep, TPF_ATTACHED);

	if (toepcb_flag(toep, TPF_CPL_PENDING) == 0)
		release_offload_resources(toep);
}

/*
 * The TOE driver will not receive any more CPLs for the tid associated with the
 * toepcb; release the hold on the inpcb.
 */
void
final_cpl_received(struct toepcb *toep)
{
	struct inpcb *inp = toep->inp;

	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);
	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING),
	    ("%s: CPL not pending already?", __func__));

	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);

	toep->inp = NULL;
	toepcb_clr_flag(toep, TPF_CPL_PENDING);

	/* If the kernel has already detached too, everything can go. */
	if (toepcb_flag(toep, TPF_ATTACHED) == 0)
		release_offload_resources(toep);

	if (!in_pcbrele_wlocked(inp))
		INP_WUNLOCK(inp);
}

/* Associate ctx (a synqe or toepcb) with a hardware tid. */
void
insert_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, 1);
}

/* Return the context associated with a hardware tid (may be NULL). */
void *
lookup_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	return (t->tid_tab[tid]);
}

/* Replace the context for a tid without changing the in-use count. */
void
update_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
}

/* Disassociate a tid from its context (does not notify the hardware). */
void
remove_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = NULL;
	atomic_subtract_int(&t->tids_in_use, 1);
}

/*
 * Tell the hardware (via CPL_TID_RELEASE on the given control queue) that the
 * tid is free for reuse.
 */
void
release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
{
	struct wrqe *wr;
	struct cpl_tid_release *req;

	wr = alloc_wrqe(sizeof(*req), ctrlq);
	if (wr == NULL) {
		queue_tid_release(sc, tid);	/* defer */
		return;
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);

	t4_wrq_tx(sc, wr);
}

static void
queue_tid_release(struct adapter *sc, int tid)
{

	CXGBE_UNIMPLEMENTED("deferred tid release");
}

/*
 * What mtu_idx to use, given a 4-tuple and/or an MSS cap
 */
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
{
	unsigned short *mtus = &sc->params.mtus[0];
	int i = 0, mss;

	KASSERT(inc != NULL || pmss > 0,
	    ("%s: at least one of inc/pmss must be specified", __func__));

	mss = inc ? tcp_mssopt(inc) : pmss;
	if (pmss > 0 && mss > pmss)
		mss = pmss;

	/* +40 accounts for the fixed IP + TCP headers on top of the MSS. */
	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
		++i;

	return (i);
}

/*
 * Determine the receive window size for a socket.
 */
u_long
select_rcv_wnd(struct socket *so)
{
	unsigned long wnd;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	wnd = sbspace(&so->so_rcv);
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	return min(wnd, MAX_RCV_WND);
}

/* Smallest window scale that lets sb_max (capped at MAX_RCV_WND) fit. */
int
select_rcv_wscale(void)
{
	int wscale = 0;
	unsigned long space = sb_max;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;

	return (wscale);
}

extern int always_keepalive;
#define VIID_SMACIDX(v)	(((unsigned int)(v) & 0x7f) << 1)

/*
 * socket so could be a listening socket too.  Builds the big-endian opt0 word
 * of a connection CPL: window scale, MTU index, ULP mode, initial rx credits,
 * plus Nagle/keepalive from the socket, L2T index, and port's tx channel /
 * source MAC selection when those are supplied.
 */
uint64_t
calc_opt0(struct socket *so, struct port_info *pi, struct l2t_entry *e,
    int mtu_idx, int rscale, int rx_credits, int ulp_mode)
{
	uint64_t opt0;

	KASSERT(rx_credits <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsiz too high", __func__));

	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);

	if (so != NULL) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);
		int keepalive = always_keepalive ||
		    so_options_get(so) & SO_KEEPALIVE;

		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
		opt0 |= V_KEEP_ALIVE(keepalive != 0);
	}

	if (e != NULL)
		opt0 |= V_L2T_IDX(e->idx);

	if (pi != NULL) {
		opt0 |= V_SMAC_SEL(VIID_SMACIDX(pi->viid));
		opt0 |= V_TX_CHAN(pi->tx_chan);
	}

	return htobe64(opt0);
}

/* Field layout constants for the hardware filter tuple. */
#define FILTER_SEL_WIDTH_P_FC (3 + 1)
#define FILTER_SEL_WIDTH_VIN_P_FC (6 + 7 + FILTER_SEL_WIDTH_P_FC)
#define FILTER_SEL_WIDTH_TAG_P_FC (3 + FILTER_SEL_WIDTH_VIN_P_FC)
#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC)
#define VLAN_NONE 0xfff
#define FILTER_SEL_VLAN_NONE 0xffff

uint32_t
select_ntuple(struct port_info *pi,
    struct l2t_entry *e, uint32_t filter_mode)
{
	uint16_t viid = pi->viid;
	uint32_t ntuple = 0;

	/* Encode the filter tuple per the configured hw filter mode. */
	if (filter_mode == HW_TPL_FR_MT_PR_IV_P_FC) {
		if (e->vlan == VLAN_NONE)
			ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC;
		else {
			ntuple |= e->vlan << FILTER_SEL_WIDTH_P_FC;
			ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
		}
		ntuple |= e->lport << S_PORT;
		ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
	} else if (filter_mode == HW_TPL_FR_MT_PR_OV_P_FC) {
		ntuple |= G_FW_VIID_VIN(viid) << FILTER_SEL_WIDTH_P_FC;
		ntuple |= G_FW_VIID_PFN(viid) << FILTER_SEL_WIDTH_VIN_P_FC;
		ntuple |= G_FW_VIID_VIVLD(viid) << FILTER_SEL_WIDTH_TAG_P_FC;
		ntuple |= e->lport << S_PORT;
		ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
	}

	return (htobe32(ntuple));
}

/*
 * Allocate the tid, atid, and stid tables as one contiguous M_ZERO'd chunk
 * (atid and stid tables carved out after the tid table) and thread the atid
 * and stid free lists.  Returns 0 or ENOMEM.
 */
static int
alloc_tid_tabs(struct tid_info *t)
{
	size_t size;
	unsigned int i;

	size = t->ntids * sizeof(*t->tid_tab) +
	    t->natids * sizeof(*t->atid_tab) +
	    t->nstids * sizeof(*t->stid_tab);

	t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT);
	if (t->tid_tab == NULL)
		return (ENOMEM);

	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
	t->afree = t->atid_tab;
	t->atids_in_use = 0;
	for (i = 1; i < t->natids; i++)
		t->atid_tab[i - 1].next = &t->atid_tab[i];
	t->atid_tab[t->natids - 1].next = NULL;

	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
	t->stid_tab = (union serv_entry *)&t->atid_tab[t->natids];
	t->sfree = t->stid_tab;
	t->stids_in_use = 0;
	for (i = 1; i < t->nstids; i++)
		t->stid_tab[i - 1].next = &t->stid_tab[i];
	t->stid_tab[t->nstids - 1].next = NULL;

	atomic_store_rel_int(&t->tids_in_use, 0);

	return (0);
}

/* Tear down everything alloc_tid_tabs set up; all tids must be free. */
static void
free_tid_tabs(struct tid_info *t)
{
	KASSERT(t->tids_in_use == 0,
	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
	KASSERT(t->atids_in_use == 0,
	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
	/* NOTE(review): message below says "tids" but checks stids (typo). */
	KASSERT(t->stids_in_use == 0,
	    ("%s: %d tids still in use.", __func__, t->stids_in_use));

	/* One allocation covered all three tables; see alloc_tid_tabs. */
	free(t->tid_tab, M_CXGBE);
	t->tid_tab = NULL;

	if (mtx_initialized(&t->atid_lock))
		mtx_destroy(&t->atid_lock);
	if (mtx_initialized(&t->stid_lock))
		mtx_destroy(&t->stid_lock);
}

/* Release everything t4_tom_activate allocated for this adapter. */
static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{
	KASSERT(TAILQ_EMPTY(&td->toep_list),
	    ("%s: TOE PCB list is not empty.", __func__));
	KASSERT(td->lctx_count == 0,
	    ("%s: lctx hash table is not empty.", __func__));

	t4_uninit_l2t_cpl_handlers(sc);

	if (td->listen_mask != 0)
		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);

	if (mtx_initialized(&td->lctx_hash_lock))
		mtx_destroy(&td->lctx_hash_lock);
	if (mtx_initialized(&td->toep_list_lock))
		mtx_destroy(&td->toep_list_lock);

	free_tid_tabs(&sc->tids);
	free(td, M_CXGBE);
}

/*
 * Ground control to Major TOM
 * Commencing countdown, engines on
 *
 * ULD activate: allocate the per-adapter TOM softc, tid tables, and listen
 * hash, register the CPL handlers, and publish the toedev ops on every port.
 */
static int
t4_tom_activate(struct adapter *sc)
{
	struct tom_data *td;
	struct toedev *tod;
	int i, rc;

	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */

	/* per-adapter softc for TOM */
	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
	if (td == NULL)
		return (ENOMEM);

	/* List of TOE PCBs and associated lock */
	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->toep_list);

	/* Listen context */
	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
	    &td->listen_mask, HASH_NOWAIT);

	/* TID tables */
	rc = alloc_tid_tabs(&sc->tids);
	if (rc != 0)
		goto done;

	/* CPL handlers */
	t4_init_connect_cpl_handlers(sc);
	t4_init_l2t_cpl_handlers(sc);
	t4_init_listen_cpl_handlers(sc);
	t4_init_cpl_io_handlers(sc);

	/* toedev ops */
	tod = &td->tod;
	init_toedev(tod);
	tod->tod_softc = sc;
	tod->tod_connect = t4_connect;
	tod->tod_listen_start = t4_listen_start;
	tod->tod_listen_stop =
	    t4_listen_stop;
	tod->tod_rcvd = t4_rcvd;
	tod->tod_output = t4_tod_output;
	tod->tod_send_rst = t4_send_rst;
	tod->tod_send_fin = t4_send_fin;
	tod->tod_pcb_detach = t4_pcb_detach;
	tod->tod_l2_update = t4_l2_update;
	tod->tod_syncache_added = t4_syncache_added;
	tod->tod_syncache_removed = t4_syncache_removed;
	tod->tod_syncache_respond = t4_syncache_respond;
	tod->tod_offload_socket = t4_offload_socket;

	for_each_port(sc, i)
		TOEDEV(sc->port[i]->ifp) = &td->tod;

	sc->tom_softc = td;
	sc->flags |= TOM_INIT_DONE;
	register_toedev(sc->tom_softc);

done:
	if (rc != 0)
		free_tom_data(sc, td);
	return (rc);
}

/*
 * ULD deactivate: tear down the TOM softc.  Fails with EBUSY while any port
 * still has IFCAP_TOE enabled or any connection/listener is still offloaded.
 */
static int
t4_tom_deactivate(struct adapter *sc)
{
	int rc = 0;
	struct tom_data *td = sc->tom_softc;

	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */

	if (td == NULL)
		return (0);	/* XXX. KASSERT? */

	if (sc->offload_map != 0)
		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */

	mtx_lock(&td->toep_list_lock);
	if (!TAILQ_EMPTY(&td->toep_list))
		rc = EBUSY;
	mtx_unlock(&td->toep_list_lock);

	mtx_lock(&td->lctx_hash_lock);
	if (td->lctx_count > 0)
		rc = EBUSY;
	mtx_unlock(&td->lctx_hash_lock);

	if (rc == 0) {
		unregister_toedev(sc->tom_softc);
		free_tom_data(sc, td);
		sc->tom_softc = NULL;
		sc->flags &= ~TOM_INIT_DONE;
	}

	return (rc);
}

static int
t4_tom_mod_load(void)
{
	int rc;

	rc = t4_register_uld(&tom_uld_info);
	if (rc != 0)
		t4_tom_mod_unload();

	return (rc);
}

static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
	/* Try to free resources (works only if no port has IFCAP_TOE) */
	ADAPTER_LOCK(sc);
	if (sc->flags & TOM_INIT_DONE)
		t4_deactivate_uld(sc, ULD_TOM);
	ADAPTER_UNLOCK(sc);
}

static int
t4_tom_mod_unload(void)
{
	/* Deactivate on every adapter first, then drop the ULD registration. */
	t4_iterate(tom_uninit, NULL);

	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
		return (EBUSY);

	return (0);
}
#endif	/* TCP_OFFLOAD */

/* Standard kld module event handler. */
static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = t4_tom_mod_load();
		break;

	case MOD_UNLOAD:
		rc = t4_tom_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif
	return (rc);
}

static moduledata_t t4_tom_moddata= {
	"t4_tom",
	t4_tom_modevent,
	0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h new file mode 100644 index 0000000..4e171e7 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -0,0 +1,248 @@
/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 *
 */

#ifndef __T4_TOM_H__
#define __T4_TOM_H__

#define KTR_CXGBE	KTR_SPARE3
#define LISTEN_HASH_SIZE 32

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * TOE PCB flags.  These are bit numbers, manipulated with setbit/clrbit/isset
 * via the toepcb_*_flag helpers below (not a bitmask).
 */
enum {
	TPF_ATTACHED,		/* a tcpcb refers to this toepcb */
	TPF_FLOWC_WR_SENT,	/* firmware flow context WR sent */
	TPF_TX_DATA_SENT,	/* some data sent */
	TPF_TX_SUSPENDED,	/* tx suspended for lack of resources */
	TPF_SEND_FIN,		/* send FIN after sending all pending data */
	TPF_FIN_SENT,		/* FIN has been sent */
	TPF_ABORT_SHUTDOWN,	/* connection abort is in progress */
	TPF_CPL_PENDING,	/* haven't received the last CPL */
	TPF_SYNQE,		/* synq_entry, not really a toepcb */
	TPF_SYNQE_NEEDFREE,	/* synq_entry was allocated externally */
};

/* Software descriptor for one outstanding tx work request. */
struct ofld_tx_sdesc {
	uint32_t plen;		/* payload length */
	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
};

/*
 * Offload protocol control block: per-offloaded-connection state, allocated
 * with a variable-length txsd[] array at the tail.
 */
struct toepcb {
	TAILQ_ENTRY(toepcb) link; /* toep_list */
	unsigned int flags;	/* miscellaneous flags */
	struct tom_data *td;
	struct inpcb *inp;	/* backpointer to host stack's PCB */
	struct port_info *port;	/* physical port */
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	struct sge_wrq *ctrlq;
	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
	int tid;		/* Connection identifier */
	unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */
	unsigned int enqueued;	/* # of bytes added to so_rcv (not yet read) */
	int rx_credits;		/* rx credits (in bytes) to be returned to hw */

	unsigned int ulp_mode;	/* ULP mode */

	/* Tx software descriptor */
	uint8_t txsd_total;
	uint8_t txsd_pidx;
	uint8_t txsd_cidx;
	uint8_t txsd_avail;
	struct ofld_tx_sdesc txsd[];
};

/* Parameters carried in the firmware flow-context work request. */
struct flowc_tx_params {
	uint32_t snd_nxt;
	uint32_t rcv_nxt;
	unsigned int snd_space;
	unsigned int mss;
};

/* Test a TPF_* bit in toep->flags. */
static inline int
toepcb_flag(struct toepcb *toep, int flag)
{

	return isset(&toep->flags, flag);
}

/* Set a TPF_* bit in toep->flags. */
static inline void
toepcb_set_flag(struct toepcb *toep, int flag)
{

	setbit(&toep->flags, flag);
}

/* Clear a TPF_* bit in toep->flags. */
static inline void
toepcb_clr_flag(struct toepcb *toep, int flag)
{

	clrbit(&toep->flags, flag);
}

/*
 * Compressed state for embryonic connections for a listener.  Barely fits in
 * 64B, try not to grow it further.
 */
struct synq_entry {
	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
	int flags;			/* same as toepcb's tp_flags */
	int tid;
	struct listen_ctx *lctx;	/* backpointer to listen ctx */
	struct mbuf *syn;
	uint32_t iss;
	uint32_t ts;
	volatile uintptr_t wr;
	volatile u_int refcnt;
	uint16_t l2e_idx;
	uint16_t rcv_bufsize;
};

/* Test a TPF_* bit in synqe->flags (same namespace as toepcb flags). */
static inline int
synqe_flag(struct synq_entry *synqe, int flag)
{

	return isset(&synqe->flags, flag);
}

/* Set a TPF_* bit in synqe->flags. */
static inline void
synqe_set_flag(struct synq_entry *synqe, int flag)
{

	setbit(&synqe->flags, flag);
}

/* Clear a TPF_* bit in synqe->flags. */
static inline void
synqe_clr_flag(struct synq_entry *synqe, int flag)
{

	clrbit(&synqe->flags, flag);
}

/* listen_ctx flags */
#define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */

/* Per-listening-socket offload state, kept in tom_data's listen hash. */
struct listen_ctx {
	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
	volatile int refcount;
	int stid;
	int flags;
	struct inpcb *inp;		/* listening socket's inp */
	struct sge_wrq *ctrlq;
	struct sge_ofld_rxq *ofld_rxq;
	TAILQ_HEAD(, synq_entry) synq;
};

/* Per-adapter TOM softc; tod must stay the first member (see tod_td). */
struct tom_data {
	struct toedev tod;

	/* toepcb's associated with this TOE device */
	struct mtx toep_list_lock;
	TAILQ_HEAD(, toepcb) toep_list;

	LIST_HEAD(, listen_ctx) *listen_hash;
	u_long listen_mask;
	int lctx_count;		/* # of lctx in the hash table */
	struct mtx lctx_hash_lock;
};

/* Recover the enclosing tom_data from its embedded toedev. */
static inline struct tom_data *
tod_td(struct toedev *tod)
{

	return (member2struct(tom_data, tod, tod));
}

/* The adapter that owns this tom_data. */
static inline struct adapter *
td_adapter(struct tom_data *td)
{

	return (td->tod.tod_softc);
}

/* t4_tom.c */
struct toepcb *alloc_toepcb(struct port_info *, int, int, int);
void free_toepcb(struct toepcb *);
void offload_socket(struct socket *, struct toepcb *);
void undo_offload_socket(struct socket *);
void final_cpl_received(struct toepcb *);
void insert_tid(struct adapter *, int, void *);
void *lookup_tid(struct adapter *, int);
void update_tid(struct adapter *, int, void *);
void remove_tid(struct adapter *, int);
void release_tid(struct adapter *, int, struct sge_wrq *);
int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
u_long select_rcv_wnd(struct socket *);
int select_rcv_wscale(void);
uint64_t calc_opt0(struct socket *, struct port_info *, struct l2t_entry *,
    int, int, int, int);
uint32_t select_ntuple(struct port_info *, struct l2t_entry *, uint32_t);

/* t4_connect.c */
void t4_init_connect_cpl_handlers(struct adapter *);
int t4_connect(struct toedev *, struct socket *, struct rtentry *,
    struct sockaddr *);

/* t4_listen.c */
void t4_init_listen_cpl_handlers(struct adapter *);
int t4_listen_start(struct toedev *, struct tcpcb *);
int t4_listen_stop(struct toedev *, struct tcpcb *);
void t4_syncache_added(struct toedev *, void *);
void t4_syncache_removed(struct toedev *, void *);
int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
void t4_offload_socket(struct toedev *, void *, struct socket *);

/* t4_cpl_io.c */
void t4_init_cpl_io_handlers(struct adapter *);
void send_abort_rpl(struct adapter *, struct sge_wrq *, int, int);
void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
void send_reset(struct adapter *, struct toepcb *, uint32_t);
void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
void t4_rcvd(struct toedev *, struct tcpcb *);
int t4_tod_output(struct toedev *, struct tcpcb *);
int t4_send_fin(struct toedev *, struct tcpcb *);
int t4_send_rst(struct toedev *, struct tcpcb *);

#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.c b/sys/dev/cxgbe/tom/t4_tom_l2t.c
new file mode 100644
index 0000000..ffe64c5
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c
@@ -0,0 +1,405 @@
/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/sbuf.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/ethernet.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/toecore.h>

#include "common/common.h"
#include "common/jhash.h"
#include "common/t4_msg.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

#define VLAN_NONE	0xfff

/* Convenience casts for IPv4 sockaddr access. */
#define SA(x)		((struct sockaddr *)(x))
#define SIN(x)		((struct sockaddr_in *)(x))
#define SINADDR(x)	(SIN(x)->sin_addr.s_addr)

/*
 * Take a reference on an L2T entry.  A 0 -> 1 transition removes the entry
 * from the free pool (nfree).
 */
static inline void
l2t_hold(struct l2t_data *d, struct l2t_entry *e)
{
	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)  /* 0 -> 1 transition */
		atomic_subtract_int(&d->nfree, 1);
}

/* Hash an (IPv4 address, ifindex) pair into an L2T hash bucket. */
static inline unsigned int
arp_hash(const uint32_t key, int ifindex)
{
	return jhash_2words(key, ifindex, 0) & (L2T_SIZE - 1);
}

/*
 * Add a WR to an L2T entry's queue of work requests awaiting resolution.
 * Must be called with the entry's lock held.
 */
static inline void
arpq_enqueue(struct l2t_entry *e, struct wrqe *wr)
{
	mtx_assert(&e->lock, MA_OWNED);

	STAILQ_INSERT_TAIL(&e->wr_list, wr, link);
}

/*
 * Transmit all work requests that were queued on the entry while it was
 * being resolved.  Must be called with the entry's lock held.
 */
static inline void
send_pending(struct adapter *sc, struct l2t_entry *e)
{
	struct wrqe *wr;

	mtx_assert(&e->lock, MA_OWNED);

	while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
		STAILQ_REMOVE_HEAD(&e->wr_list, link);
		t4_wrq_tx(sc, wr);
	}
}

/*
 * A queued work request can never be sent because L2 resolution failed.
 * NOTE(review): the wr is logged and deliberately leaked (free is commented
 * out) — presumably freeing here is unsafe or unresolved; confirm upstream.
 */
static void
resolution_failed_for_wr(struct wrqe *wr)
{
	log(LOG_ERR, "%s: leaked work request %p, wr_len %d", __func__, wr,
	    wr->wr_len);

	/* free(wr, M_CXGBE); */
}

/*
 * Drain the entry's pending-WR queue after a failed resolution.  Must be
 * called with the entry's lock held.
 */
static void
resolution_failed(struct l2t_entry *e)
{
	struct wrqe *wr;

	mtx_assert(&e->lock, MA_OWNED);

	while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
		STAILQ_REMOVE_HEAD(&e->wr_list, link);
		resolution_failed_for_wr(wr);
	}
}

/*
 * Apply the outcome of an L2 resolution attempt to the entry: lladdr == NULL
 * means resolution failed (or the neighbor was deleted/expired); otherwise
 * the dmac/vlan are programmed into the hardware L2 table if they changed.
 * Must be called with the entry's lock held.
 */
static void
update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr,
    uint16_t vtag)
{

	mtx_assert(&e->lock, MA_OWNED);

	/*
	 * The entry may be in active use (e->refcount > 0) or not.  We update
	 * it even when it's not as this simplifies the case where we decide to
	 * reuse the entry later.
	 */

	if (lladdr == NULL &&
	    (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) {
		/*
		 * Never got a valid L2 address for this one.  Just mark it as
		 * failed instead of removing it from the hash (for which we'd
		 * need to wlock the table).
		 */
		e->state = L2T_STATE_FAILED;
		resolution_failed(e);
		return;

	} else if (lladdr == NULL) {

		/* Valid or already-stale entry was deleted (or expired) */

		KASSERT(e->state == L2T_STATE_VALID ||
		    e->state == L2T_STATE_STALE,
		    ("%s: lladdr NULL, state %d", __func__, e->state));

		e->state = L2T_STATE_STALE;

	} else {

		if (e->state == L2T_STATE_RESOLVING ||
		    e->state == L2T_STATE_FAILED ||
		    memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) {

			/* unresolved -> resolved; or dmac changed */

			memcpy(e->dmac, lladdr, ETHER_ADDR_LEN);
			e->vlan = vtag;
			t4_write_l2e(sc, e, 1);
		}
		e->state = L2T_STATE_VALID;
	}
}

/*
 * Ask the TOE core to resolve the entry's IPv4 address.  Returns EWOULDBLOCK
 * if resolution is in progress (an update will arrive via t4_l2_update);
 * otherwise updates the entry with the result before returning.
 */
static int
resolve_entry(struct adapter *sc, struct l2t_entry *e)
{
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	struct sockaddr_in sin = {0};
	uint8_t dmac[ETHER_ADDR_LEN];
	uint16_t vtag = VLAN_NONE;
	int rc;

	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	SINADDR(&sin) = e->addr;

	rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag);
	if (rc == EWOULDBLOCK)
		return (rc);

	mtx_lock(&e->lock);
	update_entry(sc, e, rc == 0 ? dmac : NULL, vtag);
	mtx_unlock(&e->lock);

	return (rc);
}

/*
 * Slow path of t4_l2t_send: the entry is not (yet) valid.  Revalidates a
 * stale entry, queues the WR on a resolving entry, or fails the WR if the
 * destination is unreachable.
 */
int
t4_l2t_send_slow(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
{

again:
	switch (e->state) {
	case L2T_STATE_STALE:	/* entry is stale, kick off revalidation */

		if (resolve_entry(sc, e) != EWOULDBLOCK)
			goto again;	/* entry updated, re-examine state */

		/* Fall through */

	case L2T_STATE_VALID:	/* fast-path, send the packet on */

		t4_wrq_tx(sc, wr);
		return (0);

	case L2T_STATE_RESOLVING:
	case L2T_STATE_SYNC_WRITE:

		mtx_lock(&e->lock);
		if (e->state != L2T_STATE_SYNC_WRITE &&
		    e->state != L2T_STATE_RESOLVING) {
			/* state changed by the time we got here */
			mtx_unlock(&e->lock);
			goto again;
		}
		arpq_enqueue(e, wr);
		mtx_unlock(&e->lock);

		if (resolve_entry(sc, e) == EWOULDBLOCK)
			break;

		mtx_lock(&e->lock);
		if (e->state == L2T_STATE_VALID && !STAILQ_EMPTY(&e->wr_list))
			send_pending(sc, e);
		if (e->state == L2T_STATE_FAILED)
			resolution_failed(e);
		mtx_unlock(&e->lock);
		break;

	case L2T_STATE_FAILED:
		resolution_failed_for_wr(wr);
		return (EHOSTUNREACH);
	}

	return (0);
}

/*
 * Called when an L2T entry has no more users.  The entry is left in the hash
 * table since it is likely to be reused but we also bump nfree to indicate
 * that the entry can be reallocated for a different neighbor.  We also drop
 * the existing neighbor reference in case the neighbor is going away and is
 * waiting on our reference.
 *
 * Because entries can be reallocated to other neighbors once their ref count
 * drops to 0 we need to take the entry's lock to avoid races with a new
 * incarnation.
+ */ + +static int +do_l2t_write_rpl2(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + int rc; + + rc = do_l2t_write_rpl(iq, rss, m); + if (rc != 0) + return (rc); + + if (tid & F_SYNC_WR) { + struct l2t_entry *e = &sc->l2t->l2tab[idx]; + + mtx_lock(&e->lock); + if (e->state != L2T_STATE_SWITCHING) { + send_pending(sc, e); + e->state = L2T_STATE_VALID; + } + mtx_unlock(&e->lock); + } + + return (0); +} + +void +t4_init_l2t_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl2); +} + +void +t4_uninit_l2t_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl); +} + +/* + * The TOE wants an L2 table entry that it can use to reach the next hop over + * the specified port. Produce such an entry - create one if needed. + * + * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on + * top of the real cxgbe interface. 
+ */ +struct l2t_entry * +t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) +{ + struct l2t_entry *e; + struct l2t_data *d = pi->adapter->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index); + unsigned int smt_idx = pi->port_id; + + if (sa->sa_family != AF_INET) + return (NULL); /* XXX: no IPv6 support right now */ + +#ifndef VLAN_TAG + if (ifp->if_type == IFT_L2VLAN) + return (NULL); +#endif + + rw_wlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) { + l2t_hold(d, e); + goto done; + } + } + + /* Need to allocate a new entry */ + e = t4_alloc_l2e(d); + if (e) { + mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifp = ifp; + e->smt_idx = smt_idx; + e->hash = hash; + e->lport = pi->lport; + atomic_store_rel_int(&e->refcnt, 1); +#ifdef VLAN_TAG + if (ifp->if_type == IFT_L2VLAN) + VLAN_TAG(ifp, &e->vlan); + else + e->vlan = VLAN_NONE; +#endif + mtx_unlock(&e->lock); + } +done: + rw_wunlock(&d->lock); + return e; +} + +/* + * Called when the host's ARP layer makes a change to some entry that is loaded + * into the HW L2 table. + */ +void +t4_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t vtag) +{ + struct adapter *sc = tod->tod_softc; + struct l2t_entry *e; + struct l2t_data *d = sc->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index); + + KASSERT(d != NULL, ("%s: no L2 table", __func__)); + + rw_rlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp) { + mtx_lock(&e->lock); + if (atomic_load_acq_int(&e->refcnt)) + goto found; + e->state = L2T_STATE_STALE; + mtx_unlock(&e->lock); + break; + } + } + rw_runlock(&d->lock); + + /* + * This is of no interest to us. 
We've never had an offloaded + * connection to this destination, and we aren't attempting one right + * now. + */ + return; + +found: + rw_runlock(&d->lock); + + KASSERT(e->state != L2T_STATE_UNUSED, + ("%s: unused entry in the hash.", __func__)); + + update_entry(sc, e, lladdr, vtag); + mtx_unlock(&e->lock); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.h b/sys/dev/cxgbe/tom/t4_tom_l2t.h new file mode 100644 index 0000000..3d76735 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom_l2t.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * $FreeBSD$
 *
 */

#ifndef __T4_TOM_L2T_H
#define __T4_TOM_L2T_H

#include "t4_l2t.h"

int t4_l2t_send_slow(struct adapter *, struct wrqe *, struct l2t_entry *);
struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *,
    struct sockaddr *);
void t4_l2_update(struct toedev *, struct ifnet *, struct sockaddr *,
    uint8_t *, uint16_t);
void t4_init_l2t_cpl_handlers(struct adapter *);
void t4_uninit_l2t_cpl_handlers(struct adapter *);

/*
 * Send a work request using the given L2T entry.  Fast path when the entry
 * is already valid; everything else is handled in t4_l2t_send_slow.
 */
static inline int
t4_l2t_send(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
{
	if (__predict_true(e->state == L2T_STATE_VALID)) {
		t4_wrq_tx(sc, wr);
		return (0);
	} else
		return (t4_l2t_send_slow(sc, wr, e));
}

#endif /* __T4_TOM_L2T_H */