-rw-r--r-- sys/conf/NOTES | 2
-rw-r--r-- sys/dev/cxgb/common/cxgb_t3_cpl.h | 12
-rw-r--r-- sys/dev/cxgb/cxgb_adapter.h | 85
-rw-r--r-- sys/dev/cxgb/cxgb_config.h | 1
-rw-r--r-- sys/dev/cxgb/cxgb_l2t.c | 26
-rw-r--r-- sys/dev/cxgb/cxgb_l2t.h | 2
-rw-r--r-- sys/dev/cxgb/cxgb_main.c | 301
-rw-r--r-- sys/dev/cxgb/cxgb_offload.c | 72
-rw-r--r-- sys/dev/cxgb/cxgb_offload.h | 2
-rw-r--r-- sys/dev/cxgb/cxgb_osdep.h | 134
-rw-r--r-- sys/dev/cxgb/cxgb_sge.c | 1035
-rw-r--r-- sys/dev/cxgb/sys/cxgb_support.c | 6
-rw-r--r-- sys/dev/cxgb/sys/mvec.h | 3
-rw-r--r-- sys/dev/cxgb/t3cdev.h | 2
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 15
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c | 15
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_listen.c | 2
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tcp.h | 44
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c | 694
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c | 1362
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tom.c | 78
-rw-r--r-- sys/modules/cxgb/Makefile | 2
-rw-r--r-- sys/modules/cxgb/cxgb/Makefile | 6
-rw-r--r-- sys/modules/cxgb/tom/Makefile | 8
24 files changed, 3309 insertions, 600 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 4c88ca3..56c2885 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1878,7 +1878,7 @@ device xe
device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet
device bfe # Broadcom BCM440x 10/100 Ethernet
device bge # Broadcom BCM570xx Gigabit Ethernet
-#device cxgb # Chelsio T3 10 Gigabit Ethernet
+device cxgb # Chelsio T3 10 Gigabit Ethernet
device dc # DEC/Intel 21143 and various workalikes
device fxp # Intel EtherExpress PRO/100B (82557, 82558)
hint.fxp.0.prefer_iomap="0"
diff --git a/sys/dev/cxgb/common/cxgb_t3_cpl.h b/sys/dev/cxgb/common/cxgb_t3_cpl.h
index e1b4030..672823c 100644
--- a/sys/dev/cxgb/common/cxgb_t3_cpl.h
+++ b/sys/dev/cxgb/common/cxgb_t3_cpl.h
@@ -1131,6 +1131,18 @@ struct cpl_tx_pkt_lso {
__be32 lso_info;
};
+struct cpl_tx_pkt_batch_entry {
+ __be32 cntrl;
+ __be32 len;
+ __be64 addr;
+};
+
+struct cpl_tx_pkt_batch {
+ WR_HDR;
+ struct cpl_tx_pkt_batch_entry pkt_entry[7];
+};
+
+
/* cpl_tx_pkt*.cntrl fields */
#define S_TXPKT_VLAN 0
#define M_TXPKT_VLAN 0xFFFF
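
The cpl_tx_pkt_batch WR introduced above lets the driver hand the SGE up to seven tunneled packets in one work request, one bus address per entry instead of a full SGL per packet. A minimal fill-loop sketch, assuming txd, cntrl, count (<= 7), per-packet lengths len[], and a prior busdma mapping into segs[] (hypothetical names standing in for the consuming code):

    struct cpl_tx_pkt_batch *cpl = (struct cpl_tx_pkt_batch *)txd;
    int i;

    for (i = 0; i < count; i++) {
        struct cpl_tx_pkt_batch_entry *cbe = &cpl->pkt_entry[i];

        /* per-packet control word: opcode plus interface/VLAN bits */
        cbe->cntrl = htonl(V_TXPKT_OPCODE(CPL_TX_PKT) | cntrl);
        /* length with the top bit set, as in the cpl_tx_pkt path */
        cbe->len   = htonl(len[i] | 0x80000000);
        /* each packet is described by exactly one DMA segment */
        cbe->addr  = htobe64(segs[i].ds_addr);
    }

The t3_encap() changes in cxgb_sge.c below populate the WR along these lines.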
diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h
index 23db259..542668e 100644
--- a/sys/dev/cxgb/cxgb_adapter.h
+++ b/sys/dev/cxgb/cxgb_adapter.h
@@ -31,7 +31,6 @@ $FreeBSD$
***************************************************************************/
-
#ifndef _CXGB_ADAPTER_H_
#define _CXGB_ADAPTER_H_
@@ -42,6 +41,7 @@ $FreeBSD$
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
+#include <sys/condvar.h>
#include <net/ethernet.h>
#include <net/if.h>
@@ -49,6 +49,7 @@ $FreeBSD$
#include <machine/bus.h>
#include <machine/resource.h>
+
#include <sys/bus_dma.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
@@ -56,8 +57,8 @@ $FreeBSD$
#ifdef CONFIG_DEFINED
#include <cxgb_osdep.h>
#include <t3cdev.h>
-#include <sys/mbufq.h>
#include <ulp/toecore/cxgb_toedev.h>
+#include <sys/mbufq.h>
#else
#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/t3cdev.h>
@@ -128,10 +129,12 @@ struct port_info {
struct task timer_reclaim_task;
struct cdev *port_cdev;
-#define PORT_NAME_LEN 32
+#define PORT_LOCK_NAME_LEN 32
#define TASKQ_NAME_LEN 32
- char lockbuf[PORT_NAME_LEN];
+#define PORT_NAME_LEN 32
+ char lockbuf[PORT_LOCK_NAME_LEN];
char taskqbuf[TASKQ_NAME_LEN];
+ char namebuf[PORT_NAME_LEN];
};
enum { /* adapter flags */
@@ -143,19 +146,14 @@ enum { /* adapter flags */
TPS_UPTODATE = (1 << 5),
};
-
#define FL_Q_SIZE 4096
-#define JUMBO_Q_SIZE 512
+#define JUMBO_Q_SIZE 1024
#define RSPQ_Q_SIZE 1024
#define TX_ETH_Q_SIZE 1024
-
-
-/*
- * Types of Tx queues in each queue set. Order here matters, do not change.
- * XXX TOE is not implemented yet, so the extra queues are just placeholders.
- */
-enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
+enum { TXQ_ETH = 0,
+ TXQ_OFLD = 1,
+ TXQ_CTRL = 2, };
/* careful, the following are set on priv_flags and must not collide with
@@ -275,7 +273,22 @@ struct sge_txq {
bus_dmamap_t desc_map;
bus_dma_tag_t entry_tag;
struct mbuf_head sendq;
+ /*
+ * cleanq should really be an buf_ring to avoid extra
+ * mbuf touches
+ */
+ struct mbuf_head cleanq;
+ struct buf_ring txq_mr;
+ struct mbuf *immpkt;
+ uint32_t txq_drops;
+ uint32_t txq_skipped;
+ uint32_t txq_coalesced;
+ uint32_t txq_enqueued;
+ unsigned long txq_frees;
struct mtx lock;
+ struct sg_ent txq_sgl[TX_MAX_SEGS / 2 + 1];
+ bus_dma_segment_t txq_segs[TX_MAX_SEGS];
+ struct mbuf *txq_m_vec[TX_WR_COUNT_MAX];
#define TXQ_NAME_LEN 32
char lockbuf[TXQ_NAME_LEN];
};
@@ -294,6 +307,10 @@ enum {
#define SGE_PSTAT_MAX (SGE_PSTATS_LRO_X_STREAMS+1)
+#define QS_EXITING 0x1
+#define QS_RUNNING 0x2
+#define QS_BOUND 0x4
+
struct sge_qset {
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
@@ -303,6 +320,12 @@ struct sge_qset {
uint64_t port_stats[SGE_PSTAT_MAX];
struct port_info *port;
int idx; /* qset # */
+ int qs_cpuid;
+ int qs_flags;
+ struct cv qs_cv;
+ struct mtx qs_mtx;
+#define QS_NAME_LEN 32
+ char namebuf[QS_NAME_LEN];
};
struct sge {
@@ -344,7 +367,15 @@ struct adapter {
void *msix_intr_tag[SGE_QSETS];
uint8_t rxpkt_map[8]; /* maps RX_PKT interface values to port ids */
uint8_t rrss_map[SGE_QSETS]; /* revers RSS map table */
-
+ uint16_t rspq_map[RSS_TABLE_SIZE]; /* maps 7-bit cookie to qidx */
+ union {
+ uint8_t fill[SGE_QSETS];
+ uint64_t coalesce;
+ } u;
+
+#define tunq_fill u.fill
+#define tunq_coalesce u.coalesce
+
struct filter_info *filters;
/* Tasks */
@@ -474,7 +505,7 @@ t3_get_next_mcaddr(struct t3_rx_mode *rm)
uint8_t *macaddr = NULL;
if (rm->idx == 0)
- macaddr = rm->port->hw_addr;
+ macaddr = (uint8_t *)rm->port->hw_addr;
rm->idx++;
return (macaddr);
@@ -515,18 +546,21 @@ void t3_sge_stop(adapter_t *);
void t3b_intr(void *data);
void t3_intr_msi(void *data);
void t3_intr_msix(void *data);
-int t3_encap(struct port_info *, struct mbuf **, int *free);
+int t3_encap(struct sge_qset *, struct mbuf **, int);
int t3_sge_init_adapter(adapter_t *);
int t3_sge_init_port(struct port_info *);
void t3_sge_deinit_sw(adapter_t *);
+void t3_free_tx_desc(struct sge_txq *q, int n);
+void t3_free_tx_desc_all(struct sge_txq *q);
void t3_rx_eth_lro(adapter_t *adap, struct sge_rspq *rq, struct mbuf *m,
int ethpad, uint32_t rss_hash, uint32_t rss_csum, int lro);
void t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad);
void t3_lro_flush(adapter_t *adap, struct sge_qset *qs, struct lro_state *state);
-void t3_add_sysctls(adapter_t *sc);
+void t3_add_attach_sysctls(adapter_t *sc);
+void t3_add_configured_sysctls(adapter_t *sc);
int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
unsigned char *data);
void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p);
@@ -535,7 +569,7 @@ void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p);
*/
#define desc_reclaimable(q) ((int)((q)->processed - (q)->cleaned - TX_MAX_DESC))
-#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field)))
+#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field)))
static __inline struct sge_qset *
fl_to_qset(struct sge_fl *q, int qidx)
@@ -569,5 +603,20 @@ static inline int offload_running(adapter_t *adapter)
return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
}
+#ifdef IFNET_MULTIQUEUE
+int cxgb_pcpu_enqueue_packet(struct ifnet *ifp, struct mbuf *m);
+int cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *m);
+int32_t cxgb_pcpu_get_cookie(struct ifnet *ifp, struct in6_addr *lip, uint16_t lport,
+ struct in6_addr *rip, uint16_t rport, int ipv6);
+void cxgb_pcpu_shutdown_threads(struct adapter *sc);
+void cxgb_pcpu_startup_threads(struct adapter *sc);
+#endif
+
+int process_responses(adapter_t *adap, struct sge_qset *qs, int budget);
+int cxgb_tx_common(struct ifnet *ifp, struct sge_qset *qs, uint32_t txmax);
+void t3_free_qset(adapter_t *sc, struct sge_qset *q);
+int cxgb_dequeue_packet(struct ifnet *, struct sge_txq *, struct mbuf **);
+void cxgb_start(struct ifnet *ifp);
+void refill_fl_service(adapter_t *adap, struct sge_fl *fl);
#endif
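
Most of the new per-qset fields (qs_cpuid, qs_flags, qs_cv, qs_mtx) exist for the IFNET_MULTIQUEUE service threads started and stopped by cxgb_pcpu_startup_threads()/cxgb_pcpu_shutdown_threads(); the thread bodies themselves are not in this diff. A hypothetical skeleton of the lifecycle the QS_EXITING/QS_RUNNING/QS_BOUND flags imply (a sketch only, not the committed implementation):

    static void
    cxgb_qs_loop(void *arg)
    {
        struct sge_qset *qs = arg;

        mtx_lock(&qs->qs_mtx);
        qs->qs_flags |= QS_RUNNING;
        while ((qs->qs_flags & QS_EXITING) == 0) {
            if ((qs->qs_flags & QS_BOUND) == 0) {
                /* pin the thread to its queue set's CPU */
                sched_bind(curthread, qs->qs_cpuid);
                qs->qs_flags |= QS_BOUND;
            }
            mtx_unlock(&qs->qs_mtx);
            process_responses(qs->port->adapter, qs, 256 /* budget */);
            mtx_lock(&qs->qs_mtx);
            cv_timedwait(&qs->qs_cv, &qs->qs_mtx, hz / 1000);
        }
        qs->qs_flags &= ~QS_RUNNING;
        cv_broadcast(&qs->qs_cv);    /* unblock the shutdown path */
        mtx_unlock(&qs->qs_mtx);
    }

The shutdown side would set QS_EXITING under qs_mtx and cv_wait() until QS_RUNNING clears.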
diff --git a/sys/dev/cxgb/cxgb_config.h b/sys/dev/cxgb/cxgb_config.h
index a12753f..6b072c3 100644
--- a/sys/dev/cxgb/cxgb_config.h
+++ b/sys/dev/cxgb/cxgb_config.h
@@ -34,7 +34,6 @@ $FreeBSD$
#ifndef CONFIG_DEFINED
#define CONFIG_CHELSIO_T3_CORE
-#define DISABLE_MBUF_IOVEC
#endif
#endif
diff --git a/sys/dev/cxgb/cxgb_l2t.c b/sys/dev/cxgb/cxgb_l2t.c
index 0bb0695..f3e02f2 100644
--- a/sys/dev/cxgb/cxgb_l2t.c
+++ b/sys/dev/cxgb/cxgb_l2t.c
@@ -115,7 +115,7 @@ neigh_replace(struct l2t_entry *e, struct rtentry *rt)
*/
static int
setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m,
- struct l2t_entry *e)
+ struct l2t_entry *e)
{
struct cpl_l2t_write_req *req;
@@ -183,7 +183,7 @@ t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e)
again:
switch (e->state) {
case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
- arpresolve(rt->rt_ifp, rt, NULL, (struct sockaddr *)&sin, e->dmac);
+ arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac);
mtx_lock(&e->lock);
if (e->state == L2T_STATE_STALE)
e->state = L2T_STATE_VALID;
@@ -208,8 +208,8 @@ again:
* A better way would be to use a work request to retry L2T
* entries when there's no memory.
*/
- printf("doing arpresolve on 0x%x \n", e->addr);
- if (arpresolve(rt->rt_ifp, rt, NULL, (struct sockaddr *)&sin, e->dmac) == 0) {
+ printf("doing arpresolve2 on 0x%x \n", e->addr);
+ if (arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac) == 0) {
printf("mac=%x:%x:%x:%x:%x:%x\n",
e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]);
@@ -223,7 +223,7 @@ again:
m_freem(m);
mtx_unlock(&e->lock);
} else
- printf("arpresolve returned non-zero\n");
+ printf("arpresolve2 returned non-zero\n");
}
return 0;
}
@@ -245,7 +245,7 @@ t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
again:
switch (e->state) {
case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
- arpresolve(rt->rt_ifp, rt, m0, (struct sockaddr *)&sin, e->dmac);
+ arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac);
mtx_lock(&e->lock);
if (e->state == L2T_STATE_STALE) {
e->state = L2T_STATE_VALID;
@@ -262,8 +262,6 @@ again:
}
mtx_unlock(&e->lock);
- if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
- return;
/*
* Only the first packet added to the arpq should kick off
* resolution. However, because the alloc_skb below can fail,
@@ -272,7 +270,7 @@ again:
* A better way would be to use a work request to retry L2T
* entries when there's no memory.
*/
- arpresolve(rt->rt_ifp, rt, m0, (struct sockaddr *)&sin, e->dmac);
+ arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac);
}
return;
@@ -459,7 +457,8 @@ handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq)
}
void
-t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
+t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh,
+ uint8_t *enaddr, struct sockaddr *sa)
{
struct l2t_entry *e;
struct mbuf *arpq = NULL;
@@ -468,8 +467,6 @@ t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
int ifidx = neigh->rt_ifp->if_index;
int hash = arp_hash(addr, ifidx, d);
struct llinfo_arp *la;
- u_char edst[ETHER_ADDR_LEN];
-
printf("t3_l2t_update called with arp info\n");
@@ -485,10 +482,11 @@ t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
found:
printf("found 0x%08x\n", addr);
- arpresolve(neigh->rt_ifp, neigh, NULL, sa, edst);
rw_runlock(&d->lock);
- memcpy(e->dmac, edst, ETHER_ADDR_LEN);
+ memcpy(e->dmac, enaddr, ETHER_ADDR_LEN);
+ printf("mac=%x:%x:%x:%x:%x:%x\n",
+ e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]);
if (atomic_load_acq_int(&e->refcnt)) {
if (neigh != e->neigh)
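
The recurring change in this file is dropping the mbuf argument from the resolver: arpresolve2() is used as a variant of arpresolve() that resolves an address without queuing a packet on the ARP hold queue, which is all the L2T code needs since it only wants the MAC written into e->dmac. Likewise, t3_l2t_update() now receives the already-resolved link-layer address (enaddr) from the routing event and memcpy()s it into the entry instead of calling the resolver again under the table's read lock.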
diff --git a/sys/dev/cxgb/cxgb_l2t.h b/sys/dev/cxgb/cxgb_l2t.h
index 9b4effd..a5d469b 100644
--- a/sys/dev/cxgb/cxgb_l2t.h
+++ b/sys/dev/cxgb/cxgb_l2t.h
@@ -118,7 +118,7 @@ static __inline void set_arp_failure_handler(struct mbuf *m,
#define L2DATA(dev) ((dev)->l2opt)
void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e);
-void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, struct sockaddr *sa);
+void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa);
struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh,
struct ifnet *ifp, struct sockaddr *sa);
int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m,
diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c
index ef77dd5..92e5f2f 100644
--- a/sys/dev/cxgb/cxgb_main.c
+++ b/sys/dev/cxgb/cxgb_main.c
@@ -44,14 +44,15 @@ __FBSDID("$FreeBSD$");
#include <sys/ioccom.h>
#include <sys/mbuf.h>
#include <sys/linker.h>
-#include <sys/syslog.h>
#include <sys/firmware.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
+#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
+#include <sys/proc.h>
#include <net/bpf.h>
#include <net/ethernet.h>
@@ -73,23 +74,18 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
-
#ifdef CONFIG_DEFINED
#include <cxgb_include.h>
-#include <sys/mvec.h>
#else
#include <dev/cxgb/cxgb_include.h>
-#include <dev/cxgb/sys/mvec.h>
#endif
#ifdef PRIV_SUPPORTED
#include <sys/priv.h>
#endif
+#include <machine/intr_machdep.h>
+
static int cxgb_setup_msix(adapter_t *, int);
static void cxgb_teardown_msix(adapter_t *);
static void cxgb_init(void *);
@@ -97,8 +93,6 @@ static void cxgb_init_locked(struct port_info *);
static void cxgb_stop_locked(struct port_info *);
static void cxgb_set_rxmode(struct port_info *);
static int cxgb_ioctl(struct ifnet *, unsigned long, caddr_t);
-static void cxgb_start(struct ifnet *);
-static void cxgb_start_proc(void *, int ncount);
static int cxgb_media_change(struct ifnet *);
static void cxgb_media_status(struct ifnet *, struct ifmediareq *);
static int setup_sge_qsets(adapter_t *);
@@ -109,6 +103,10 @@ static void cxgb_down_locked(struct adapter *sc);
static void cxgb_tick(void *);
static void setup_rss(adapter_t *sc);
+#ifndef IFNET_MULTIQUEUE
+static void cxgb_start_proc(void *, int ncount);
+#endif
+
/* Attachment glue for the PCI controller end of the device. Each port of
* the device is attached separately, as defined later.
*/
@@ -122,11 +120,7 @@ static void cxgb_get_regs(adapter_t *sc, struct ifconf_regs *regs, uint8_t *buf)
static int cxgb_get_regs_len(void);
static int offload_open(struct port_info *pi);
static void touch_bars(device_t dev);
-
-#ifdef notyet
static int offload_close(struct t3cdev *tdev);
-#endif
-
static device_method_t cxgb_controller_methods[] = {
DEVMETHOD(device_probe, cxgb_controller_probe),
@@ -188,7 +182,6 @@ DRIVER_MODULE(cxgb, cxgbc, cxgb_port_driver, cxgb_port_devclass, 0, 0);
#define SGE_MSIX_COUNT (SGE_QSETS + 1)
-extern int collapse_mbufs;
/*
* The driver uses the best interrupt scheme available on a platform in the
* order MSI-X, MSI, legacy pin interrupts. This parameter determines which
@@ -218,11 +211,15 @@ SYSCTL_UINT(_hw_cxgb, OID_AUTO, ofld_disable, CTLFLAG_RDTUN, &ofld_disable, 0,
* The driver uses an auto-queue algorithm by default.
* To disable it and force a single queue-set per port, use singleq = 1.
*/
-static int singleq = 1;
+static int singleq = 0;
TUNABLE_INT("hw.cxgb.singleq", &singleq);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, singleq, CTLFLAG_RDTUN, &singleq, 0,
"use a single queue-set per port");
+#ifndef IFNET_MULTIQUEUE
+int cxgb_txq_buf_ring_size = 0;
+#endif
+
enum {
MAX_TXQ_ENTRIES = 16384,
MAX_CTRL_TXQ_ENTRIES = 1024,
@@ -281,10 +278,24 @@ struct cxgb_ident {
{0, 0, 0, NULL}
};
-
static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset);
-static inline char
+static __inline void
+check_pkt_coalesce(struct sge_qset *qs)
+{
+ struct adapter *sc;
+ struct sge_txq *txq;
+
+ txq = &qs->txq[TXQ_ETH];
+ sc = qs->port->adapter;
+
+ if (sc->tunq_fill[qs->idx] && (txq->in_use < (txq->size - (txq->size>>2))))
+ sc->tunq_fill[qs->idx] = 0;
+ else if (!sc->tunq_fill[qs->idx] && (txq->in_use > (txq->size - (txq->size>>2))))
+ sc->tunq_fill[qs->idx] = 1;
+}
+
+static __inline char
t3rev2char(struct adapter *adapter)
{
char rev = 'z';
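
check_pkt_coalesce(), added above, is a single watermark rather than true hysteresis: both comparisons use the same threshold, txq->size - (txq->size >> 2), i.e. three quarters of the ring. With the default TX_ETH_Q_SIZE of 1024 that is 1024 - 256 = 768 descriptors, so tunq_fill[] for the queue set flips to 1 once more than 768 descriptors are in use and back to 0 once fewer are; the transmit path can consult that bit to decide when batching packets into a single WR is worth doing.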
@@ -582,6 +593,7 @@ cxgb_controller_attach(device_t dev)
pi->tx_chan = i >= ai->nports0;
pi->txpkt_intf = pi->tx_chan ? 2 * (i - ai->nports0) + 1 : 2 * i;
sc->rxpkt_map[pi->txpkt_intf] = i;
+ sc->port[i].tx_chan = i >= ai->nports0;
sc->portdev[i] = child;
device_set_softc(child, pi);
}
@@ -611,7 +623,7 @@ cxgb_controller_attach(device_t dev)
G_FW_VERSION_MAJOR(vers), G_FW_VERSION_MINOR(vers),
G_FW_VERSION_MICRO(vers));
- t3_add_sysctls(sc);
+ t3_add_attach_sysctls(sc);
out:
if (error)
cxgb_free(sc);
@@ -636,10 +648,14 @@ cxgb_free(struct adapter *sc)
{
int i;
+
+#ifdef IFNET_MULTIQUEUE
+ cxgb_pcpu_shutdown_threads(sc);
+#endif
ADAPTER_LOCK(sc);
- /*
- * drops the lock
- */
+/*
+ * drops the lock
+ */
cxgb_down_locked(sc);
#ifdef MSI_SUPPORTED
@@ -664,7 +680,7 @@ cxgb_free(struct adapter *sc)
* Wait for last callout
*/
- tsleep(&sc, 0, "cxgb unload", 3*hz);
+ DELAY(hz*100);
for (i = 0; i < (sc)->params.nports; ++i) {
if (sc->portdev[i] != NULL)
@@ -674,15 +690,17 @@ cxgb_free(struct adapter *sc)
bus_generic_detach(sc->dev);
if (sc->tq != NULL)
taskqueue_free(sc->tq);
-#ifdef notyet
if (is_offload(sc)) {
cxgb_adapter_unofld(sc);
if (isset(&sc->open_device_map, OFFLOAD_DEVMAP_BIT))
offload_close(&sc->tdev);
- }
-#endif
-
+ else
+ printf("cxgb_free: DEVMAP_BIT not set\n");
+ } else
+ printf("not offloading set\n");
+#ifndef IFNET_MULTIQUEUE
t3_free_sge_resources(sc);
+#endif
free(sc->filters, M_DEVBUF);
t3_sge_free(sc);
@@ -696,8 +714,6 @@ cxgb_free(struct adapter *sc)
MTX_DESTROY(&sc->sge.reg_lock);
MTX_DESTROY(&sc->elmer_lock);
ADAPTER_LOCK_DEINIT(sc);
-
- return;
}
/**
@@ -803,7 +819,7 @@ cxgb_setup_msix(adapter_t *sc, int msix_count)
printf("setting up interrupt for port=%d\n",
qs->port->port_id);
if (bus_setup_intr(sc->dev, sc->msix_irq_res[k],
- INTR_MPSAFE|INTR_TYPE_NET,
+ INTR_MPSAFE|INTR_TYPE_NET,
#ifdef INTR_FILTERS
NULL,
#endif
@@ -812,10 +828,17 @@ cxgb_setup_msix(adapter_t *sc, int msix_count)
"interrupt for message %d\n", rid);
return (EINVAL);
}
+#ifdef IFNET_MULTIQUEUE
+ if (singleq == 0) {
+ int vector = rman_get_start(sc->msix_irq_res[k]);
+ if (bootverbose)
+ device_printf(sc->dev, "binding vector=%d to cpu=%d\n", vector, k % mp_ncpus);
+ intr_bind(vector, k % mp_ncpus);
+ }
+#endif
}
}
-
return (0);
}
@@ -892,6 +915,12 @@ cxgb_port_attach(device_t dev)
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_ioctl = cxgb_ioctl;
ifp->if_start = cxgb_start;
+
+#ifdef IFNET_MULTIQUEUE
+ ifp->if_flags |= IFF_MULTIQ;
+ ifp->if_mq_start = cxgb_pcpu_start;
+#endif
+
ifp->if_timer = 0; /* Disable ifnet watchdog */
ifp->if_watchdog = NULL;
@@ -965,7 +994,7 @@ cxgb_port_attach(device_t dev)
p->tq = taskqueue_create_fast(p->taskqbuf, M_NOWAIT,
taskqueue_thread_enqueue, &p->tq);
#endif
-
+#ifndef IFNET_MULTIQUEUE
if (p->tq == NULL) {
device_printf(dev, "failed to allocate port task queue\n");
return (ENOMEM);
@@ -974,7 +1003,7 @@ cxgb_port_attach(device_t dev)
device_get_nameunit(dev));
TASK_INIT(&p->start_task, 0, cxgb_start_proc, ifp);
-
+#endif
t3_sge_init_port(p);
return (0);
@@ -999,6 +1028,9 @@ cxgb_port_detach(device_t dev)
}
ether_ifdetach(p->ifp);
+ printf("waiting for callout to stop ...");
+ DELAY(1000000);
+ printf("done\n");
/*
* the lock may be acquired in ifdetach
*/
@@ -1247,9 +1279,7 @@ offload_tx(struct t3cdev *tdev, struct mbuf *m)
{
int ret;
- critical_enter();
ret = t3_offload_tx(tdev, m);
- critical_exit();
return (ret);
}
@@ -1264,6 +1294,8 @@ write_smt_entry(struct adapter *adapter, int idx)
return (ENOMEM);
req = mtod(m, struct cpl_smt_write_req *);
+ m->m_pkthdr.len = m->m_len = sizeof(struct cpl_smt_write_req);
+
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx));
req->mtu_idx = NMTUS - 1; /* should be 0 but there's a T3 bug */
@@ -1325,6 +1357,10 @@ bind_qsets(adapter_t *sc)
{
int i, j;
+#ifdef IFNET_MULTIQUEUE
+ cxgb_pcpu_startup_threads(sc);
+#endif
+
for (i = 0; i < (sc)->params.nports; ++i) {
const struct port_info *pi = adap2pinfo(sc, i);
@@ -1473,6 +1509,7 @@ cxgb_up(struct adapter *sc)
goto out;
setup_rss(sc);
+ t3_add_configured_sysctls(sc);
sc->flags |= FULL_INIT_DONE;
}
@@ -1545,6 +1582,8 @@ cxgb_down_locked(struct adapter *sc)
cxgb_teardown_msix(sc);
ADAPTER_UNLOCK(sc);
+ callout_stop(&sc->cxgb_tick_ch);
+ callout_stop(&sc->sge_timer_ch);
callout_drain(&sc->cxgb_tick_ch);
callout_drain(&sc->sge_timer_ch);
@@ -1553,26 +1592,28 @@ cxgb_down_locked(struct adapter *sc)
for (i = 0; i < sc->params.nports; i++)
taskqueue_drain(sc->tq, &sc->port[i].timer_reclaim_task);
}
-#ifdef notyet
-
- if (sc->port[i].tq != NULL)
-#endif
-
}
static int
offload_open(struct port_info *pi)
{
struct adapter *adapter = pi->adapter;
- struct t3cdev *tdev = TOEDEV(pi->ifp);
+ struct t3cdev *tdev = &adapter->tdev;
+#ifdef notyet
+ T3CDEV(pi->ifp);
+#endif
int adap_up = adapter->open_device_map & PORT_MASK;
int err = 0;
+ printf("device_map=0x%x\n", adapter->open_device_map);
if (atomic_cmpset_int(&adapter->open_device_map,
- (adapter->open_device_map & ~OFFLOAD_DEVMAP_BIT),
- (adapter->open_device_map | OFFLOAD_DEVMAP_BIT)) == 0)
+ (adapter->open_device_map & ~(1<<OFFLOAD_DEVMAP_BIT)),
+ (adapter->open_device_map | (1<<OFFLOAD_DEVMAP_BIT))) == 0)
return (0);
+
+ if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
+ printf("offload_open: DEVMAP_BIT did not get set 0x%x\n", adapter->open_device_map);
ADAPTER_LOCK(pi->adapter);
if (!adap_up)
err = cxgb_up(adapter);
@@ -1581,7 +1622,7 @@ offload_open(struct port_info *pi)
return (err);
t3_tp_set_offload_mode(adapter, 1);
- tdev->lldev = adapter->port[0].ifp;
+ tdev->lldev = pi->ifp;
err = cxgb_offload_activate(adapter);
if (err)
goto out;
@@ -1605,15 +1646,18 @@ out:
}
return (err);
}
-#ifdef notyet
+
static int
-offload_close(struct t3cev *tdev)
+offload_close(struct t3cdev *tdev)
{
struct adapter *adapter = tdev2adap(tdev);
- if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
+ if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)) {
+ printf("offload_close: DEVMAP_BIT not set\n");
+
return (0);
-
+ }
+
/* Call back all registered clients */
cxgb_remove_clients(tdev);
tdev->lldev = NULL;
@@ -1621,13 +1665,15 @@ offload_close(struct t3cev *tdev)
t3_tp_set_offload_mode(adapter, 0);
clrbit(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
+ ADAPTER_LOCK(adapter);
if (!adapter->open_device_map)
- cxgb_down(adapter);
-
+ cxgb_down_locked(adapter);
+ else
+ ADAPTER_UNLOCK(adapter);
cxgb_offload_deactivate(adapter);
return (0);
}
-#endif
+
static void
cxgb_init(void *arg)
@@ -1667,6 +1713,8 @@ cxgb_init_locked(struct port_info *p)
if (err)
log(LOG_WARNING,
"Could not initialize offload capabilities\n");
+ else
+ printf("offload opened\n");
}
cxgb_link_start(p);
t3_link_changed(sc, p->port_id);
@@ -1675,8 +1723,7 @@ cxgb_init_locked(struct port_info *p)
device_printf(sc->dev, "enabling interrupts on port=%d\n", p->port_id);
t3_port_intr_enable(sc, p->port_id);
- callout_reset(&sc->cxgb_tick_ch, sc->params.stats_update_period * hz,
- cxgb_tick, sc);
+ callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc);
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
@@ -1703,7 +1750,6 @@ cxgb_stop_locked(struct port_info *p)
ADAPTER_LOCK_ASSERT_NOTOWNED(p->adapter);
ifp = p->ifp;
-
t3_port_intr_disable(p->adapter, p->port_id);
ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
p->phy.ops->power_down(&p->phy, 1);
@@ -1712,7 +1758,6 @@ cxgb_stop_locked(struct port_info *p)
ADAPTER_LOCK(p->adapter);
clrbit(&p->adapter->open_device_map, p->port_id);
-
if (p->adapter->open_device_map == 0) {
cxgb_down_locked(p->adapter);
} else
@@ -1786,8 +1831,7 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data)
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
adapter_t *sc = p->adapter;
- callout_reset(&sc->cxgb_tick_ch,
- sc->params.stats_update_period * hz,
+ callout_reset(&sc->cxgb_tick_ch, hz,
cxgb_tick, sc);
}
PORT_UNLOCK(p);
@@ -1838,77 +1882,92 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data)
return (error);
}
-static int
-cxgb_start_tx(struct ifnet *ifp, uint32_t txmax)
+int
+cxgb_tx_common(struct ifnet *ifp, struct sge_qset *qs, uint32_t txmax)
{
- struct sge_qset *qs;
struct sge_txq *txq;
- struct port_info *p = ifp->if_softc;
- struct mbuf *m = NULL;
- int err, in_use_init, free;
-
- if (!p->link_config.link_ok)
- return (ENXIO);
-
- if (IFQ_DRV_IS_EMPTY(&ifp->if_snd))
- return (ENOBUFS);
+ int err, in_use_init, count;
+ struct mbuf **m_vec;
- qs = &p->adapter->sge.qs[p->first_qset];
txq = &qs->txq[TXQ_ETH];
- err = 0;
-
- if (txq->flags & TXQ_TRANSMITTING)
- return (EINPROGRESS);
-
- mtx_lock(&txq->lock);
- txq->flags |= TXQ_TRANSMITTING;
+ m_vec = txq->txq_m_vec;
in_use_init = txq->in_use;
+ err = 0;
while ((txq->in_use - in_use_init < txmax) &&
(txq->size > txq->in_use + TX_MAX_DESC)) {
- free = 0;
- IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
- if (m == NULL)
+ check_pkt_coalesce(qs);
+ count = cxgb_dequeue_packet(ifp, txq, m_vec);
+ if (count == 0)
break;
- /*
- * Convert chain to M_IOVEC
- */
- KASSERT((m->m_flags & M_IOVEC) == 0, ("IOVEC set too early"));
-#ifdef notyet
- m0 = m;
- if (collapse_mbufs && m->m_pkthdr.len > MCLBYTES &&
- m_collapse(m, TX_MAX_SEGS, &m0) == EFBIG) {
- if ((m0 = m_defrag(m, M_NOWAIT)) != NULL) {
- m = m0;
- m_collapse(m, TX_MAX_SEGS, &m0);
- } else
- break;
- }
- m = m0;
-#endif
- if ((err = t3_encap(p, &m, &free)) != 0)
+ ETHER_BPF_MTAP(ifp, m_vec[0]);
+
+ if ((err = t3_encap(qs, m_vec, count)) != 0)
break;
- BPF_MTAP(ifp, m);
- if (free)
- m_freem(m);
+ txq->txq_enqueued += count;
}
- txq->flags &= ~TXQ_TRANSMITTING;
- mtx_unlock(&txq->lock);
-
+#ifndef IFNET_MULTIQUEUE
if (__predict_false(err)) {
if (err == ENOMEM) {
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
IFQ_LOCK(&ifp->if_snd);
- IFQ_DRV_PREPEND(&ifp->if_snd, m);
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_vec[0]);
IFQ_UNLOCK(&ifp->if_snd);
}
}
- if (err == 0 && m == NULL)
+ if (err == 0 && m_vec[0] == NULL) {
err = ENOBUFS;
+ }
else if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC) &&
(ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
err = ENOSPC;
}
+#else
+ if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC)) {
+ err = ENOSPC;
+ setbit(&qs->txq_stopped, TXQ_ETH);
+ }
+ if (err == ENOMEM) {
+ int i;
+ /*
+ * Sub-optimal :-/
+ */
+ for (i = 0; i < count; i++)
+ m_freem(m_vec[i]);
+ }
+#endif
+ return (err);
+}
+
+#ifndef IFNET_MULTIQUEUE
+static int
+cxgb_start_tx(struct ifnet *ifp, uint32_t txmax)
+{
+ struct sge_qset *qs;
+ struct sge_txq *txq;
+ struct port_info *p = ifp->if_softc;
+ int err;
+
+ if (!p->link_config.link_ok)
+ return (ENXIO);
+
+ if (IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ return (ENOBUFS);
+ }
+
+ qs = &p->adapter->sge.qs[p->first_qset];
+ txq = &qs->txq[TXQ_ETH];
+ err = 0;
+
+ if (txq->flags & TXQ_TRANSMITTING)
+ return (EINPROGRESS);
+
+ mtx_lock(&txq->lock);
+ txq->flags |= TXQ_TRANSMITTING;
+ cxgb_tx_common(ifp, qs, txmax);
+ txq->flags &= ~TXQ_TRANSMITTING;
+ mtx_unlock(&txq->lock);
+
return (err);
}
@@ -1932,7 +1991,15 @@ cxgb_start_proc(void *arg, int ncount)
} while (error == 0);
}
-static void
+int
+cxgb_dequeue_packet(struct ifnet *ifp, struct sge_txq *unused, struct mbuf **m_vec)
+{
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m_vec[0]);
+ return (m_vec[0] ? 1 : 0);
+}
+
+void
cxgb_start(struct ifnet *ifp)
{
struct port_info *pi = ifp->if_softc;
@@ -1952,7 +2019,7 @@ cxgb_start(struct ifnet *ifp)
if (err == 0)
taskqueue_enqueue(pi->tq, &pi->start_task);
}
-
+#endif
static int
cxgb_media_change(struct ifnet *ifp)
@@ -2078,12 +2145,26 @@ static void
cxgb_tick(void *arg)
{
adapter_t *sc = (adapter_t *)arg;
+ int i, running = 0;
+
+ for_each_port(sc, i) {
+
+ struct port_info *p = &sc->port[i];
+ struct ifnet *ifp = p->ifp;
+ PORT_LOCK(p);
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING))
+ running = 1;
+ PORT_UNLOCK(p);
+ }
+
+ if (running == 0)
+ return;
+
taskqueue_enqueue(sc->tq, &sc->tick_task);
if (sc->open_device_map != 0)
- callout_reset(&sc->cxgb_tick_ch, sc->params.stats_update_period * hz,
- cxgb_tick, sc);
+ callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc);
}
static void
@@ -2478,7 +2559,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data,
* Read 256 bytes at a time as len can be large and we don't
* want to use huge intermediate buffers.
*/
- useraddr = (uint8_t *)(t + 1); /* advance to start of buffer */
+ useraddr = (uint8_t *)t->buf;
while (t->len) {
unsigned int chunk = min(t->len, sizeof(buf));
diff --git a/sys/dev/cxgb/cxgb_offload.c b/sys/dev/cxgb/cxgb_offload.c
index d0b9b32..3ce1a11 100644
--- a/sys/dev/cxgb/cxgb_offload.c
+++ b/sys/dev/cxgb/cxgb_offload.c
@@ -108,9 +108,12 @@ cxgb_register_client(struct cxgb_client *client)
printf("client->add set\n");
TAILQ_FOREACH(tdev, &ofld_dev_list, entry) {
- if (offload_activated(tdev))
+ if (offload_activated(tdev)) {
+ printf("calling add=%p on %p\n",
+ client->add, tdev);
+
client->add(tdev);
- else
+ } else
printf("%p not activated\n", tdev);
}
@@ -477,7 +480,8 @@ rx_offload_blackhole(struct t3cdev *dev, struct mbuf **m, int n)
}
static void
-dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
+dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr,
+ struct sockaddr *sa)
{
}
@@ -895,17 +899,32 @@ do_term(struct t3cdev *dev, struct mbuf *m)
}
static void
-cxgb_route_event(void *unused, int event, struct rtentry *rt0,
+cxgb_arp_update_event(void *unused, struct rtentry *rt0,
+ uint8_t *enaddr, struct sockaddr *sa)
+{
+
+ if (TOEDEV(rt0->rt_ifp) == NULL)
+ return;
+
+ RT_ADDREF(rt0);
+ RT_UNLOCK(rt0);
+ cxgb_neigh_update(rt0, enaddr, sa);
+ RT_LOCK(rt0);
+ RT_REMREF(rt0);
+}
+
+
+static void
+cxgb_redirect_event(void *unused, int event, struct rtentry *rt0,
struct rtentry *rt1, struct sockaddr *sa)
{
- struct toedev *tdev0, *tdev1 = NULL;
+ struct toedev *tdev0, *tdev1;
/*
* ignore events on non-offloaded interfaces
*/
tdev0 = TOEDEV(rt0->rt_ifp);
- if (rt1)
- tdev1 = TOEDEV(rt1->rt_ifp);
+ tdev1 = TOEDEV(rt1->rt_ifp);
if (tdev0 == NULL && tdev1 == NULL)
return;
/*
@@ -914,34 +933,16 @@ cxgb_route_event(void *unused, int event, struct rtentry *rt0,
*/
RT_ADDREF(rt0);
RT_UNLOCK(rt0);
- if (rt1) {
- RT_ADDREF(rt1);
- RT_UNLOCK(rt1);
- }
-
- switch (event) {
- case RTEVENT_ARP_UPDATE: {
- cxgb_neigh_update(rt0, sa);
- break;
- }
- case RTEVENT_REDIRECT_UPDATE: {
- cxgb_redirect(rt0, rt1, sa);
- cxgb_neigh_update(rt1, sa);
+ RT_ADDREF(rt1);
+ RT_UNLOCK(rt1);
- break;
- }
- case RTEVENT_PMTU_UPDATE:
- default:
- break;
- }
+ cxgb_redirect(rt0, rt1, sa);
+ cxgb_neigh_update(rt1, NULL, sa);
RT_LOCK(rt0);
RT_REMREF(rt0);
- if (rt1) {
- RT_LOCK(rt1);
- RT_REMREF(rt1);
- }
-
+ RT_LOCK(rt1);
+ RT_REMREF(rt1);
}
/*
@@ -1048,14 +1049,14 @@ cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n)
}
void
-cxgb_neigh_update(struct rtentry *rt, struct sockaddr *sa)
+cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa)
{
if (is_offloading(rt->rt_ifp)) {
struct t3cdev *tdev = T3CDEV(rt->rt_ifp);
PANIC_IF(!tdev);
- t3_l2t_update(tdev, rt, sa);
+ t3_l2t_update(tdev, rt, enaddr, sa);
}
}
@@ -1425,7 +1426,10 @@ cxgb_offload_init(void)
t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl);
t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl);
- EVENTHANDLER_REGISTER(route_event, cxgb_route_event, NULL, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event,
+ NULL, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event,
+ NULL, EVENTHANDLER_PRI_ANY);
#if 0
if (offload_proc_init())
diff --git a/sys/dev/cxgb/cxgb_offload.h b/sys/dev/cxgb/cxgb_offload.h
index 59afe6b..8c84d07 100644
--- a/sys/dev/cxgb/cxgb_offload.h
+++ b/sys/dev/cxgb/cxgb_offload.h
@@ -253,7 +253,7 @@ static inline struct toe_tid_entry *lookup_atid(const struct tid_info *t,
void *cxgb_alloc_mem(unsigned long size);
void cxgb_free_mem(void *addr);
-void cxgb_neigh_update(struct rtentry *rt, struct sockaddr *sa);
+void cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa);
void cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa);
int process_rx(struct t3cdev *dev, struct mbuf **m, int n);
int attach_t3cdev(struct t3cdev *dev);
diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h
index cf5a8b6..7f75779 100644
--- a/sys/dev/cxgb/cxgb_osdep.h
+++ b/sys/dev/cxgb/cxgb_osdep.h
@@ -36,6 +36,9 @@ $FreeBSD$
#include <sys/endian.h>
#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
#include <dev/mii/mii.h>
#ifdef CONFIG_DEFINED
@@ -52,18 +55,17 @@ $FreeBSD$
typedef struct adapter adapter_t;
struct sge_rspq;
+
struct t3_mbuf_hdr {
struct mbuf *mh_head;
struct mbuf *mh_tail;
};
-
#define PANIC_IF(exp) do { \
if (exp) \
panic("BUG: %s", #exp); \
} while (0)
-
#define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif)
#define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri))
#define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl))
@@ -113,6 +115,7 @@ struct t3_mbuf_hdr {
#define CXGB_TX_CLEANUP_THRESHOLD 32
+
#ifdef DEBUG_PRINT
#define DPRINTF printf
#else
@@ -121,19 +124,25 @@ struct t3_mbuf_hdr {
#define TX_MAX_SIZE (1 << 16) /* 64KB */
#define TX_MAX_SEGS 36 /* maximum supported by card */
+
#define TX_MAX_DESC 4 /* max descriptors per packet */
+
#define TX_START_MIN_DESC (TX_MAX_DESC << 2)
-#if 0
-#define TX_START_MAX_DESC (TX_ETH_Q_SIZE >> 2) /* maximum number of descriptors */
-#endif
+
#define TX_START_MAX_DESC (TX_MAX_DESC << 3) /* maximum number of descriptors
 * used per call to start */
#define TX_CLEAN_MAX_DESC (TX_MAX_DESC << 4) /* maximum tx descriptors
* to clean per iteration */
+#define TX_WR_SIZE_MAX 11*1024 /* the maximum total size of packets aggregated into a single
+ * TX WR
+ */
+#define TX_WR_COUNT_MAX 7 /* the maximum total number of packets that can be
+ * aggregated into a single TX WR
+ */
#if defined(__i386__) || defined(__amd64__)
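
These two constants bound the batching that cpl_tx_pkt_batch enables: at most 7 packets per WR, matching the 7-element pkt_entry[] array, and at most 11KB of aggregate payload. Each entry is 16 bytes (two 64-bit flits), so a full batch WR is 8 + 7 * 16 = 120 bytes, which fits in a single TX descriptor assuming the T3's 16-flit (128-byte) descriptor size.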
@@ -142,7 +151,7 @@ struct t3_mbuf_hdr {
#define wmb() __asm volatile("sfence" ::: "memory")
#define smp_mb() mb()
-#define L1_CACHE_BYTES 64
+#define L1_CACHE_BYTES 128
static __inline
void prefetch(void *x)
{
@@ -167,6 +176,107 @@ extern void kdb_backtrace(void);
#define prefetch(x)
#define L1_CACHE_BYTES 32
#endif
+
+struct buf_ring {
+ caddr_t *br_ring;
+ volatile uint32_t br_cons;
+ volatile uint32_t br_prod;
+ int br_size;
+ struct mtx br_lock;
+};
+
+struct buf_ring *buf_ring_alloc(int count, int flags);
+void buf_ring_free(struct buf_ring *);
+
+static __inline int
+buf_ring_count(struct buf_ring *mr)
+{
+ int size = mr->br_size;
+ int mask = size - 1;
+
+ return ((size + mr->br_prod - mr->br_cons) & mask);
+}
+
+static __inline int
+buf_ring_empty(struct buf_ring *mr)
+{
+ return (mr->br_cons == mr->br_prod);
+}
+
+/*
+ * The producer and consumer are independently locked
+ * this relies on the consumer providing his own serialization
+ *
+ */
+static __inline void *
+buf_ring_dequeue(struct buf_ring *mr)
+{
+ int prod, cons, mask;
+ caddr_t *ring, m;
+
+ ring = (caddr_t *)mr->br_ring;
+ mask = mr->br_size - 1;
+ cons = mr->br_cons;
+ prod = mr->br_prod;
+ m = NULL;
+ if (cons != prod) {
+ m = ring[cons];
+ mr->br_cons = (cons + 1) & mask;
+ mb();
+ }
+ return (m);
+}
+
+
+static __inline int
+__buf_ring_enqueue(struct buf_ring *mr, void *m)
+{
+
+ int prod, cons, mask, err;
+
+ cons = mr->br_cons;
+ prod = mr->br_prod;
+ mask = mr->br_size - 1;
+ if (((prod + 1) & mask) != cons) {
+ mr->br_ring[prod] = m;
+ mb();
+ mr->br_prod = (prod + 1) & mask;
+ err = 0;
+ } else
+ err = ENOBUFS;
+
+ return (err);
+}
+
+static __inline int
+buf_ring_enqueue(struct buf_ring *mr, void *m)
+{
+ int err;
+
+ mtx_lock(&mr->br_lock);
+ err = __buf_ring_enqueue(mr, m);
+ mtx_unlock(&mr->br_lock);
+
+ return (err);
+}
+
+static __inline void *
+buf_ring_peek(struct buf_ring *mr)
+{
+ int prod, cons, mask;
+ caddr_t *ring, m;
+
+ ring = (caddr_t *)mr->br_ring;
+ mask = mr->br_size - 1;
+ cons = mr->br_cons;
+ prod = mr->br_prod;
+ m = NULL;
+ if (cons != prod)
+ m = ring[cons];
+
+ return (m);
+}
+
#define DBG_RX (1 << 0)
static const int debug_flags = DBG_RX;
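
The buf_ring added above is a fixed-size ring whose index arithmetic masks with br_size - 1, so br_size must be a power of two and the ring holds at most br_size - 1 entries. Producers serialize on br_lock; buf_ring_dequeue() takes no lock and relies on a single consumer, the per-queue-set transmit thread in this design. A usage sketch under those assumptions (txq_mr and txq_drops are the sge_txq additions from cxgb_adapter.h):

    struct mbuf *m;

    /* producer side, callable from any context */
    if (buf_ring_enqueue(&txq->txq_mr, m) == ENOBUFS) {
        txq->txq_drops++;        /* ring full: count and drop */
        m_freem(m);
    }

    /* consumer side, single-threaded by convention (no lock taken) */
    while ((m = buf_ring_dequeue(&txq->txq_mr)) != NULL) {
        /* hand m to the encapsulation path, e.g. t3_encap() */
    }

buf_ring_peek() lets the consumer inspect the head without consuming it, e.g. to check that enough TX descriptors are free before committing to a dequeue.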
@@ -189,15 +299,12 @@ static const int debug_flags = DBG_RX;
#define t3_os_sleep(x) DELAY((x) * 1000)
-#define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | bit), ((*(p)) & ~bit))
-
+#define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | (1<<bit)), ((*(p)) & ~(1<<bit)))
#define max_t(type, a, b) (type)max((a), (b))
#define net_device ifnet
#define cpu_to_be32 htobe32
-
-
/* Standard PHY definitions */
#define BMCR_LOOPBACK BMCR_LOOP
#define BMCR_ISOLATE BMCR_ISO
@@ -247,13 +354,13 @@ static const int debug_flags = DBG_RX;
#define swab32(x) bswap32(x)
#define simple_strtoul strtoul
-/* More types and endian definitions */
+
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
-
-typedef uint8_t __u8;
+
+typedef uint8_t __u8;
typedef uint16_t __u16;
typedef uint32_t __u32;
typedef uint8_t __be8;
@@ -261,6 +368,7 @@ typedef uint16_t __be16;
typedef uint32_t __be32;
typedef uint64_t __be64;
+
#if BYTE_ORDER == BIG_ENDIAN
#define __BIG_ENDIAN_BITFIELD
#elif BYTE_ORDER == LITTLE_ENDIAN
diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c
index e41148a..a079679 100644
--- a/sys/dev/cxgb/cxgb_sge.c
+++ b/sys/dev/cxgb/cxgb_sge.c
@@ -42,13 +42,13 @@ __FBSDID("$FreeBSD$");
#include <sys/rman.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
-#include <sys/syslog.h>
#include <sys/taskqueue.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/syslog.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
@@ -59,8 +59,7 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
+#include <vm/pmap.h>
#ifdef CONFIG_DEFINED
#include <cxgb_include.h>
@@ -70,14 +69,15 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/sys/mvec.h>
#endif
-uint32_t collapse_free = 0;
-uint32_t mb_free_vec_free = 0;
int txq_fills = 0;
-int collapse_mbufs = 0;
static int bogus_imm = 0;
#ifndef DISABLE_MBUF_IOVEC
static int recycle_enable = 1;
#endif
+extern int cxgb_txq_buf_ring_size;
+int cxgb_cached_allocations;
+int cxgb_cached;
+int cxgb_ext_freed;
#define USE_GTS 0
@@ -134,15 +134,17 @@ struct rsp_desc { /* response queue descriptor */
#define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
struct tx_sw_desc { /* SW state per Tx descriptor */
- struct mbuf *m;
+ struct mbuf_iovec mi;
bus_dmamap_t map;
int flags;
};
struct rx_sw_desc { /* SW state per Rx descriptor */
- void *cl;
- bus_dmamap_t map;
- int flags;
+ caddr_t rxsd_cl;
+ uint32_t *rxsd_ref;
+ caddr_t data;
+ bus_dmamap_t map;
+ int flags;
};
struct txq_state {
@@ -186,11 +188,9 @@ static uint8_t flit_desc_map[] = {
static int lro_default = 0;
int cxgb_debug = 0;
-static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
static void sge_timer_cb(void *arg);
static void sge_timer_reclaim(void *arg, int ncount);
static void sge_txq_reclaim_handler(void *arg, int ncount);
-static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
/**
* reclaim_completed_tx - reclaims completed Tx descriptors
@@ -202,19 +202,17 @@ static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
* queue's lock held.
*/
static __inline int
-reclaim_completed_tx(struct sge_txq *q, int nbufs, struct mbuf **mvec)
+reclaim_completed_tx(struct sge_txq *q)
{
- int reclaimed, reclaim = desc_reclaimable(q);
- int n = 0;
+ int reclaim = desc_reclaimable(q);
mtx_assert(&q->lock, MA_OWNED);
if (reclaim > 0) {
- n = free_tx_desc(q, min(reclaim, nbufs), mvec);
- reclaimed = min(reclaim, nbufs);
- q->cleaned += reclaimed;
- q->in_use -= reclaimed;
+ t3_free_tx_desc(q, reclaim);
+ q->cleaned += reclaim;
+ q->in_use -= reclaim;
}
- return (n);
+ return (reclaim);
}
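
reclaim_completed_tx() now frees completed descriptors in place via t3_free_tx_desc() rather than copying up to nbufs mbuf pointers out for the caller to free. The reclaim count still comes from the existing macro, desc_reclaimable(q) = processed - cleaned - TX_MAX_DESC: completions reported by the hardware, minus descriptors already cleaned, minus a TX_MAX_DESC (4) safety margin. For example, with q->processed = 100 and q->cleaned = 60, the call reclaims 100 - 60 - 4 = 36 descriptors and advances q->cleaned accordingly.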
/**
@@ -298,38 +296,14 @@ sgl_len(unsigned int n)
static __inline int
get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
{
- struct mbuf *m;
- int len;
- uint32_t flags = ntohl(resp->flags);
- uint8_t sopeop = G_RSPD_SOP_EOP(flags);
-
- /*
- * would be a firmware bug
- */
- if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
return (0);
- m = m_gethdr(M_NOWAIT, MT_DATA);
- len = G_RSPD_LEN(ntohl(resp->len_cq));
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ len = IMMED_PKT_SIZE;
if (m) {
- MH_ALIGN(m, IMMED_PKT_SIZE);
memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
- m->m_len = len;
-
- switch (sopeop) {
- case RSPQ_SOP_EOP:
- mh->mh_head = mh->mh_tail = m;
- m->m_pkthdr.len = len;
- m->m_flags |= M_PKTHDR;
- break;
- case RSPQ_EOP:
- m->m_flags &= ~M_PKTHDR;
- mh->mh_head->m_pkthdr.len += len;
- mh->mh_tail->m_next = m;
- mh->mh_tail = m;
- break;
- }
+ m->m_pkthdr.len = m->m_len = len;
}
return (m != NULL);
}
@@ -338,35 +312,11 @@ get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *m
static int
get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
{
- int len, error;
- uint8_t sopeop = G_RSPD_SOP_EOP(flags);
-
- /*
- * would be a firmware bug
- */
- len = G_RSPD_LEN(ntohl(resp->len_cq));
- if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) {
- if (cxgb_debug)
- device_printf(sc->dev, "unexpected value sopeop=%d flags=0x%x len=%din get_imm_packet\n", sopeop, flags, len);
- bogus_imm++;
- return (EINVAL);
- }
- error = 0;
- switch (sopeop) {
- case RSPQ_SOP_EOP:
- m->m_len = m->m_pkthdr.len = len;
- memcpy(mtod(m, uint8_t *), resp->imm_data, len);
- break;
- case RSPQ_EOP:
- memcpy(cl, resp->imm_data, len);
- m_iovappend(m, cl, MSIZE, len, 0);
- break;
- default:
- bogus_imm++;
- error = EINVAL;
- }
- return (error);
+ m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
+ memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
+ return (0);
+
}
#endif
@@ -413,11 +363,15 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p)
q->polling = adap->params.rev > 0;
- if (adap->params.nports > 2)
+ if (adap->params.nports > 2) {
q->coalesce_nsecs = 50000;
- else
+ } else {
+#ifdef INVARIANTS
+ q->coalesce_nsecs = 20000;
+#else
q->coalesce_nsecs = 5000;
-
+#endif
+ }
q->rspq_size = RSPQ_Q_SIZE;
q->fl_size = FL_Q_SIZE;
q->jumbo_size = JUMBO_Q_SIZE;
@@ -509,6 +463,7 @@ t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
qs->rspq.polling = 0 /* p->polling */;
}
+#if !defined(__i386__) && !defined(__amd64__)
static void
refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
@@ -519,7 +474,7 @@ refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
cb_arg->nseg = nseg;
}
-
+#endif
/**
* refill_fl - refill an SGE free-buffer list
* @sc: the controller softc
@@ -535,7 +490,7 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
struct rx_sw_desc *sd = &q->sdesc[q->pidx];
struct rx_desc *d = &q->desc[q->pidx];
struct refill_fl_cb_arg cb_arg;
- void *cl;
+ caddr_t cl;
int err;
cb_arg.error = 0;
@@ -543,10 +498,11 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
/*
* We only allocate a cluster, mbuf allocation happens after rx
*/
- if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
+ if ((cl = cxgb_cache_get(q->zone)) == NULL) {
log(LOG_WARNING, "Failed to allocate cluster\n");
goto done;
}
+
if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
@@ -555,7 +511,9 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
}
sd->flags |= RX_SW_DESC_MAP_CREATED;
}
- err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
+#if !defined(__i386__) && !defined(__amd64__)
+ err = bus_dmamap_load(q->entry_tag, sd->map,
+ cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t), q->buf_size,
refill_fl_cb, &cb_arg, 0);
if (err != 0 || cb_arg.error) {
@@ -565,9 +523,14 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
*/
return;
}
-
+#else
+ cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + sizeof(struct m_hdr) +
+ sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t)));
+#endif
sd->flags |= RX_SW_DESC_INUSE;
- sd->cl = cl;
+ sd->rxsd_cl = cl;
+ sd->rxsd_ref = (uint32_t *)(cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_));
+ sd->data = cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
d->len_gen = htobe32(V_FLD_GEN1(q->gen));
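
The hard-coded offsets above imply that a cluster returned by cxgb_cache_get() carries its own mbuf header storage and reference count ahead of the payload, so receive can later construct an mbuf in place without a separate allocation. The assumed layout, sketched:

    /*
     *  cl --> +-----------------+
     *         | struct m_hdr    | \
     *         | struct pkthdr   |  } header storage for an in-place mbuf
     *         | struct m_ext_   | /
     *         +-----------------+
     *         | uint32_t refcnt |  <- sd->rxsd_ref
     *         +-----------------+
     *         | payload ...     |  <- sd->data, the address handed to DMA
     *         +-----------------+
     */

On i386/amd64 the payload's bus address is taken directly with pmap_kextract(); other platforms go through bus_dmamap_load() with refill_fl_cb() to fill in the segment.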
@@ -609,9 +572,9 @@ free_rx_bufs(adapter_t *sc, struct sge_fl *q)
if (d->flags & RX_SW_DESC_INUSE) {
bus_dmamap_unload(q->entry_tag, d->map);
bus_dmamap_destroy(q->entry_tag, d->map);
- uma_zfree(q->zone, d->cl);
+ uma_zfree(q->zone, d->rxsd_cl);
}
- d->cl = NULL;
+ d->rxsd_cl = NULL;
if (++cidx == q->size)
cidx = 0;
}
@@ -623,6 +586,19 @@ __refill_fl(adapter_t *adap, struct sge_fl *fl)
refill_fl(adap, fl, min(16U, fl->size - fl->credits));
}
+static __inline void
+__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
+{
+ if ((fl->size - fl->credits) < max)
+ refill_fl(adap, fl, min(max, fl->size - fl->credits));
+}
+
+void
+refill_fl_service(adapter_t *adap, struct sge_fl *fl)
+{
+ __refill_fl_lt(adap, fl, 512);
+}
+
#ifndef DISABLE_MBUF_IOVEC
/**
* recycle_rx_buf - recycle a receive buffer
@@ -753,12 +729,13 @@ static void
sge_timer_cb(void *arg)
{
adapter_t *sc = arg;
- struct port_info *p;
+#ifndef IFNET_MULTIQUEUE
+ struct port_info *pi;
struct sge_qset *qs;
struct sge_txq *txq;
int i, j;
int reclaim_eth, reclaim_ofl, refill_rx;
-
+
for (i = 0; i < sc->params.nports; i++)
for (j = 0; j < sc->port[i].nqsets; j++) {
qs = &sc->sge.qs[i + j];
@@ -768,11 +745,12 @@ sge_timer_cb(void *arg)
refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
(qs->fl[1].credits < qs->fl[1].size));
if (reclaim_eth || reclaim_ofl || refill_rx) {
- p = &sc->port[i];
- taskqueue_enqueue(p->tq, &p->timer_reclaim_task);
+ pi = &sc->port[i];
+ taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task);
break;
}
}
+#endif
if (sc->params.nports > 2) {
int i;
@@ -799,13 +777,15 @@ t3_sge_init_adapter(adapter_t *sc)
callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
+ mi_init();
+ cxgb_cache_init();
return (0);
}
int
-t3_sge_init_port(struct port_info *p)
+t3_sge_init_port(struct port_info *pi)
{
- TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
+ TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
return (0);
}
@@ -820,6 +800,8 @@ t3_sge_deinit_sw(adapter_t *sc)
for (i = 0; i < sc->params.nports; i++)
if (sc->port[i].tq != NULL)
taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
+
+ mi_deinit();
}
/**
@@ -843,29 +825,22 @@ refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
static __inline void
sge_txq_reclaim_(struct sge_txq *txq)
{
- int reclaimable, i, n;
- struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
- struct port_info *p;
+ int reclaimable, n;
+ struct port_info *pi;
- p = txq->port;
+ pi = txq->port;
reclaim_more:
n = 0;
reclaimable = desc_reclaimable(txq);
if (reclaimable > 0 && mtx_trylock(&txq->lock)) {
- n = reclaim_completed_tx(txq, TX_CLEAN_MAX_DESC, m_vec);
+ n = reclaim_completed_tx(txq);
mtx_unlock(&txq->lock);
}
- if (n == 0)
- return;
-
- for (i = 0; i < n; i++) {
- m_freem(m_vec[i]);
- }
- if (p && p->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
+ if (pi && pi->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
txq->size - txq->in_use >= TX_START_MAX_DESC) {
txq_fills++;
- p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- taskqueue_enqueue(p->tq, &p->start_task);
+ pi->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ taskqueue_enqueue(pi->tq, &pi->start_task);
}
if (n)
@@ -883,13 +858,16 @@ sge_txq_reclaim_handler(void *arg, int ncount)
static void
sge_timer_reclaim(void *arg, int ncount)
{
- struct port_info *p = arg;
- int i, nqsets = p->nqsets;
- adapter_t *sc = p->adapter;
+ struct port_info *pi = arg;
+ int i, nqsets = pi->nqsets;
+ adapter_t *sc = pi->adapter;
struct sge_qset *qs;
struct sge_txq *txq;
struct mtx *lock;
+#ifdef IFNET_MULTIQUEUE
+ panic("%s should not be called with multiqueue support\n", __FUNCTION__);
+#endif
for (i = 0; i < nqsets; i++) {
qs = &sc->sge.qs[i];
txq = &qs->txq[TXQ_ETH];
@@ -942,6 +920,10 @@ init_qset_cntxt(struct sge_qset *qs, u_int id)
qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
+
+ mbufq_init(&qs->txq[TXQ_ETH].sendq);
+ mbufq_init(&qs->txq[TXQ_OFLD].sendq);
+ mbufq_init(&qs->txq[TXQ_CTRL].sendq);
}
@@ -985,7 +967,7 @@ calc_tx_descs(const struct mbuf *m, int nsegs)
flits = sgl_len(nsegs) + 2;
#ifdef TSO_SUPPORTED
- if (m->m_pkthdr.csum_flags & (CSUM_TSO))
+ if (m->m_pkthdr.csum_flags & CSUM_TSO)
flits++;
#endif
return flits_to_desc(flits);
@@ -993,28 +975,27 @@ calc_tx_descs(const struct mbuf *m, int nsegs)
static unsigned int
busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
- struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
+ struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
{
struct mbuf *m0;
- int err, pktlen;
+ int err, pktlen, pass = 0;
+retry:
+ err = 0;
m0 = *m;
pktlen = m0->m_pkthdr.len;
+#if defined(__i386__) || defined(__amd64__)
+ if (busdma_map_sg_collapse(m, segs, nsegs) == 0) {
+ goto done;
+ } else
+#endif
+ err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0);
- err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
-#ifdef DEBUG
- if (err) {
- int n = 0;
- struct mbuf *mtmp = m0;
- while(mtmp) {
- n++;
- mtmp = mtmp->m_next;
- }
- printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
- err, m0->m_pkthdr.len, n);
+ if (err == 0) {
+ goto done;
}
-#endif
- if (err == EFBIG) {
+ if (err == EFBIG && pass == 0) {
+ pass = 1;
/* Too many segments, try to defrag */
m0 = m_defrag(m0, M_DONTWAIT);
if (m0 == NULL) {
@@ -1023,23 +1004,21 @@ busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
return (ENOBUFS);
}
*m = m0;
- err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
- }
-
- if (err == ENOMEM) {
+ goto retry;
+ } else if (err == ENOMEM) {
return (err);
- }
-
- if (err) {
+ } if (err) {
if (cxgb_debug)
printf("map failure err=%d pktlen=%d\n", err, pktlen);
m_freem(m0);
*m = NULL;
return (err);
}
-
- bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
- stx->flags |= TX_SW_DESC_MAPPED;
+done:
+#if !defined(__i386__) && !defined(__amd64__)
+ bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE);
+#endif
+ txsd->flags |= TX_SW_DESC_MAPPED;
return (0);
}
@@ -1059,12 +1038,18 @@ make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
{
int i, idx;
- for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
+ for (idx = 0, i = 0; i < nsegs; i++) {
+ /*
+ * firmware doesn't like empty segments
+ */
+ if (segs[i].ds_len == 0)
+ continue;
if (i && idx == 0)
++sgp;
-
+
sgp->len[idx] = htobe32(segs[i].ds_len);
sgp->addr[idx] = htobe64(segs[i].ds_addr);
+ idx ^= 1;
}
if (idx)
@@ -1112,6 +1097,20 @@ wr_gen2(struct tx_desc *d, unsigned int gen)
#endif
}
+#if 0
+static int print_wr = 0;
+static __inline void
+do_print_wr(struct tx_desc *d, int flits)
+{
+ int i = 0;
+
+ if (print_wr)
+ while (flits--) {
+ printf("flit[%d]: 0x%016lx\n", i, d->flit[i]);
+ i++;
+ }
+}
+#endif
/**
@@ -1131,7 +1130,6 @@ wr_gen2(struct tx_desc *d, unsigned int gen)
* and we just need to write the WR header. Otherwise we distribute the
* SGL across the number of descriptors it spans.
*/
-
static void
write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
@@ -1149,6 +1147,7 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs
V_WR_GEN(txqs->gen)) | wr_lo;
/* XXX gen? */
wr_gen2(txd, txqs->gen);
+
} else {
unsigned int ogen = txqs->gen;
const uint64_t *fp = (const uint64_t *)sgl;
@@ -1183,7 +1182,7 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs
* is freed all clusters will be freed
* with it
*/
- txsd->m = NULL;
+ txsd->mi.mi_base = NULL;
wrp = (struct work_request_hdr *)txd;
wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
V_WR_SGLSFLT(1)) | wr_hi;
@@ -1200,80 +1199,151 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs
}
}
-
/* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
#define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
+#ifdef VLAN_SUPPORTED
+#define GET_VTAG(cntrl, m) \
+do { \
+ if ((m)->m_flags & M_VLANTAG) \
+ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
+} while (0)
+
+#define GET_VTAG_MI(cntrl, mi) \
+do { \
+ if ((mi)->mi_flags & M_VLANTAG) \
+ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \
+} while (0)
+#else
+#define GET_VTAG(cntrl, m)
+#define GET_VTAG_MI(cntrl, m)
+#endif
+
int
-t3_encap(struct port_info *p, struct mbuf **m, int *free)
+t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
{
adapter_t *sc;
struct mbuf *m0;
- struct sge_qset *qs;
struct sge_txq *txq;
- struct tx_sw_desc *stx;
struct txq_state txqs;
+ struct port_info *pi;
unsigned int ndesc, flits, cntrl, mlen;
int err, nsegs, tso_info = 0;
struct work_request_hdr *wrp;
struct tx_sw_desc *txsd;
- struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
- bus_dma_segment_t segs[TX_MAX_SEGS];
+ struct sg_ent *sgp, *sgl;
+ bus_dma_segment_t *segs;
uint32_t wr_hi, wr_lo, sgl_flits;
struct tx_desc *txd;
- struct cpl_tx_pkt *cpl;
-
- m0 = *m;
- sc = p->adapter;
-
- DPRINTF("t3_encap port_id=%d qsidx=%d ", p->port_id, p->first_qset);
-
- /* port_id=1 qsid=1 txpkt_intf=2 tx_chan=0 */
-
- qs = &sc->sge.qs[p->first_qset];
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *mi;
+
+ DPRINTF("t3_encap cpu=%d ", curcpu);
+ pi = qs->port;
+ sc = pi->adapter;
txq = &qs->txq[TXQ_ETH];
- stx = &txq->sdesc[txq->pidx];
+ txsd = &txq->sdesc[txq->pidx];
txd = &txq->desc[txq->pidx];
- cpl = (struct cpl_tx_pkt *)txd;
- mlen = m0->m_pkthdr.len;
- cpl->len = htonl(mlen | 0x80000000);
-
- DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", mlen, p->txpkt_intf, p->tx_chan);
- /*
- * XXX handle checksum, TSO, and VLAN here
- *
- */
- cntrl = V_TXPKT_INTF(p->txpkt_intf);
+ sgl = txq->txq_sgl;
+ segs = txq->txq_segs;
+ m0 = *m;
+ DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
+ DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
- /*
- * XXX need to add VLAN support for 6.x
- */
+ cntrl = V_TXPKT_INTF(pi->txpkt_intf);
+/*
+ * XXX need to add VLAN support for 6.x
+ */
#ifdef VLAN_SUPPORTED
- if (m0->m_flags & M_VLANTAG)
- cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
if (m0->m_pkthdr.csum_flags & (CSUM_TSO))
tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
-#endif
- if (tso_info) {
- int eth_type;
- struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
+#endif
+
+ if (count > 1) {
+ if ((err = busdma_map_sg_vec(m, &m0, segs, count)))
+ return (err);
+ nsegs = count;
+ } else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) {
+ if (cxgb_debug)
+ printf("failed ... err=%d\n", err);
+ return (err);
+ }
+ KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
+
+ if (m0->m_type == MT_DATA)
+ DPRINTF("mbuf type=%d tags:%d head=%p", m0->m_type, !SLIST_EMPTY(&m0->m_pkthdr.tags),
+ SLIST_FIRST(&m0->m_pkthdr.tags));
+
+ mi_collapse_mbuf(&txsd->mi, m0);
+ mi = &txsd->mi;
+
+ if (count > 1) {
+ struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
+ int i, fidx;
+ struct mbuf_iovec *batchmi;
+
+ mv = mtomv(m0);
+ batchmi = mv->mv_vec;
+
+ wrp = (struct work_request_hdr *)txd;
+
+ flits = count*2 + 1;
+ txq_prod(txq, 1, &txqs);
+
+ for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) {
+ struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i];
+
+ cntrl = V_TXPKT_INTF(pi->txpkt_intf);
+ GET_VTAG_MI(cntrl, batchmi);
+ cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
+ cbe->cntrl = htonl(cntrl);
+ cbe->len = htonl(batchmi->mi_len | 0x80000000);
+ cbe->addr = htobe64(segs[i].ds_addr);
+ txd->flit[fidx] |= htobe64(1 << 24);
+ }
+
+ wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
+ V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
+ wmb();
+ wrp->wr_lo = htonl(V_WR_LEN(flits) |
+ V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
+ /* XXX gen? */
+ wr_gen2(txd, txqs.gen);
+ check_ring_tx_db(sc, txq);
+
+ return (0);
+ } else if (tso_info) {
+ int undersized, eth_type;
+ struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
struct ip *ip;
struct tcphdr *tcp;
- char *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
+ char *pkthdr, tmp[TCPPKTHDRSIZE];
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *tmpmi;
+
+ mv = mtomv(m0);
+ tmpmi = mv->mv_vec;
txd->flit[2] = 0;
+ GET_VTAG_MI(cntrl, mi);
cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
hdr->cntrl = htonl(cntrl);
-
- if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
- pkthdr = &tmp[0];
- m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
- } else {
- pkthdr = mtod(m0, char *);
- }
+ mlen = m0->m_pkthdr.len;
+ hdr->len = htonl(mlen | 0x80000000);
+
+ DPRINTF("tso buf len=%d\n", mlen);
+ undersized = (((tmpmi->mi_len < TCPPKTHDRSIZE) &&
+ (m0->m_flags & M_VLANTAG)) ||
+ (tmpmi->mi_len < TCPPKTHDRSIZE - ETHER_VLAN_ENCAP_LEN));
+ if (__predict_false(undersized)) {
+ pkthdr = tmp;
+ dump_mi(mi);
+ panic("discontig packet - fixxorz");
+ } else
+ pkthdr = m0->m_data;
if (__predict_false(m0->m_flags & M_VLANTAG)) {
eth_type = CPL_ETH_II_VLAN;
@@ -1292,19 +1362,33 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free)
hdr->lso_info = htonl(tso_info);
flits = 3;
} else {
+ struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
+
+ GET_VTAG(cntrl, m0);
cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
cpl->cntrl = htonl(cntrl);
-
+ mlen = m0->m_pkthdr.len;
+ cpl->len = htonl(mlen | 0x80000000);
+
if (mlen <= WR_LEN - sizeof(*cpl)) {
txq_prod(txq, 1, &txqs);
- txq->sdesc[txqs.pidx].m = NULL;
- if (m0->m_len == m0->m_pkthdr.len)
- memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen);
- else
+			DPRINTF("mlen==%d max=%zu\n", mlen, (WR_LEN - sizeof(*cpl)));
+ if (mi->mi_type != MT_IOVEC &&
+ mi->mi_type != MT_CLIOVEC)
+ memcpy(&txd->flit[2], mi->mi_data, mlen);
+ else {
+ /*
+ * XXX mbuf_iovec
+ */
+#if 0
m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
+#endif
+ printf("bailing on m_copydata\n");
+ }
+ m_freem_iovec(&txsd->mi);
+ txsd->mi.mi_base = NULL;
- *free = 1;
flits = (mlen + 7) / 8 + 2;
cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
@@ -1315,17 +1399,23 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free)
wr_gen2(txd, txqs.gen);
check_ring_tx_db(sc, txq);
+ DPRINTF("pio buf\n");
return (0);
}
+ DPRINTF("regular buf\n");
flits = 2;
}
-
wrp = (struct work_request_hdr *)txd;
-
- if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
+
+#ifdef nomore
+ /*
+ * XXX need to move into one of the helper routines above
+ *
+ */
+ if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0)
return (err);
- }
m0 = *m;
+#endif
ndesc = calc_tx_descs(m0, nsegs);
sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
@@ -1335,15 +1425,16 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free)
DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
txq_prod(txq, ndesc, &txqs);
- txsd = &txq->sdesc[txqs.pidx];
wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
wr_lo = htonl(V_WR_TID(txq->token));
- txsd->m = m0;
- m_set_priority(m0, txqs.pidx);
-
write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
- check_ring_tx_db(p->adapter, txq);
+ check_ring_tx_db(pi->adapter, txq);
+ if ((m0->m_type == MT_DATA) && ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT)) {
+		m0->m_flags &= ~M_EXT;
+ m_free(m0);
+ }
+
return (0);
}
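/*
 * Annotation on the count > 1 path above: the geometry follows directly
 * from struct cpl_tx_pkt_batch (WR_HDR plus pkt_entry[7], each entry two
 * 64-bit flits), which is where "flits = count*2 + 1" comes from:
 *
 *	flit 0:        work request header (wr_hi | wr_lo)
 *	flit 2*i + 1:  entry i: cntrl (32 bits) | len (32 bits)
 *	flit 2*i + 2:  entry i: 64-bit bus address from segs[i]
 *
 * A full batch of seven packets therefore occupies 15 flits in one work
 * request, and each packet is mapped to exactly one DMA segment
 * (busdma_map_sg_vec() above yields nsegs == count).
 */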
@@ -1367,6 +1458,11 @@ write_imm(struct tx_desc *d, struct mbuf *m,
struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
struct work_request_hdr *to = (struct work_request_hdr *)d;
+ if (len > WR_LEN)
+		panic("len too big %d", len);
+ if (len < sizeof(*from))
+ panic("len too small %d", len);
+
memcpy(&to[1], &from[1], len - sizeof(*from));
to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
V_WR_BCNTLFLT(len & 7));
@@ -1374,7 +1470,14 @@ write_imm(struct tx_desc *d, struct mbuf *m,
to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
V_WR_LEN((len + 7) / 8));
wr_gen2(d, gen);
- m_freem(m);
+
+ /*
+	 * This check is a hack; we should really fix the logic so
+	 * that this can't happen.
+ */
+ if (m->m_type != MT_DONTFREE)
+ m_freem(m);
+
}
/**
@@ -1413,6 +1516,8 @@ addq_exit: mbufq_tail(&q->sendq, m);
struct sge_qset *qs = txq_to_qset(q, qid);
+ printf("stopping q\n");
+
setbit(&qs->txq_stopped, qid);
smp_mb();
@@ -1472,7 +1577,7 @@ ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
m_freem(m);
return 0;
}
-
+
wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
wrp->wr_lo = htonl(V_WR_TID(q->token));
@@ -1483,13 +1588,14 @@ again: reclaim_completed_tx_imm(q);
if (__predict_false(ret)) {
if (ret == 1) {
mtx_unlock(&q->lock);
- return (-1);
+ log(LOG_ERR, "no desc available\n");
+
+ return (ENOSPC);
}
goto again;
}
-
write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
-
+
q->in_use++;
if (++q->pidx >= q->size) {
q->pidx = 0;
@@ -1517,6 +1623,8 @@ restart_ctrlq(void *data, int npending)
struct sge_txq *q = &qs->txq[TXQ_CTRL];
adapter_t *adap = qs->port->adapter;
+ log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
+
mtx_lock(&q->lock);
again: reclaim_completed_tx_imm(q);
@@ -1555,6 +1663,7 @@ t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
}
+
/**
* free_qset - free the resources of an SGE queue set
* @sc: the controller owning the queue set
@@ -1564,11 +1673,18 @@ t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
* as HW contexts, packet buffers, and descriptor rings. Traffic to the
* queue set must be quiesced prior to calling this.
*/
-static void
+void
t3_free_qset(adapter_t *sc, struct sge_qset *q)
{
int i;
-
+
+ t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
+
+ for (i = 0; i < SGE_TXQ_PER_SET; i++)
+ if (q->txq[i].txq_mr.br_ring != NULL) {
+ free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
+ mtx_destroy(&q->txq[i].txq_mr.br_lock);
+ }
for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
if (q->fl[i].desc) {
mtx_lock(&sc->sge.reg_lock);
@@ -1629,10 +1745,13 @@ void
t3_free_sge_resources(adapter_t *sc)
{
int i, nqsets;
-
+
+#ifdef IFNET_MULTIQUEUE
+ panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__);
+#endif
for (nqsets = i = 0; i < (sc)->params.nports; i++)
nqsets += sc->port[i].nqsets;
-
+
for (i = 0; i < nqsets; ++i)
t3_free_qset(sc, &sc->sge.qs[i]);
}
@@ -1686,52 +1805,76 @@ t3_sge_stop(adapter_t *sc)
/**
- * free_tx_desc - reclaims Tx descriptors and their buffers
+ * t3_free_tx_desc - reclaims Tx descriptors and their buffers
* @adapter: the adapter
* @q: the Tx queue to reclaim descriptors from
- * @n: the number of descriptors to reclaim
+ * @reclaimable: the number of descriptors to reclaim
*
* Reclaims Tx descriptors from an SGE Tx queue and frees the associated
* Tx buffers. Called with the Tx queue lock held.
*/
-int
-free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec)
+void
+t3_free_tx_desc(struct sge_txq *q, int reclaimable)
{
- struct tx_sw_desc *d;
- unsigned int cidx = q->cidx;
- int nbufs = 0;
+ struct tx_sw_desc *txsd;
+ unsigned int cidx;
#ifdef T3_TRACE
T3_TRACE2(sc->tb[q->cntxt_id & 7],
- "reclaiming %u Tx descriptors at cidx %u", n, cidx);
+ "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
#endif
- d = &q->sdesc[cidx];
-
- while (n-- > 0) {
- DPRINTF("cidx=%d d=%p\n", cidx, d);
- if (d->m) {
- if (d->flags & TX_SW_DESC_MAPPED) {
- bus_dmamap_unload(q->entry_tag, d->map);
- bus_dmamap_destroy(q->entry_tag, d->map);
- d->flags &= ~TX_SW_DESC_MAPPED;
- }
- if (m_get_priority(d->m) == cidx) {
- m_vec[nbufs] = d->m;
- d->m = NULL;
- nbufs++;
- } else {
- printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx);
+ cidx = q->cidx;
+ txsd = &q->sdesc[cidx];
+ DPRINTF("reclaiming %d WR\n", reclaimable);
+ while (reclaimable--) {
+ DPRINTF("cidx=%d d=%p\n", cidx, txsd);
+ if (txsd->mi.mi_base != NULL) {
+ if (txsd->flags & TX_SW_DESC_MAPPED) {
+ bus_dmamap_unload(q->entry_tag, txsd->map);
+ txsd->flags &= ~TX_SW_DESC_MAPPED;
}
- }
- ++d;
+ m_freem_iovec(&txsd->mi);
+ txsd->mi.mi_base = NULL;
+
+#if defined(DIAGNOSTIC) && 0
+ if (m_get_priority(txsd->m[0]) != cidx)
+ printf("pri=%d cidx=%d\n", (int)m_get_priority(txsd->m[0]), cidx);
+#endif
+
+ } else
+ q->txq_skipped++;
+
+ ++txsd;
if (++cidx == q->size) {
cidx = 0;
- d = q->sdesc;
+ txsd = q->sdesc;
}
}
q->cidx = cidx;
- return (nbufs);
+}
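/*
 * Illustrative sketch, not from the driver source: callers in this patch
 * now invoke reclaim_completed_tx(q) with no descriptor count, so that
 * helper presumably derives the count itself and feeds it to
 * t3_free_tx_desc() above, along these lines (desc_reclaimable() is the
 * processed-minus-cleaned accessor referenced elsewhere in this file):
 */
static __inline int
reclaim_completed_tx_sketch(struct sge_txq *q)
{
	int reclaim = desc_reclaimable(q);

	if (reclaim > 0) {
		/* free the mbuf iovecs and unmap DMA for reclaimed WRs */
		t3_free_tx_desc(q, reclaim);
		q->cleaned += reclaim;
		q->in_use -= reclaim;
	}
	return (reclaim);
}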
+
+void
+t3_free_tx_desc_all(struct sge_txq *q)
+{
+ int i;
+ struct tx_sw_desc *txsd;
+
+ for (i = 0; i < q->size; i++) {
+ txsd = &q->sdesc[i];
+ if (txsd->mi.mi_base != NULL) {
+ if (txsd->flags & TX_SW_DESC_MAPPED) {
+ bus_dmamap_unload(q->entry_tag, txsd->map);
+ txsd->flags &= ~TX_SW_DESC_MAPPED;
+ }
+ m_freem_iovec(&txsd->mi);
+ bzero(&txsd->mi, sizeof(txsd->mi));
+ }
+ }
}
/**
@@ -1782,31 +1925,31 @@ write_ofld_wr(adapter_t *adap, struct mbuf *m,
struct tx_desc *d = &q->desc[pidx];
struct txq_state txqs;
- if (immediate(m)) {
- q->sdesc[pidx].m = NULL;
+ if (immediate(m) && segs == NULL) {
write_imm(d, m, m->m_len, gen);
return;
}
/* Only TX_DATA builds SGLs */
-
from = mtod(m, struct work_request_hdr *);
- memcpy(&d->flit[1], &from[1],
- (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from));
+ memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
- flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
+ flits = m->m_len / 8;
sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
make_sgl(sgp, segs, nsegs);
sgl_flits = sgl_len(nsegs);
- txqs.gen = q->gen;
- txqs.pidx = q->pidx;
- txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3);
+ txqs.gen = gen;
+ txqs.pidx = pidx;
+ txqs.compl = 0;
+
write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
from->wr_hi, from->wr_lo);
}
+
+
/**
* calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
* @m: the packet
@@ -1845,25 +1988,27 @@ ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
int ret, nsegs;
unsigned int ndesc;
unsigned int pidx, gen;
- struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
- bus_dma_segment_t segs[TX_MAX_SEGS];
- int i, cleaned;
- struct tx_sw_desc *stx = &q->sdesc[q->pidx];
+ bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
+ struct tx_sw_desc *stx;
- mtx_lock(&q->lock);
- if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) {
- mtx_unlock(&q->lock);
- return (ret);
- }
+ nsegs = m_get_sgllen(m);
+ vsegs = m_get_sgl(m);
ndesc = calc_tx_descs_ofld(m, nsegs);
-again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
+ busdma_map_sgl(vsegs, segs, nsegs);
+ stx = &q->sdesc[q->pidx];
+ KASSERT(stx->mi.mi_base == NULL, ("mi_base set"));
+
+ mtx_lock(&q->lock);
+again: reclaim_completed_tx(q);
ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
if (__predict_false(ret)) {
if (ret == 1) {
+ printf("no ofld desc avail\n");
+
m_set_priority(m, ndesc); /* save for restart */
mtx_unlock(&q->lock);
- return EINTR;
+ return (EINTR);
}
goto again;
}
@@ -1886,10 +2031,7 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
check_ring_tx_db(adap, q);
-
- for (i = 0; i < cleaned; i++) {
- m_freem(m_vec[i]);
- }
+
return (0);
}
@@ -1902,18 +2044,16 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
static void
restart_offloadq(void *data, int npending)
{
-
struct mbuf *m;
struct sge_qset *qs = data;
struct sge_txq *q = &qs->txq[TXQ_OFLD];
adapter_t *adap = qs->port->adapter;
- struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
bus_dma_segment_t segs[TX_MAX_SEGS];
- int nsegs, i, cleaned;
struct tx_sw_desc *stx = &q->sdesc[q->pidx];
+ int nsegs, cleaned;
mtx_lock(&q->lock);
-again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
+again: cleaned = reclaim_completed_tx(q);
while ((m = mbufq_peek(&q->sendq)) != NULL) {
unsigned int gen, pidx;
@@ -1953,10 +2093,12 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
#endif
t3_write_reg(adap, A_SG_KDOORBELL,
F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+#if 0
for (i = 0; i < cleaned; i++) {
- m_freem(m_vec[i]);
+ m_freem_vec(m_vec[i]);
}
+#endif
}
/**
@@ -2000,7 +2142,7 @@ t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
adapter_t *adap = tdev2adap(tdev);
struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
- if (__predict_false(is_ctrl_pkt(m)))
+ if (__predict_false(is_ctrl_pkt(m)))
return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
@@ -2031,9 +2173,9 @@ rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
struct mbuf *m, struct mbuf *rx_gather[],
unsigned int gather_idx)
{
+
rq->offload_pkts++;
m->m_pkthdr.header = mtod(m, void *);
-
rx_gather[gather_idx++] = m;
if (gather_idx == RX_BUNDLE_SIZE) {
cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
@@ -2048,16 +2190,24 @@ restart_tx(struct sge_qset *qs)
{
struct adapter *sc = qs->port->adapter;
+
if (isset(&qs->txq_stopped, TXQ_OFLD) &&
should_restart_tx(&qs->txq[TXQ_OFLD]) &&
test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
qs->txq[TXQ_OFLD].restarts++;
+ printf("restarting TXQ_OFLD\n");
taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
}
+ printf("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
+ qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
+ qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
+ qs->txq[TXQ_CTRL].in_use);
+
if (isset(&qs->txq_stopped, TXQ_CTRL) &&
should_restart_tx(&qs->txq[TXQ_CTRL]) &&
test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
qs->txq[TXQ_CTRL].restarts++;
+ printf("restarting TXQ_CTRL\n");
taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
}
}
@@ -2084,6 +2234,17 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
struct sge_qset *q = &sc->sge.qs[id];
int i, ret = 0;
+ for (i = 0; i < SGE_TXQ_PER_SET; i++) {
+ if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
+ M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
+ device_printf(sc->dev, "failed to allocate mbuf ring\n");
+ goto err;
+ }
+ q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
+ q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
+ mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
+ }
+
init_qset_cntxt(q, id);
if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
@@ -2155,13 +2316,18 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
q->txq[TXQ_ETH].stop_thres = nports *
flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
- q->fl[0].buf_size = MCLBYTES;
+	q->fl[0].buf_size = (MCLBYTES - sizeof(uint32_t) -
+	    sizeof(struct m_hdr) - sizeof(struct pkthdr) -
+	    sizeof(struct m_ext_));
q->fl[0].zone = zone_clust;
q->fl[0].type = EXT_CLUSTER;
- q->fl[1].buf_size = MJUMPAGESIZE;
- q->fl[1].zone = zone_jumbop;
- q->fl[1].type = EXT_JUMBOP;
-
+#if __FreeBSD_version > 800000
+	q->fl[1].buf_size = MJUM16BYTES - sizeof(uint32_t) -
+	    sizeof(struct m_hdr) - sizeof(struct pkthdr) -
+	    sizeof(struct m_ext_);
+ q->fl[1].zone = zone_jumbo16;
+ q->fl[1].type = EXT_JUMBO16;
+#else
+	q->fl[1].buf_size = MJUMPAGESIZE - sizeof(uint32_t) -
+	    sizeof(struct m_hdr) - sizeof(struct pkthdr) -
+	    sizeof(struct m_ext_);
+ q->fl[1].zone = zone_jumbop;
+ q->fl[1].type = EXT_JUMBOP;
+#endif
q->lro.enabled = lro_default;
mtx_lock(&sc->sge.reg_lock);
@@ -2269,11 +2435,15 @@ t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
+#ifndef DISABLE_MBUF_IOVEC
m_explode(m);
+#endif
/*
* adjust after conversion to mbuf chain
*/
- m_adj(m, sizeof(*cpl) + ethpad);
+ m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
+ m->m_len -= (sizeof(*cpl) + ethpad);
+ m->m_data += (sizeof(*cpl) + ethpad);
(*ifp->if_input)(ifp, m);
}
@@ -2307,17 +2477,24 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
uint32_t len = G_RSPD_LEN(len_cq);
uint32_t flags = ntohl(r->flags);
uint8_t sopeop = G_RSPD_SOP_EOP(flags);
+ uint32_t *ref;
int ret = 0;
- prefetch(sd->cl);
+ prefetch(sd->rxsd_cl);
fl->credits--;
bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(fl->entry_tag, sd->map);
- m_cljset(m, sd->cl, fl->type);
+ ref = sd->rxsd_ref;
+ m_cljset(m, sd->rxsd_cl, fl->type, sd->rxsd_ref);
+ *ref = 1;
m->m_len = len;
-
+ /*
+ * bump past the refcnt address
+ */
+ m->m_data = sd->data;
+
switch(sopeop) {
case RSPQ_SOP_EOP:
DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
@@ -2363,9 +2540,48 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
}
#else
+static void
+ext_free_handler(void *cl, void * arg)
+{
+ uintptr_t type = (uintptr_t)arg;
+ uma_zone_t zone;
+ struct mbuf *m;
+
+ m = cl;
+ zone = m_getzonefromtype(type);
+ m->m_ext.ext_type = (int)type;
+ cxgb_ext_freed++;
+ cxgb_cache_put(zone, cl);
+}
+
+static void
+init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone)
+{
+ struct mbuf *m;
+ int header_size;
+
+	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) +
+	    sizeof(struct m_ext_) + sizeof(uint32_t);
+
+ bzero(cl, header_size);
+ m = (struct mbuf *)cl;
+
+ SLIST_INIT(&m->m_pkthdr.tags);
+ m->m_type = MT_DATA;
+ m->m_flags = flags | M_NOFREE | M_EXT;
+ m->m_data = cl + header_size;
+ m->m_ext.ext_buf = cl;
+ m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t));
+ m->m_ext.ext_size = m_getsizefromtype(type);
+ m->m_ext.ext_free = ext_free_handler;
+ m->m_ext.ext_args = (void *)(uintptr_t)type;
+ m->m_ext.ext_type = EXT_EXTREF;
+ *(m->m_ext.ref_cnt) = 1;
+ DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt);
+}
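/*
 * Annotation: layout of a receive cluster as initialized above.  The
 * mbuf header, packet header, external-storage header, and the 32-bit
 * refcount are carved out of the front of the cluster itself, which is
 * why the fl[].buf_size settings in t3_sge_alloc_qset() subtract the
 * same four sizes:
 *
 *	cl -> +---------------------+
 *	      | struct m_hdr        |
 *	      | struct pkthdr       |  embedded mbuf (M_NOFREE | M_EXT)
 *	      | struct m_ext_       |
 *	      +---------------------+
 *	      | uint32_t refcount   |  m_ext.ref_cnt points here
 *	      +---------------------+
 *	      | packet data ...     |  m_data points here
 *	      +---------------------+
 */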
+
static int
get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
- struct mbuf *m, struct rsp_desc *r)
+ struct mbuf **m, struct rsp_desc *r)
{
unsigned int len_cq = ntohl(r->len_cq);
@@ -2376,45 +2592,61 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
uint8_t sopeop = G_RSPD_SOP_EOP(flags);
void *cl;
int ret = 0;
-
- prefetch(sd->cl);
-
+ struct mbuf *m0;
+#if 0
+ if ((sd + 1 )->rxsd_cl)
+ prefetch((sd + 1)->rxsd_cl);
+ if ((sd + 2)->rxsd_cl)
+ prefetch((sd + 2)->rxsd_cl);
+#endif
+ DPRINTF("rx cpu=%d\n", curcpu);
fl->credits--;
bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
- cl = mtod(m, void *);
- memcpy(cl, sd->cl, len);
+ if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
+ goto skip_recycle;
+ cl = mtod(m0, void *);
+ memcpy(cl, sd->data, len);
recycle_rx_buf(adap, fl, fl->cidx);
+ *m = m0;
} else {
- cl = sd->cl;
+ skip_recycle:
bus_dmamap_unload(fl->entry_tag, sd->map);
+ cl = sd->rxsd_cl;
+ *m = m0 = (struct mbuf *)cl;
}
+
switch(sopeop) {
case RSPQ_SOP_EOP:
DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
- if (cl == sd->cl)
- m_cljset(m, cl, fl->type);
- m->m_len = m->m_pkthdr.len = len;
+ if (cl == sd->rxsd_cl)
+ init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone);
+ m0->m_len = m0->m_pkthdr.len = len;
ret = 1;
goto done;
break;
case RSPQ_NSOP_NEOP:
DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
+ panic("chaining unsupported");
ret = 0;
break;
case RSPQ_SOP:
DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
- m_iovinit(m);
+ panic("chaining unsupported");
+ m_iovinit(m0);
ret = 0;
break;
case RSPQ_EOP:
DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
+ panic("chaining unsupported");
ret = 1;
break;
}
- m_iovappend(m, cl, fl->buf_size, len, 0);
-
+ panic("append not supported");
+#if 0
+ m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref);
+#endif
done:
if (++fl->cidx == fl->size)
fl->cidx = 0;
@@ -2443,9 +2675,11 @@ handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
credits = G_RSPD_TXQ0_CR(flags);
if (credits) {
qs->txq[TXQ_ETH].processed += credits;
+#ifndef IFNET_MULTIQUEUE
if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
taskqueue_enqueue(qs->port->adapter->tq,
&qs->port->timer_reclaim_task);
+#endif
}
credits = G_RSPD_TXQ2_CR(flags);
@@ -2459,6 +2693,7 @@ handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
credits = G_RSPD_TXQ1_CR(flags);
if (credits)
qs->txq[TXQ_OFLD].processed += credits;
+
}
static void
@@ -2483,7 +2718,7 @@ check_ring_db(adapter_t *adap, struct sge_qset *qs,
* on this queue. If the system is under memory shortage use a fairly
* long delay to help recovery.
*/
-static int
+int
process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
{
struct sge_rspq *rspq = &qs->rspq;
@@ -2506,7 +2741,7 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
int eth, eop = 0, ethpad = 0;
uint32_t flags = ntohl(r->flags);
uint32_t rss_csum = *(const uint32_t *)r;
- uint32_t rss_hash = r->rss_hdr.rss_hash_val;
+ uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
eth = (r->rss_hdr.opcode == CPL_RX_PKT);
@@ -2517,8 +2752,7 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
} else if (flags & F_RSPD_IMM_DATA_VALID) {
#ifdef DISABLE_MBUF_IOVEC
- if (cxgb_debug)
- printf("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
+ DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
if(get_imm_packet(adap, r, &rspq->rspq_mh) == 0) {
rspq->next_holdoff = NOMEM_INTR_DELAY;
@@ -2529,10 +2763,11 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
}
#else
struct mbuf *m = NULL;
-
+
+ DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
if (rspq->rspq_mbuf == NULL)
rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
- else
+ else
m = m_gethdr(M_DONTWAIT, MT_DATA);
/*
@@ -2543,82 +2778,79 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
budget_left--;
break;
}
- if (get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags))
- goto skip;
+ get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags);
+
eop = 1;
-#endif
rspq->imm_data++;
+#endif
} else if (r->len_cq) {
int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
#ifdef DISABLE_MBUF_IOVEC
struct mbuf *m;
- m = m_gethdr(M_NOWAIT, MT_DATA);
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL) {
log(LOG_WARNING, "failed to get mbuf for packet\n");
break;
+ } else {
+ m->m_next = m->m_nextpkt = NULL;
}
-
+
eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r, m);
#else
- if (rspq->rspq_mbuf == NULL)
- rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
- if (rspq->rspq_mbuf == NULL) {
- log(LOG_WARNING, "failed to get mbuf for packet\n");
- break;
- }
- eop = get_packet(adap, drop_thresh, qs, rspq->rspq_mbuf, r);
+ eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r);
+#ifdef IFNET_MULTIQUEUE
+ rspq->rspq_mbuf->m_pkthdr.rss_hash = rss_hash;
+#endif
#endif
ethpad = 2;
} else {
DPRINTF("pure response\n");
rspq->pure_rsps++;
}
-
if (flags & RSPD_CTRL_MASK) {
sleeping |= flags & RSPD_GTS_MASK;
handle_rsp_cntrl_info(qs, flags);
}
-#ifndef DISABLE_MBUF_IOVEC
- skip:
-#endif
+
r++;
if (__predict_false(++rspq->cidx == rspq->size)) {
rspq->cidx = 0;
rspq->gen ^= 1;
r = rspq->desc;
}
-
prefetch(r);
if (++rspq->credits >= (rspq->size / 4)) {
refill_rspq(adap, rspq, rspq->credits);
rspq->credits = 0;
}
-
- if (eop) {
- prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
- prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
+ DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags);
- if (eth) {
- t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
- rss_hash, rss_csum, lro);
+ if (!eth && eop) {
+ rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
+ /*
+ * XXX size mismatch
+ */
+ m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
+
+ ngathered = rx_offload(&adap->tdev, rspq,
+ rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
+ rspq->rspq_mh.mh_head = NULL;
+ DPRINTF("received offload packet\n");
+
+ } else if (eth && eop) {
+ prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
+ prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
+ t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
+ rss_hash, rss_csum, lro);
+ DPRINTF("received tunnel packet\n");
rspq->rspq_mh.mh_head = NULL;
- } else {
- rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
- /*
- * XXX size mismatch
- */
- m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
-
- ngathered = rx_offload(&adap->tdev, rspq,
- rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
- }
- __refill_fl(adap, &qs->fl[0]);
- __refill_fl(adap, &qs->fl[1]);
}
+ __refill_fl_lt(adap, &qs->fl[0], 32);
+ __refill_fl_lt(adap, &qs->fl[1], 32);
--budget_left;
}
@@ -2629,9 +2861,14 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
check_ring_db(adap, qs, sleeping);
smp_mb(); /* commit Tx queue processed updates */
- if (__predict_false(qs->txq_stopped != 0))
+ if (__predict_false(qs->txq_stopped != 0)) {
+ printf("restarting tx on %p\n", qs);
+
restart_tx(qs);
-
+ }
+
+ __refill_fl_lt(adap, &qs->fl[0], 512);
+ __refill_fl_lt(adap, &qs->fl[1], 512);
budget -= budget_left;
return (budget);
}
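/*
 * Annotation: the return value is the number of responses actually
 * consumed (the requested budget minus whatever was left); the
 * process_responses_gts() wrapper presumably uses that count to report
 * whether the interrupt did any work -- see the unhandled_irqs
 * accounting in t3_intr_msix() below.
 */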
@@ -2718,10 +2955,11 @@ t3_intr_msix(void *data)
adapter_t *adap = qs->port->adapter;
struct sge_rspq *rspq = &qs->rspq;
- mtx_lock(&rspq->lock);
- if (process_responses_gts(adap, rspq) == 0)
- rspq->unhandled_irqs++;
- mtx_unlock(&rspq->lock);
+ if (mtx_trylock(&rspq->lock)) {
+ if (process_responses_gts(adap, rspq) == 0)
+ rspq->unhandled_irqs++;
+ mtx_unlock(&rspq->lock);
+ }
}
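/*
 * Annotation on the mtx_trylock() change above: if another context holds
 * the response-queue lock it is still draining the ring, so the MSI-X
 * handler can return immediately rather than block in interrupt context;
 * the lock holder will pick up any newly arrived responses.  The cost is
 * that such races are no longer counted in unhandled_irqs.
 */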
/*
@@ -2765,7 +3003,10 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
struct sge_qset *qs;
int i, j, err, nqsets = 0;
struct mtx *lock;
-
+
+ if ((sc->flags & FULL_INIT_DONE) == 0)
+ return (ENXIO);
+
coalesce_nsecs = qsp->coalesce_nsecs;
err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
@@ -2801,11 +3042,11 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
void
-t3_add_sysctls(adapter_t *sc)
+t3_add_attach_sysctls(adapter_t *sc)
{
struct sysctl_ctx_list *ctx;
struct sysctl_oid_list *children;
-
+
ctx = device_get_sysctl_ctx(sc->dev);
children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
@@ -2821,28 +3062,13 @@ t3_add_sysctls(adapter_t *sc)
0, t3_lro_enable,
"I", "enable large receive offload");
- SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
- "intr_coal",
- CTLTYPE_INT|CTLFLAG_RW, sc,
- 0, t3_set_coalesce_nsecs,
- "I", "interrupt coalescing timer (ns)");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"enable_debug",
CTLFLAG_RW, &cxgb_debug,
0, "enable verbose debugging output");
-
- SYSCTL_ADD_INT(ctx, children, OID_AUTO,
- "collapse_free",
- CTLFLAG_RD, &collapse_free,
- 0, "frees during collapse");
- SYSCTL_ADD_INT(ctx, children, OID_AUTO,
- "mb_free_vec_free",
- CTLFLAG_RD, &mb_free_vec_free,
- 0, "frees during mb_free_vec");
- SYSCTL_ADD_INT(ctx, children, OID_AUTO,
- "collapse_mbufs",
- CTLFLAG_RW, &collapse_mbufs,
- 0, "collapse mbuf chains into iovecs");
+ SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce",
+ CTLFLAG_RD, &sc->tunq_coalesce,
+ "#tunneled packets freed");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"txq_overrun",
CTLFLAG_RD, &txq_fills,
@@ -2851,8 +3077,103 @@ t3_add_sysctls(adapter_t *sc)
"bogus_imm",
CTLFLAG_RD, &bogus_imm,
0, "#times a bogus immediate response was seen");
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "cache_alloc",
+ CTLFLAG_RD, &cxgb_cached_allocations,
+ 0, "#times a cluster was allocated from cache");
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "cached",
+ CTLFLAG_RD, &cxgb_cached,
+ 0, "#times a cluster was cached");
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "ext_freed",
+ CTLFLAG_RD, &cxgb_ext_freed,
+ 0, "#times a cluster was freed through ext_free");
+
}
+void
+t3_add_configured_sysctls(adapter_t *sc)
+{
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid_list *children;
+ int i, j;
+
+ ctx = device_get_sysctl_ctx(sc->dev);
+ children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
+
+ SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
+ "intr_coal",
+ CTLTYPE_INT|CTLFLAG_RW, sc,
+ 0, t3_set_coalesce_nsecs,
+ "I", "interrupt coalescing timer (ns)");
+
+ for (i = 0; i < sc->params.nports; i++) {
+ struct port_info *pi = &sc->port[i];
+ struct sysctl_oid *poid;
+ struct sysctl_oid_list *poidlist;
+
+ snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
+ poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
+ pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
+ poidlist = SYSCTL_CHILDREN(poid);
+ SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
+ "nqsets", CTLFLAG_RD, &pi->nqsets,
+ 0, "#queue sets");
+
+ for (j = 0; j < pi->nqsets; j++) {
+ struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
+ struct sysctl_oid *qspoid;
+ struct sysctl_oid_list *qspoidlist;
+ struct sge_txq *txq = &qs->txq[TXQ_ETH];
+
+ snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
+
+ qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
+ qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
+ qspoidlist = SYSCTL_CHILDREN(qspoid);
+
+ SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "dropped",
+ CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
+ 0, "#tunneled packets dropped");
+ SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "sendqlen",
+ CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
+ 0, "#tunneled packets waiting to be sent");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_pidx",
+ CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
+ 0, "#tunneled packets queue producer index");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_cidx",
+ CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
+ 0, "#tunneled packets queue consumer index");
+ SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "processed",
+ CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
+ 0, "#tunneled packets processed by the card");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "cleaned",
+ CTLFLAG_RD, &txq->cleaned,
+ 0, "#tunneled packets cleaned");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "in_use",
+ CTLFLAG_RD, &txq->in_use,
+ 0, "#tunneled packet slots in use");
+ SYSCTL_ADD_ULONG(ctx, qspoidlist, OID_AUTO, "frees",
+ CTLFLAG_RD, &txq->txq_frees,
+ "#tunneled packets freed");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "skipped",
+ CTLFLAG_RD, &txq->txq_skipped,
+ 0, "#tunneled packet descriptors skipped");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "coalesced",
+ CTLFLAG_RD, &txq->txq_coalesced,
+ 0, "#tunneled packets coalesced");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "enqueued",
+ CTLFLAG_RD, &txq->txq_enqueued,
+ 0, "#tunneled packets enqueued to hardware");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "stopped_flags",
+ CTLFLAG_RD, &qs->txq_stopped,
+ 0, "tx queues stopped");
+
+ }
+ }
+}
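/*
 * Annotation: the tree built above hangs the per-port and per-queue-set
 * counters off the adapter device's sysctl node.  Assuming the adapter
 * attaches as unit 0 under the name "cxgb" (the controller may appear as
 * "cxgbc" on some versions), the OIDs would look like:
 *
 *	dev.cxgb.0.intr_coal
 *	dev.cxgb.0.port0.nqsets
 *	dev.cxgb.0.port0.qs0.dropped
 *	dev.cxgb.0.port0.qs0.queue_pidx
 *	dev.cxgb.0.port0.qs0.in_use
 */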
+
/**
* t3_get_desc - dump an SGE descriptor for debugging purposes
* @qs: the queue set
diff --git a/sys/dev/cxgb/sys/cxgb_support.c b/sys/dev/cxgb/sys/cxgb_support.c
index 7a28556..176206c 100644
--- a/sys/dev/cxgb/sys/cxgb_support.c
+++ b/sys/dev/cxgb/sys/cxgb_support.c
@@ -126,11 +126,11 @@ cxgb_cache_pcpu_init(struct cxgb_cache_pcpu *ccp)
if ((err = buf_stack_init(&ccp->ccp_cluster_free, (FL_Q_SIZE >> 1))))
return (err);
- if (jumbo_phys_contig)
+#if __FreeBSD_version > 800000
ccp->ccp_jumbo_zone = zone_jumbo16;
- else
+#else
ccp->ccp_jumbo_zone = zone_jumbop;
-
+#endif
return (0);
}
diff --git a/sys/dev/cxgb/sys/mvec.h b/sys/dev/cxgb/sys/mvec.h
index 2ef7ecd..04b6449 100644
--- a/sys/dev/cxgb/sys/mvec.h
+++ b/sys/dev/cxgb/sys/mvec.h
@@ -63,6 +63,9 @@ struct m_ext_ {
int ext_type; /* type of external storage */
};
+#define MT_IOVEC 9
+#define MT_CLIOVEC 10
+
#define EXT_IOVEC 8
#define EXT_CLIOVEC 9
#define EXT_JMPIOVEC 10
diff --git a/sys/dev/cxgb/t3cdev.h b/sys/dev/cxgb/t3cdev.h
index 8223f98..67db552 100644
--- a/sys/dev/cxgb/t3cdev.h
+++ b/sys/dev/cxgb/t3cdev.h
@@ -50,7 +50,7 @@ struct t3cdev {
int (*send)(struct t3cdev *dev, struct mbuf *m);
int (*recv)(struct t3cdev *dev, struct mbuf **m, int n);
int (*ctl)(struct t3cdev *dev, unsigned int req, void *data);
- void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa);
+ void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, struct sockaddr *sa);
void *priv; /* driver private data */
void *l2opt; /* optional layer 2 data */
void *l3opt; /* optional layer 3 data */
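/*
 * Annotation: the new enaddr argument lets arp_update callers hand the
 * resolved Ethernet address to the t3cdev directly, instead of leaving
 * the backend to dig it out of the rtentry; presumably it is consumed by
 * the L2 table update path that services this hook.
 */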
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
index 0c796b5..4b17f8e 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <net/route.h>
@@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
@@ -559,7 +560,7 @@ cxgb_toe_disconnect(struct tcpcb *tp)
}
static int
-cxgb_toe_abort(struct tcpcb *tp)
+cxgb_toe_reset(struct tcpcb *tp)
{
struct toepcb *toep = tp->t_toe;
@@ -620,7 +621,7 @@ cxgb_toe_detach(struct tcpcb *tp)
static struct toe_usrreqs cxgb_toe_usrreqs = {
.tu_disconnect = cxgb_toe_disconnect,
- .tu_abort = cxgb_toe_abort,
+ .tu_reset = cxgb_toe_reset,
.tu_send = cxgb_toe_send,
.tu_rcvd = cxgb_toe_rcvd,
.tu_detach = cxgb_toe_detach,
@@ -1145,7 +1146,7 @@ fail_act_open(struct toepcb *toep, int errno)
t3_release_offload_resources(toep);
if (tp) {
INP_LOCK_ASSERT(tp->t_inpcb);
- tcp_drop(tp, errno);
+ cxgb_tcp_drop(tp, errno);
}
#ifdef notyet
@@ -1957,7 +1958,7 @@ process_close_con_rpl(struct socket *so, struct mbuf *m)
wakeup(&so->so_timeo);
} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
(toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
- tp = tcp_drop(tp, 0);
+ tp = cxgb_tcp_drop(tp, 0);
}
break;
@@ -2483,7 +2484,7 @@ handle_syncache_event(int event, void *arg)
struct toepcb *toep = arg;
switch (event) {
- case SC_ENTRY_PRESENT:
+ case TOE_SC_ENTRY_PRESENT:
/*
* entry already exists - free toepcb
* and l2t
@@ -2491,7 +2492,7 @@ handle_syncache_event(int event, void *arg)
printf("syncache entry present\n");
toepcb_release(toep);
break;
- case SC_DROP:
+ case TOE_SC_DROP:
/*
* The syncache has given up on this entry
* either it timed out, or it was evicted
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
index 8cb42e1..e411ab4 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -62,7 +62,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <net/route.h>
#include <dev/cxgb/t3cdev.h>
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -99,9 +100,6 @@ static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
int *count, int flags);
#endif
static void vm_fault_unhold_pages(vm_page_t *m, int count);
-
-
-
#define TMP_IOV_MAX 16
void
@@ -112,6 +110,15 @@ t3_init_socket_ops(void)
prp = pffindtype(AF_INET, SOCK_STREAM);
pru_sosend = prp->pr_usrreqs->pru_sosend;
pru_soreceive = prp->pr_usrreqs->pru_soreceive;
+ tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect;
+ tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort;
+ tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen;
+ tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send;
+ tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect;
+ tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close;
+ tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown;
+ tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd;
}
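/*
 * Annotation: this splices the cxgb TOE's pr_usrreqs handlers over the
 * stock TCP entry points at module load, so every TCP socket on the
 * system enters through the cxgb_tcp_usrreqs functions, which are
 * presumably copies of the native implementations extended to divert
 * offloadable connections to the T3.
 */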
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
index e785790..a88b26e 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -57,7 +57,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <net/route.h>
#include <dev/cxgb/t3cdev.h>
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h
new file mode 100644
index 0000000..feb2916
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h
@@ -0,0 +1,44 @@
+
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef CXGB_TCP_H_
+#define CXGB_TCP_H_
+
+struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno);
+void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip);
+struct tcpcb *cxgb_tcp_close(struct tcpcb *tp);
+
+extern struct pr_usrreqs cxgb_tcp_usrreqs;
+#ifdef INET6
+extern struct pr_usrreqs cxgb_tcp6_usrreqs;
+#endif
+
+#include <sys/sysctl.h>
+SYSCTL_DECL(_net_inet_tcp_cxgb);
+#endif /* CXGB_TCP_H_ */
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
new file mode 100644
index 0000000..2eca099
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_mac.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/random.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/if.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/nd6.h>
+#endif
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_offload.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+#include <netinet6/ip6protosw.h>
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/xform.h>
+#ifdef INET6
+#include <netipsec/ipsec6.h>
+#endif
+#include <netipsec/key.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+#include <sys/md5.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+
+
+SYSCTL_NODE(_net_inet_tcp, 0, cxgb, CTLFLAG_RW, 0, "Chelsio TOE");
+
+static int tcp_log_debug = 0;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, log_debug, CTLFLAG_RW,
+ &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
+
+static int tcp_tcbhashsize = 0;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
+ &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
+
+static int do_tcpdrain = 1;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, do_tcpdrain, CTLFLAG_RW,
+ &do_tcpdrain, 0,
+ "Enable tcp_drain routine for extra help when low on mbufs");
+
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, pcbcount, CTLFLAG_RD,
+ &tcbinfo.ipi_count, 0, "Number of active PCBs");
+
+static int icmp_may_rst = 1;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
+ &icmp_may_rst, 0,
+ "Certain ICMP unreachable messages may abort connections in SYN_SENT");
+
+static int tcp_isn_reseed_interval = 0;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+ &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+
+/*
+ * TCP bandwidth limiting sysctls. Note that the default lower bound of
+ * 1024 exists only for debugging. A good production default would be
+ * something like 6100.
+ */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
+ "TCP inflight data limiting");
+
+static int tcp_inflight_enable = 1;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
+ &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int tcp_inflight_debug = 0;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
+ &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int tcp_inflight_rttthresh;
+SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
+ "RTT threshold below which inflight will deactivate itself");
+
+static int tcp_inflight_min = 6144;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
+ &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
+ &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
+static int tcp_inflight_stab = 20;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
+ &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
+
+uma_zone_t sack_hole_zone;
+
+static struct inpcb *tcp_notify(struct inpcb *, int);
+static struct inpcb *cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno);
+
+/*
+ * Target size of TCP PCB hash tables. Must be a power of two.
+ *
+ * Note that this can be overridden by the kernel environment
+ * variable net.inet.tcp.tcbhashsize
+ */
+#ifndef TCBHASHSIZE
+#define TCBHASHSIZE 512
+#endif
+
+/*
+ * XXX
+ * Callouts should be moved into struct tcp directly.  They are currently
+ * separate because the tcpcb structure is exported to userland for sysctl
+ * parsing purposes, and userland knows nothing about callouts.
+ */
+struct tcpcb_mem {
+ struct tcpcb tcb;
+ struct tcp_timer tt;
+};
+
+MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+
+/*
+ * Drop a TCP connection, reporting the specified error.
+ * If the connection is synchronized, then send a RST to the peer.
+ */
+struct tcpcb *
+cxgb_tcp_drop(struct tcpcb *tp, int errno)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_state = TCPS_CLOSED;
+ (void) tcp_gen_reset(tp);
+ tcpstat.tcps_drops++;
+ } else
+ tcpstat.tcps_conndrops++;
+ if (errno == ETIMEDOUT && tp->t_softerror)
+ errno = tp->t_softerror;
+ so->so_error = errno;
+ return (cxgb_tcp_close(tp));
+}
+
+/*
+ * Attempt to close a TCP control block, marking it as dropped, and freeing
+ * the socket if we hold the only reference.
+ */
+struct tcpcb *
+cxgb_tcp_close(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (tp->t_state == TCPS_LISTEN)
+ tcp_gen_listen_close(tp);
+ in_pcbdrop(inp);
+ tcpstat.tcps_closed++;
+ KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
+ so = inp->inp_socket;
+ soisdisconnected(so);
+ if (inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_close: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
+ INP_UNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ return (NULL);
+ }
+ return (tp);
+}
+
+/*
+ * Notify a tcp user of an asynchronous error: store the error as a
+ * soft error.  Do not wake up the user, since there is currently no
+ * mechanism for reporting soft errors (yet - a kqueue filter may be
+ * added).
+ */
+static struct inpcb *
+tcp_notify(struct inpcb *inp, int error)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if ((inp->inp_vflag & INP_TIMEWAIT) ||
+ (inp->inp_vflag & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
+
+ /*
+ * Ignore some errors if we are hooked up.
+ * If connection hasn't completed, has retransmitted several times,
+ * and receives a second error, give up now. This is better
+ * than waiting a long time to establish a connection that
+ * can never complete.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (error == EHOSTUNREACH || error == ENETUNREACH ||
+ error == EHOSTDOWN)) {
+ return (inp);
+ } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
+ tp->t_softerror) {
+ tp = cxgb_tcp_drop(tp, error);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
+ } else {
+ tp->t_softerror = error;
+ return (inp);
+ }
+#if 0
+ wakeup( &so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+#endif
+}
+
+void
+cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct ip *ip = vip;
+ struct tcphdr *th;
+ struct in_addr faddr;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct icmp *icp;
+ struct in_conninfo inc;
+ tcp_seq icmp_tcp_seq;
+ int mtu;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
+ notify = cxgb_tcp_drop_syn_sent;
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+	 * Source quench is deprecated.
+ */
+ else if (cmd == PRC_QUENCH)
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
+ ip = NULL;
+ else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip != NULL) {
+ icp = (struct icmp *)((caddr_t)ip
+ - offsetof(struct icmp, icmp_ip));
+ th = (struct tcphdr *)((caddr_t)ip
+ + (ip->ip_hl << 2));
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
+ ip->ip_src, th->th_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if (!(inp->inp_vflag & INP_TIMEWAIT) &&
+ !(inp->inp_vflag & INP_DROPPED) &&
+ !(inp->inp_socket == NULL)) {
+ icmp_tcp_seq = htonl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+ if (cmd == PRC_MSGSIZE) {
+ /*
+ * MTU discovery:
+ * If we got a needfrag set the MTU
+ * in the route to the suggested new
+ * value (if given) and then notify.
+ */
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 0; /* IPv4 */
+ inc.inc_faddr = faddr;
+
+ mtu = ntohs(icp->icmp_nextmtu);
+ /*
+ * If no alternative MTU was
+ * proposed, try the next smaller
+ * one. ip->ip_len has already
+ * been swapped in icmp_input().
+ */
+ if (!mtu)
+ mtu = ip_next_mtu(ip->ip_len,
+ 1);
+ if (mtu < max(296, (tcp_minmss)
+ + sizeof(struct tcpiphdr)))
+ mtu = 0;
+ if (!mtu)
+ mtu = tcp_mssdflt
+ + sizeof(struct tcpiphdr);
+ /*
+					 * Only cache the MTU if it
+					 * is smaller than the interface
+					 * or route MTU.  tcp_mtudisc()
+					 * will do the right thing by itself.
+ */
+ if (mtu <= tcp_maxmtu(&inc, NULL))
+ tcp_hc_updatemtu(&inc, mtu);
+ }
+
+ inp = (*notify)(inp, inetctlerrmap[cmd]);
+ }
+ }
+ if (inp != NULL)
+ INP_UNLOCK(inp);
+ } else {
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc_faddr = faddr;
+ inc.inc_laddr = ip->ip_src;
+#ifdef INET6
+ inc.inc_isipv6 = 0;
+#endif
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ } else
+ in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
+}
+
+#ifdef INET6
+void
+tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
+{
+ struct tcphdr th;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct ip6_hdr *ip6;
+ struct mbuf *m;
+ struct ip6ctlparam *ip6cp = NULL;
+ const struct sockaddr_in6 *sa6_src = NULL;
+ int off;
+ struct tcp_portonly {
+ u_int16_t th_sport;
+ u_int16_t th_dport;
+ } *thp;
+
+ if (sa->sa_family != AF_INET6 ||
+ sa->sa_len != sizeof(struct sockaddr_in6))
+ return;
+
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (!PRC_IS_REDIRECT(cmd) &&
+ ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
+ return;
+	/* Source quench is deprecated. */
+ else if (cmd == PRC_QUENCH)
+ return;
+
+ /* if the parameter is from icmp6, decode it. */
+ if (d != NULL) {
+ ip6cp = (struct ip6ctlparam *)d;
+ m = ip6cp->ip6c_m;
+ ip6 = ip6cp->ip6c_ip6;
+ off = ip6cp->ip6c_off;
+ sa6_src = ip6cp->ip6c_src;
+ } else {
+ m = NULL;
+ ip6 = NULL;
+ off = 0; /* fool gcc */
+ sa6_src = &sa6_any;
+ }
+
+ if (ip6 != NULL) {
+ struct in_conninfo inc;
+ /*
+		 * XXX: We assume that when IPV6 is non-NULL,
+ * M and OFF are valid.
+ */
+
+ /* check if we can safely examine src and dst ports */
+ if (m->m_pkthdr.len < off + sizeof(*thp))
+ return;
+
+ bzero(&th, sizeof(th));
+ m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+
+ in6_pcbnotify(&tcbinfo, sa, th.th_dport,
+ (struct sockaddr *)ip6cp->ip6c_src,
+ th.th_sport, cmd, NULL, notify);
+
+ inc.inc_fport = th.th_dport;
+ inc.inc_lport = th.th_sport;
+ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_isipv6 = 1;
+ INP_INFO_WLOCK(&tcbinfo);
+ syncache_unreach(&inc, &th);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ } else
+ in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
+ 0, cmd, NULL, notify);
+}
+#endif /* INET6 */
+
+
+/*
+ * Following is where TCP initial sequence number generation occurs.
+ *
+ * There are two places where we must use initial sequence numbers:
+ * 1. In SYN-ACK packets.
+ * 2. In SYN packets.
+ *
+ * All ISNs for SYN-ACK packets are generated by the syncache. See
+ * tcp_syncache.c for details.
+ *
+ * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
+ * depends on this property. In addition, these ISNs should be
+ * unguessable so as to prevent connection hijacking. To satisfy
+ * the requirements of this situation, the algorithm outlined in
+ * RFC 1948 is used, with only small modifications.
+ *
+ * Implementation details:
+ *
+ * Time is based off the system timer, and is corrected so that it
+ * increases by one megabyte per second. This allows for proper
+ * recycling on high speed LANs while still leaving over an hour
+ * before rollover.
+ *
+ * As reading the *exact* system time is too expensive to be done
+ * whenever setting up a TCP connection, we increment the time
+ * offset in two ways. First, a small random positive increment
+ * is added to isn_offset for each connection that is set up.
+ * Second, the function tcp_isn_tick fires once per clock tick
+ * and increments isn_offset as necessary so that sequence numbers
+ * are incremented at approximately ISN_BYTES_PER_SECOND. The
+ * random positive increments serve only to ensure that the same
+ * exact sequence number is never sent out twice (as could otherwise
+ * happen when a port is recycled in less than the system tick
+ * interval.)
+ *
+ * net.inet.tcp.isn_reseed_interval controls the number of seconds
+ * between seeding of isn_secret. This is normally set to zero,
+ * as reseeding should not be necessary.
+ *
+ * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
+ * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
+ * general, this means holding an exclusive (write) lock.
+ */
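/*
 * Illustrative sketch, not from this file: the scheme described above is
 * the stock tcp_new_isn() algorithm, roughly (names abbreviated):
 *
 *	if (reseed_interval_expired)
 *		read_random(&isn_secret, sizeof(isn_secret));
 *	new_isn = MD5(laddr, faddr, lport, fport, isn_secret);
 *	isn_offset += ISN_STATIC_INCREMENT +
 *	    (arc4random() & ISN_RANDOM_INCREMENT);
 *	new_isn += isn_offset;
 *
 * tcp_isn_tick() then advances isn_offset so that the sequence space
 * grows at about ISN_BYTES_PER_SECOND even when no connections are
 * being created.
 */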
+
+#define ISN_BYTES_PER_SECOND 1048576
+#define ISN_STATIC_INCREMENT 4096
+#define ISN_RANDOM_INCREMENT (4096 - 1)
+
+
+/*
+ * When a specific ICMP unreachable message is received and the
+ * connection state is SYN-SENT, drop the connection. This behavior
+ * is controlled by the icmp_may_rst sysctl.
+ */
+static struct inpcb *
+cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if ((inp->inp_vflag & INP_TIMEWAIT) ||
+ (inp->inp_vflag & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ if (tp->t_state != TCPS_SYN_SENT)
+ return (inp);
+
+ tp = cxgb_tcp_drop(tp, errno);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
+}
+
+static int
+cxgb_sysctl_drop(SYSCTL_HANDLER_ARGS)
+{
+ /* addrs[0] is a foreign socket, addrs[1] is a local one. */
+ struct sockaddr_storage addrs[2];
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct tcptw *tw;
+ struct sockaddr_in *fin, *lin;
+#ifdef INET6
+ struct sockaddr_in6 *fin6, *lin6;
+ struct in6_addr f6, l6;
+#endif
+ int error;
+
+ inp = NULL;
+ fin = lin = NULL;
+#ifdef INET6
+ fin6 = lin6 = NULL;
+#endif
+ error = 0;
+
+ if (req->oldptr != NULL || req->oldlen != 0)
+ return (EINVAL);
+ if (req->newptr == NULL)
+ return (EPERM);
+ if (req->newlen < sizeof(addrs))
+ return (ENOMEM);
+ error = SYSCTL_IN(req, &addrs, sizeof(addrs));
+ if (error)
+ return (error);
+
+ switch (addrs[0].ss_family) {
+#ifdef INET6
+ case AF_INET6:
+ fin6 = (struct sockaddr_in6 *)&addrs[0];
+ lin6 = (struct sockaddr_in6 *)&addrs[1];
+ if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
+ lin6->sin6_len != sizeof(struct sockaddr_in6))
+ return (EINVAL);
+ if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
+ if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
+ return (EINVAL);
+ in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
+ in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
+ fin = (struct sockaddr_in *)&addrs[0];
+ lin = (struct sockaddr_in *)&addrs[1];
+ break;
+ }
+ error = sa6_embedscope(fin6, ip6_use_defzone);
+ if (error)
+ return (error);
+ error = sa6_embedscope(lin6, ip6_use_defzone);
+ if (error)
+ return (error);
+ break;
+#endif
+ case AF_INET:
+ fin = (struct sockaddr_in *)&addrs[0];
+ lin = (struct sockaddr_in *)&addrs[1];
+ if (fin->sin_len != sizeof(struct sockaddr_in) ||
+ lin->sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ break;
+ default:
+ return (EINVAL);
+ }
+ INP_INFO_WLOCK(&tcbinfo);
+ switch (addrs[0].ss_family) {
+#ifdef INET6
+ case AF_INET6:
+ inp = in6_pcblookup_hash(&tcbinfo, &fin6->sin6_addr,
+ fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
+ 0, NULL);
+ break;
+#endif
+ case AF_INET:
+ inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port,
+ lin->sin_addr, lin->sin_port, 0, NULL);
+ break;
+ }
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if (inp->inp_vflag & INP_TIMEWAIT) {
+ /*
+ * XXXRW: There currently exists a state where an
+ * inpcb is present, but its timewait state has been
+ * discarded. For now, don't allow dropping of this
+ * type of inpcb.
+ */
+ tw = intotw(inp);
+ if (tw != NULL)
+ tcp_twclose(tw, 0);
+ else
+ INP_UNLOCK(inp);
+ } else if (!(inp->inp_vflag & INP_DROPPED) &&
+ !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
+ tp = intotcpcb(inp);
+ tp = cxgb_tcp_drop(tp, ECONNABORTED);
+ if (tp != NULL)
+ INP_UNLOCK(inp);
+ } else
+ INP_UNLOCK(inp);
+ } else
+ error = ESRCH;
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_cxgb, TCPCTL_DROP, drop,
+ CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
+ 0, cxgb_sysctl_drop, "", "Drop TCP connection");
+
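The handler above is write-only: it expects the new value to be a pair of sockaddrs, foreign endpoint first, and rejects any read. Below is a hedged userland sketch of driving it, modeled on how tcpdrop(8) uses the stock equivalent; the "net.inet.tcp.cxgb.drop" OID string is an assumption about where the _net_inet_tcp_cxgb node attaches.

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>
    #include <string.h>

    /* Ask the kernel to drop the connection with these endpoints. */
    static int
    drop_connection(const struct sockaddr_in *foreign,
        const struct sockaddr_in *local)
    {
    	struct sockaddr_storage addrs[2];

    	memset(addrs, 0, sizeof(addrs));
    	memcpy(&addrs[0], foreign, sizeof(*foreign));	/* addrs[0]: foreign */
    	memcpy(&addrs[1], local, sizeof(*local));	/* addrs[1]: local */
    	/* oldp must be NULL: the handler returns EINVAL on reads. */
    	return (sysctlbyname("net.inet.tcp.cxgb.drop", NULL, NULL,
    	    addrs, sizeof(addrs)));
    }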
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c
new file mode 100644
index 0000000..bd940b2
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c
@@ -0,0 +1,1362 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2006-2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif /* INET6 */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#include <netinet6/scope6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+#include <netinet/tcp_offload.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+
+
+/*
+ * TCP protocol interface to socket abstraction.
+ */
+static int tcp_attach(struct socket *);
+static int tcp_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#ifdef INET6
+static int tcp6_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#endif /* INET6 */
+static void tcp_disconnect(struct tcpcb *);
+static void tcp_usrclosed(struct tcpcb *);
+
+#ifdef TCPDEBUG
+#define TCPDEBUG0 int ostate = 0
+#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
+#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
+ tcp_trace(TA_USER, ostate, tp, 0, 0, req)
+#else
+#define TCPDEBUG0
+#define TCPDEBUG1()
+#define TCPDEBUG2(req)
+#endif
+
+/*
+ * TCP attaches to socket via pru_attach(), reserving space,
+ * and an internet control block.
+ */
+static int
+tcp_usr_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
+ TCPDEBUG1();
+
+ error = tcp_attach(so);
+ if (error)
+ goto out;
+
+ if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ so->so_linger = TCP_LINGERTIME;
+
+ inp = sotoinpcb(so);
+ tp = intotcpcb(inp);
+out:
+ TCPDEBUG2(PRU_ATTACH);
+ return error;
+}
+
+/*
+ * tcp_detach is called when the socket layer loses its final reference
+ * to the socket, be it a file descriptor reference, a reference from TCP,
+ * etc. At this point, there is only one case in which we will keep around
+ * inpcb state: time wait.
+ *
+ * This function can probably be re-absorbed back into tcp_usr_detach() now
+ * that there is a single detach path.
+ */
+static void
+tcp_detach(struct socket *so, struct inpcb *inp)
+{
+ struct tcpcb *tp;
+#ifdef INET6
+ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
+ KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
+
+ tp = intotcpcb(inp);
+
+ if (inp->inp_vflag & INP_TIMEWAIT) {
+ /*
+ * There are two cases to handle: one in which the time wait
+ * state is being discarded (INP_DROPPED), and one in which
+ * this connection will remain in timewait. In the former,
+ * it is time to discard all state (except tcptw, which has
+ * already been discarded by the timewait close code, which
+ * should be further up the call stack somewhere). In the
+ * latter case, we detach from the socket, but leave the pcb
+ * present until timewait ends.
+ *
+ * XXXRW: Would it be cleaner to free the tcptw here?
+ */
+ if (inp->inp_vflag & INP_DROPPED) {
+ KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
+ "INP_DROPPED && tp != NULL"));
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ } else {
+#ifdef INET6
+ if (isipv6)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ INP_UNLOCK(inp);
+ }
+ } else {
+ /*
+ * If the connection is not in timewait, we consider two
+ * conditions: one in which no further processing is
+ * necessary (dropped || embryonic), and one in which TCP is
+ * not yet done, but no longer requires the socket, so the
+ * pcb will persist for the time being.
+ *
+ * XXXRW: Does the second case still occur?
+ */
+ if (inp->inp_vflag & INP_DROPPED ||
+ tp->t_state < TCPS_SYN_SENT) {
+ tcp_discardcb(tp);
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ } else {
+#ifdef INET6
+ if (isipv6)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ }
+ }
+}
+
+/*
+ * pru_detach() detaches the TCP protocol from the socket.
+ * If the protocol state is non-embryonic, then can't
+ * do this directly: have to initiate a pru_disconnect(),
+ * which may finish later; embryonic TCB's can just
+ * be discarded here.
+ */
+static void
+tcp_usr_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_detach: inp_socket == NULL"));
+ tcp_detach(so, inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * Give the socket an address.
+ */
+static int
+tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sinp;
+
+ sinp = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sinp))
+ return (EINVAL);
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ if (sinp->sin_family == AF_INET &&
+ IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ error = in_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in6 *sin6p;
+
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (nam->sa_len != sizeof (*sin6p))
+ return (EINVAL);
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ if (sin6p->sin6_family == AF_INET6 &&
+ IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
+ return (EAFNOSUPPORT);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
+ inp->inp_vflag |= INP_IPV4;
+ else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ error = in_pcbbind(inp, (struct sockaddr *)&sin,
+ td->td_ucred);
+ goto out;
+ }
+ }
+ error = in6_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Prepare to accept connections.
+ */
+static int
+tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0 && inp->inp_lport == 0)
+ error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error == 0) {
+ tp->t_state = TCPS_LISTEN;
+ solisten_proto(so, backlog);
+ tcp_gen_listen_open(tp);
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0 && inp->inp_lport == 0) {
+ inp->inp_vflag &= ~INP_IPV4;
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
+ inp->inp_vflag |= INP_IPV4;
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ }
+ if (error == 0) {
+ tp->t_state = TCPS_LISTEN;
+ solisten_proto(so, backlog);
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate connection to peer.
+ * Create a template for use in transmissions on this connection.
+ * Enter SYN_SENT state, and mark socket as connecting.
+ * Start keep-alive timer, and seed output sequence space.
+ * Send initial segment on connection.
+ */
+static int
+tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sinp;
+
+ sinp = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sinp))
+ return (EINVAL);
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ if (sinp->sin_family == AF_INET
+ && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+ if (jailed(td->td_ucred))
+ prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if ((error = tcp_connect(tp, nam, td)) != 0)
+ goto out;
+ printf("calling tcp_gen_connect\n");
+
+ error = tcp_gen_connect(so, nam);
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in6 *sin6p;
+
+ TCPDEBUG0;
+
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (nam->sa_len != sizeof (*sin6p))
+ return (EINVAL);
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ if (sin6p->sin6_family == AF_INET6
+ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
+ return (EAFNOSUPPORT);
+
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
+ goto out;
+ error = tcp_gen_connect(so, nam);
+ goto out;
+ }
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ inp->inp_inc.inc_isipv6 = 1;
+ if ((error = tcp6_connect(tp, nam, td)) != 0)
+ goto out;
+ error = tcp_gen_connect(so, nam);
+
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate disconnect from peer.
+ * If connection never passed embryonic stage, just drop;
+ * else if don't need to let data drain, then can just drop anyways,
+ * else have to begin TCP shutdown process: mark socket disconnecting,
+ * drain unread data, state switch to reflect user close, and
+ * send segment (e.g. FIN) to peer. Socket will be really disconnected
+ * when peer sends FIN and acks ours.
+ *
+ * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
+ */
+static int
+tcp_usr_disconnect(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error = 0;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+out:
+ TCPDEBUG2(PRU_DISCONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+/*
+ * Accept a connection. Essentially all the work is
+ * done at higher levels; just return the address
+ * of the peer, storing through addr.
+ */
+static int
+tcp_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ int error = 0;
+ struct inpcb *inp = NULL;
+ struct tcpcb *tp = NULL;
+ struct in_addr addr;
+ in_port_t port = 0;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in_getpeeraddr and COMMON_END here, so that we can
+ * copy the data of interest and defer the malloc until after we
+ * release the lock.
+ */
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_UNLOCK(inp);
+ if (error == 0)
+ *nam = in_sockaddr(port, &addr);
+ return error;
+}
+
+#ifdef INET6
+static int
+tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct inpcb *inp = NULL;
+ int error = 0;
+ struct tcpcb *tp = NULL;
+ struct in_addr addr;
+ struct in6_addr addr6;
+ in_port_t port = 0;
+ int v4 = 0;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
+ * copy the data of interest and defer the malloc until after we
+ * release the lock.
+ */
+ if (inp->inp_vflag & INP_IPV4) {
+ v4 = 1;
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+ } else {
+ port = inp->inp_fport;
+ addr6 = inp->in6p_faddr;
+ }
+
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_UNLOCK(inp);
+ if (error == 0) {
+ if (v4)
+ *nam = in6_v4mapsin6_sockaddr(port, &addr);
+ else
+ *nam = in6_sockaddr(port, &addr6);
+ }
+ return error;
+}
+#endif /* INET6 */
+
+/*
+ * Mark the connection as being incapable of further output.
+ */
+static int
+tcp_usr_shutdown(struct socket *so)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ socantsendmore(so);
+ tcp_usrclosed(tp);
+ error = tcp_gen_disconnect(tp);
+
+out:
+ TCPDEBUG2(PRU_SHUTDOWN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ return (error);
+}
+
+/*
+ * After a receive, possibly send window update to peer.
+ */
+static int
+tcp_usr_rcvd(struct socket *so, int flags)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error = 0;
+
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_gen_rcvd(tp);
+
+out:
+ TCPDEBUG2(PRU_RCVD);
+ INP_UNLOCK(inp);
+ return (error);
+}
+
+/*
+ * Do a send by putting data in output queue and updating urgent
+ * marker if URG set. Possibly send more data. Unlike the other
+ * pru_*() routines, the mbuf chains are our responsibility. We
+ * must either enqueue them or free them. The other pru_* routines
+ * generally are caller-frees.
+ */
+static int
+tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *nam, struct mbuf *control, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int headlocked = 0;
+#ifdef INET6
+ int isipv6;
+#endif
+ TCPDEBUG0;
+
+ /*
+ * We require the pcbinfo lock in two cases:
+ *
+ * (1) An implied connect is taking place, which can result in
+ * binding IPs and ports and hence modification of the pcb hash
+ * chains.
+ *
+ * (2) PRUS_EOF is set, resulting in explicit close on the send.
+ */
+ if ((nam != NULL) || (flags & PRUS_EOF)) {
+ INP_INFO_WLOCK(&tcbinfo);
+ headlocked = 1;
+ }
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ if (control)
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = ECONNRESET;
+ goto out;
+ }
+#ifdef INET6
+ isipv6 = nam && nam->sa_family == AF_INET6;
+#endif /* INET6 */
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (control) {
+ /* TCP doesn't do control messages (rights, creds, etc) */
+ if (control->m_len) {
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = EINVAL;
+ goto out;
+ }
+ m_freem(control); /* empty control, just free it */
+ }
+ if (!(flags & PRUS_OOB)) {
+ sbappendstream(&so->so_snd, m);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ }
+ if (flags & PRUS_EOF) {
+ /*
+ * Close the send side of the connection after
+ * the data is sent.
+ */
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ socantsendmore(so);
+ tcp_usrclosed(tp);
+ }
+ if (headlocked) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ }
+ if (tp != NULL) {
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags |= TF_MORETOCOME;
+ error = tcp_gen_send(tp);
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags &= ~TF_MORETOCOME;
+ }
+ } else {
+ /*
+ * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ if (sbspace(&so->so_snd) < -512) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section.
+ * Otherwise, snd_up should be one lower.
+ */
+ sbappendstream_locked(&so->so_snd, m);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ } else if (nam) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ }
+ tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
+ tp->t_flags |= TF_FORCEDATA;
+ error = tcp_gen_send(tp);
+ tp->t_flags &= ~TF_FORCEDATA;
+ }
+out:
+ TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
+ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
+ INP_UNLOCK(inp);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+/*
+ * Abort the TCP. Drop the connection abruptly.
+ */
+static void
+tcp_usr_abort(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_abort: inp_socket == NULL"));
+
+ /*
+ * If we still have full TCP state, and we're not dropped, drop.
+ */
+ if (!(inp->inp_vflag & INP_TIMEWAIT) &&
+ !(inp->inp_vflag & INP_DROPPED)) {
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ cxgb_tcp_drop(tp, ECONNABORTED);
+ TCPDEBUG2(PRU_ABORT);
+ }
+ if (!(inp->inp_vflag & INP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_vflag |= INP_SOCKREF;
+ }
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * TCP socket is closed. Start friendly disconnect.
+ */
+static void
+tcp_usr_close(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_close: inp_socket == NULL"));
+
+ /*
+ * If we still have full TCP state, and we're not dropped, initiate
+ * a disconnect.
+ */
+ if (!(inp->inp_vflag & INP_TIMEWAIT) &&
+ !(inp->inp_vflag & INP_DROPPED)) {
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+ TCPDEBUG2(PRU_CLOSE);
+ }
+ if (!(inp->inp_vflag & INP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_vflag |= INP_SOCKREF;
+ }
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * Receive out-of-band data.
+ */
+static int
+tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if ((so->so_oobmark == 0 &&
+ (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
+ so->so_options & SO_OOBINLINE ||
+ tp->t_oobflags & TCPOOB_HADDATA) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ m->m_len = 1;
+ *mtod(m, caddr_t) = tp->t_iobc;
+ if ((flags & MSG_PEEK) == 0)
+ tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+
+out:
+ TCPDEBUG2(PRU_RCVOOB);
+ INP_UNLOCK(inp);
+ return (error);
+}
+
+struct pr_usrreqs cxgb_tcp_usrreqs = {
+ .pru_abort = tcp_usr_abort,
+ .pru_accept = tcp_usr_accept,
+ .pru_attach = tcp_usr_attach,
+ .pru_bind = tcp_usr_bind,
+ .pru_connect = tcp_usr_connect,
+ .pru_control = in_control,
+ .pru_detach = tcp_usr_detach,
+ .pru_disconnect = tcp_usr_disconnect,
+ .pru_listen = tcp_usr_listen,
+ .pru_peeraddr = in_getpeeraddr,
+ .pru_rcvd = tcp_usr_rcvd,
+ .pru_rcvoob = tcp_usr_rcvoob,
+ .pru_send = tcp_usr_send,
+ .pru_shutdown = tcp_usr_shutdown,
+ .pru_sockaddr = in_getsockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = tcp_usr_close,
+};
+
+#ifdef INET6
+struct pr_usrreqs cxgb_tcp6_usrreqs = {
+ .pru_abort = tcp_usr_abort,
+ .pru_accept = tcp6_usr_accept,
+ .pru_attach = tcp_usr_attach,
+ .pru_bind = tcp6_usr_bind,
+ .pru_connect = tcp6_usr_connect,
+ .pru_control = in6_control,
+ .pru_detach = tcp_usr_detach,
+ .pru_disconnect = tcp_usr_disconnect,
+ .pru_listen = tcp6_usr_listen,
+ .pru_peeraddr = in6_mapped_peeraddr,
+ .pru_rcvd = tcp_usr_rcvd,
+ .pru_rcvoob = tcp_usr_rcvoob,
+ .pru_send = tcp_usr_send,
+ .pru_shutdown = tcp_usr_shutdown,
+ .pru_sockaddr = in6_mapped_sockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = tcp_usr_close,
+};
+#endif /* INET6 */
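These tables substitute the offload-aware entry points wherever the socket layer dispatches through pr_usrreqs; the actual registration lives in cxgb_tcp_subr.c rather than this file. Schematically the wiring amounts to something like the sketch below, with the flag values assumed from the stock TCP protosw.

    /* Sketch only; the real definition is in cxgb_tcp_subr.c. */
    struct protosw cxgb_tcp_protosw = {
    	.pr_type =	SOCK_STREAM,
    	.pr_protocol =	IPPROTO_TCP,
    	.pr_flags =	PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD,
    	.pr_usrreqs =	&cxgb_tcp_usrreqs,
    };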
+
+/*
+ * Common subroutine to open a TCP connection to remote host specified
+ * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
+ * port number if needed. Call in_pcbconnect_setup to do the routing and
+ * to choose a local host address (interface). If there is an existing
+ * incarnation of the same connection in TIME-WAIT state and if the remote
+ * host was sending CC options and if the connection duration was < MSL, then
+ * truncate the previous TIME-WAIT state and proceed.
+ * Initialize connection parameters and enter SYN-SENT state.
+ */
+static int
+tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct in_addr laddr;
+ u_short lport;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (inp->inp_lport == 0) {
+ error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ */
+ laddr = inp->inp_laddr;
+ lport = inp->inp_lport;
+ error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
+ &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
+ if (error && oinp == NULL)
+ return error;
+ if (oinp)
+ return EADDRINUSE;
+ inp->inp_laddr = laddr;
+ in_pcbrehash(inp);
+
+ /*
+ * Compute window scaling to request:
+ * Scale to fit into sweet spot. See tcp_syncache.c.
+ * XXX: This should move to tcp_output().
+ */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < sb_max)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ tcpstat.tcps_connattempt++;
+ tp->t_state = TCPS_SYN_SENT;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
+ tcp_sendseqinit(tp);
+
+ return 0;
+}
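To make the window-scale loop concrete: assuming sb_max holds its traditional default of 262144 bytes, the loop stops at request_r_scale = 3, because 65535 << 2 = 262140 is still below sb_max while 65535 << 3 = 524280 is not. The same computation as a standalone program:

    #include <stdio.h>

    #define TCP_MAXWIN		65535
    #define TCP_MAX_WINSHIFT	14

    int
    main(void)
    {
    	unsigned long sb_max = 262144;	/* assumed kern.ipc.maxsockbuf */
    	int scale = 0;

    	while (scale < TCP_MAX_WINSHIFT &&
    	    ((unsigned long)TCP_MAXWIN << scale) < sb_max)
    		scale++;
    	printf("request_r_scale = %d\n", scale);	/* prints 3 here */
    	return (0);
    }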
+
+#ifdef INET6
+static int
+tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
+ struct in6_addr *addr6;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (inp->inp_lport == 0) {
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ * in6_pcbladdr() also handles scope zone IDs.
+ */
+ error = in6_pcbladdr(inp, nam, &addr6);
+ if (error)
+ return error;
+ oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
+ &sin6->sin6_addr, sin6->sin6_port,
+ IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
+ ? addr6
+ : &inp->in6p_laddr,
+ inp->inp_lport, 0, NULL);
+ if (oinp)
+ return EADDRINUSE;
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+ inp->in6p_laddr = *addr6;
+ inp->in6p_faddr = sin6->sin6_addr;
+ inp->inp_fport = sin6->sin6_port;
+ /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
+ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
+ if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
+ inp->in6p_flowinfo |=
+ (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
+ in_pcbrehash(inp);
+
+ /* Compute window scaling to request. */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ tcpstat.tcps_connattempt++;
+ tp->t_state = TCPS_SYN_SENT;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
+ tcp_sendseqinit(tp);
+
+ return 0;
+}
+#endif /* INET6 */
+
+/*
+ * tcp_sendspace and tcp_recvspace are the default send and receive window
+ * sizes, respectively. These are obsolescent (this information should
+ * be set by the route).
+ */
+u_long tcp_sendspace = 1024*32;
+SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+ &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
+u_long tcp_recvspace = 1024*64;
+SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+ &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
+
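Both knobs are CTLFLAG_RW, so they can be retuned on a running system (presumably as net.inet.tcp.cxgb.sendspace and net.inet.tcp.cxgb.recvspace, assuming the node attaches under net.inet.tcp). Note that tcp_attach() below samples them in soreserve() at socket creation, so a change only affects sockets opened afterwards.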
+/*
+ * Attach TCP protocol to socket, allocating
+ * internet protocol control block, tcp control block,
+ * buffer space, and entering LISTEN state if it is to accept connections.
+ */
+static int
+tcp_attach(struct socket *so)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ int error;
+#ifdef INET6
+ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ error = soreserve(so, tcp_sendspace, tcp_recvspace);
+ if (error)
+ return (error);
+ }
+ so->so_rcv.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= SB_AUTOSIZE;
+ INP_INFO_WLOCK(&tcbinfo);
+ error = in_pcballoc(so, &tcbinfo);
+ if (error) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+ }
+ inp = sotoinpcb(so);
+#ifdef INET6
+ if (isipv6) {
+ inp->inp_vflag |= INP_IPV6;
+ inp->in6p_hops = -1; /* use kernel default */
+ }
+ else
+#endif
+ inp->inp_vflag |= INP_IPV4;
+ tp = tcp_newtcpcb(inp);
+ if (tp == NULL) {
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (ENOBUFS);
+ }
+ tp->t_state = TCPS_CLOSED;
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (0);
+}
+
+/*
+ * Initiate (or continue) disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static void
+tcp_disconnect(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ /*
+ * Neither tcp_close() nor tcp_drop() should return NULL, as the
+ * socket is still open.
+ */
+ if (tp->t_state < TCPS_ESTABLISHED) {
+ tp = cxgb_tcp_close(tp);
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_close() returned NULL"));
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
+ tp = cxgb_tcp_drop(tp, 0);
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_drop() returned NULL"));
+ } else {
+ soisdisconnecting(so);
+ sbflush(&so->so_rcv);
+ tcp_usrclosed(tp);
+ if (!(inp->inp_vflag & INP_DROPPED))
+ tcp_gen_disconnect(tp);
+ }
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it. If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state. In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static void
+tcp_usrclosed(struct tcpcb *tp)
+{
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ switch (tp->t_state) {
+ case TCPS_LISTEN:
+ tcp_gen_listen_close(tp);
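+ /* FALLTHROUGH */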
+ case TCPS_CLOSED:
+ tp->t_state = TCPS_CLOSED;
+ tp = cxgb_tcp_close(tp);
+ /*
+ * tcp_close() should never return NULL here as the socket is
+ * still open.
+ */
+ KASSERT(tp != NULL,
+ ("tcp_usrclosed: tcp_close() returned NULL"));
+ break;
+
+ case TCPS_SYN_SENT:
+ case TCPS_SYN_RECEIVED:
+ tp->t_flags |= TF_NEEDFIN;
+ break;
+
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_FIN_WAIT_1;
+ break;
+
+ case TCPS_CLOSE_WAIT:
+ tp->t_state = TCPS_LAST_ACK;
+ break;
+ }
+ if (tp->t_state >= TCPS_FIN_WAIT_2) {
+ soisdisconnected(tp->t_inpcb->inp_socket);
+ /* Prevent the connection hanging in FIN_WAIT_2 forever. */
+ if (tp->t_state == TCPS_FIN_WAIT_2) {
+ int timeout;
+
+ timeout = (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : tcp_maxidle;
+ tcp_timer_activate(tp, TT_2MSL, timeout);
+ }
+ }
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
index 2dc6150..b5b87b7 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
@@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <netinet/tcp_fsm.h>
#include <net/route.h>
@@ -77,6 +77,8 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+
static int activated = 1;
TUNABLE_INT("hw.t3toe.activated", &activated);
@@ -177,6 +179,8 @@ toepcb_release(struct toepcb *toep)
static void
t3cdev_add(struct tom_data *t)
{
+ printf("t3cdev_add\n");
+
mtx_lock(&cxgb_list_lock);
TAILQ_INSERT_TAIL(&cxgb_list, t, entry);
mtx_unlock(&cxgb_list_lock);
@@ -187,7 +191,8 @@ t3cdev_add(struct tom_data *t)
* initialize its cpl_handlers
* and register it as a T3C client
*/
-static void t3c_tom_add(struct t3cdev *cdev)
+static void
+t3c_tom_add(struct t3cdev *cdev)
{
int i;
unsigned int wr_len;
@@ -195,9 +200,12 @@ static void t3c_tom_add(struct t3cdev *cdev)
struct toedev *tdev;
struct adap_ports *port_info;
+ printf("%s called\n", __FUNCTION__);
+
+
t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
- if (!t)
+ if (t == NULL)
return;
if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0)
@@ -226,11 +234,15 @@ static void t3c_tom_add(struct t3cdev *cdev)
}
TOM_DATA(tdev) = t;
+ printf("nports=%d\n", port_info->nports);
for (i = 0; i < port_info->nports; i++) {
struct ifnet *ifp = port_info->lldevs[i];
TOEDEV(ifp) = tdev;
+
+ printf("enabling toe on %p\n", ifp);
- ifp->if_capabilities |= IFCAP_TOE;
+ ifp->if_capabilities |= IFCAP_TOE4;
+ ifp->if_capenable |= IFCAP_TOE4;
}
t->ports = port_info;
@@ -242,8 +254,10 @@ static void t3c_tom_add(struct t3cdev *cdev)
return;
out_free_all:
+ printf("out_free_all fail\n");
free(port_info, M_CXGB);
out_free_tom:
+ printf("out_free_tom fail\n");
free(t, M_CXGB);
return;
}
@@ -293,8 +307,8 @@ can_offload(struct toedev *dev, struct socket *so)
atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn);
}
-
-static int tom_ctl(struct toedev *dev, unsigned int req, void *data)
+static int
+tom_ctl(struct toedev *dev, unsigned int req, void *data)
{
struct tom_data *t = TOM_DATA(dev);
struct t3cdev *cdev = t->cdev;
@@ -377,32 +391,33 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
}
static void
-cxgb_toe_listen(void *unused, int event, struct tcpcb *tp)
+cxgb_toe_listen_start(void *unused, struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
struct tom_data *p;
+
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_FOREACH(p, &cxgb_list, entry) {
+ t3_listen_start(&p->tdev, so, p->cdev);
+ }
+ mtx_unlock(&cxgb_list_lock);
+}
- switch (event) {
- case OFLD_LISTEN_OPEN:
- case OFLD_LISTEN_CLOSE:
- mtx_lock(&cxgb_list_lock);
- TAILQ_FOREACH(p, &cxgb_list, entry) {
- if (event == OFLD_LISTEN_OPEN)
- t3_listen_start(&p->tdev, so, p->cdev);
- else if (tp->t_state == TCPS_LISTEN) {
- printf("stopping listen on port=%d\n",
- ntohs(tp->t_inpcb->inp_lport));
-
- t3_listen_stop(&p->tdev, so, p->cdev);
- }
-
+static void
+cxgb_toe_listen_stop(void *unused, struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ struct tom_data *p;
+
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_FOREACH(p, &cxgb_list, entry) {
+ if (tp->t_state == TCPS_LISTEN) {
+ printf("stopping listen on port=%d\n",
+ ntohs(tp->t_inpcb->inp_lport));
+ t3_listen_stop(&p->tdev, so, p->cdev);
}
- mtx_unlock(&cxgb_list_lock);
- break;
- default:
- log(LOG_ERR, "unrecognized listen event %d\n", event);
- break;
}
+ mtx_unlock(&cxgb_list_lock);
}
static void
@@ -416,7 +431,7 @@ cxgb_register_listeners(void)
tp = intotcpcb(inp);
if (tp->t_state == TCPS_LISTEN)
- cxgb_toe_listen(NULL, OFLD_LISTEN_OPEN, tp);
+ cxgb_toe_listen_start(NULL, tp);
}
INP_INFO_RUNLOCK(&tcbinfo);
}
@@ -450,12 +465,19 @@ t3_tom_init(void)
"Unable to register Chelsio T3 TCP offload module.\n");
return -1;
}
+ INP_INFO_WLOCK(&tcbinfo);
+
+ INP_INFO_WUNLOCK(&tcbinfo);
mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF);
- listen_tag = EVENTHANDLER_REGISTER(ofld_listen, cxgb_toe_listen, NULL, EVENTHANDLER_PRI_ANY);
+ listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
+ cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY);
+ listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
+ cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY);
TAILQ_INIT(&cxgb_list);
/* Register to offloading devices */
+ printf("setting add to %p\n", t3c_tom_add);
t3c_tom_client.add = t3c_tom_add;
cxgb_register_client(&t3c_tom_client);
cxgb_register_listeners();
diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile
index 120cc9b..ef633e7 100644
--- a/sys/modules/cxgb/Makefile
+++ b/sys/modules/cxgb/Makefile
@@ -1,7 +1,7 @@
# $FreeBSD$
SUBDIR= cxgb
SUBDIR+= toecore
-#SUBDIR+= tom
+SUBDIR+= tom
#SUBDIR+= iw_cxgb
.include <bsd.subdir.mk>
diff --git a/sys/modules/cxgb/cxgb/Makefile b/sys/modules/cxgb/cxgb/Makefile
index b8455f1..1f41ac2 100644
--- a/sys/modules/cxgb/cxgb/Makefile
+++ b/sys/modules/cxgb/cxgb/Makefile
@@ -8,11 +8,11 @@ SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c
SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c
SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c cxgb_l2t.c
SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h opt_sched.h
-SRCS+= uipc_mvec.c
-#SRCS+= cxgb_multiq.c cxgb_support.c
+SRCS+= uipc_mvec.c cxgb_support.c
+#SRCS+= cxgb_multiq.c
CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} -DSMP
-CFLAGS+= -DDISABLE_MBUF_IOVEC
+#CFLAGS+= -DDISABLE_MBUF_IOVEC
#CFLAGS+= -DIFNET_MULTIQUEUE
#CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS
#CFLAGS+= -DWITNESS
diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile
index ece891ce..ba02b91 100644
--- a/sys/modules/cxgb/tom/Makefile
+++ b/sys/modules/cxgb/tom/Makefile
@@ -4,5 +4,9 @@ TOM = ${.CURDIR}/../../../dev/cxgb/ulp/tom
KMOD= tom
SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
-SRCS+= device_if.h bus_if.h pci_if.h
-.include <bsd.kmod.mk> \ No newline at end of file
+SRCS+= cxgb_tcp_subr.c cxgb_tcp_usrreq.c
+SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h opt_tcpdebug.h opt_ddb.h
+SRCS+= device_if.h bus_if.h pci_if.h
+
+#CFLAGS+= -DDEBUG_PRINT -DDEBUG
+.include <bsd.kmod.mk>
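With the tom subdirectory re-enabled and the new sources listed, the module builds along with the rest of the cxgb tree; at runtime it is expected to be loaded after the cxgb and toecore modules, since t3_tom_init() registers itself as a T3C client with the core driver and then walks the existing listeners.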