-rw-r--r--  sys/conf/files                       |    2
-rw-r--r--  sys/dev/cxgbe/adapter.h              |  116
-rw-r--r--  sys/dev/cxgbe/t4_l2t.c               |    9
-rw-r--r--  sys/dev/cxgbe/t4_main.c              |  264
-rw-r--r--  sys/dev/cxgbe/t4_mp_ring.c           |  364
-rw-r--r--  sys/dev/cxgbe/t4_mp_ring.h           |   68
-rw-r--r--  sys/dev/cxgbe/t4_sge.c               | 1994
-rw-r--r--  sys/modules/cxgbe/if_cxgbe/Makefile  |    1
8 files changed, 1686 insertions(+), 1132 deletions(-)
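The heart of this change is t4_mp_ring.[ch]: a lock-free, multi-producer ring with a single opportunistic consumer, replacing the buf_ring + queue-lock scheme in the tx path. Before the diff proper, here is a minimal sketch of how a driver sits on top of that API. The mp_ring_* signatures and the cidx/pidx contract of the drain callback come from t4_mp_ring.h and the eth_tx() comment in the t4_sge.c hunks below; example_softc, example_hw_tx(), example_hw_full() and the use of M_DEVBUF are illustrative assumptions, not part of the patch.

/*
 * Consumer callback: r->items[cidx] through r->items[pidx - 1], wrapping
 * at r->size, are ready.  Return the number actually consumed; returning
 * early (or 0) moves the ring to the STALLED state.
 */
static u_int
example_drain(struct mp_ring *r, u_int cidx, u_int pidx)
{
	struct example_softc *sc = r->cookie;	/* hypothetical softc */
	u_int n = 0;

	while (cidx != pidx) {
		if (example_hw_full(sc))
			break;			/* out of hw resources */
		example_hw_tx(sc, r->items[cidx]);
		if (__predict_false(++cidx == r->size))
			cidx = 0;
		n++;
	}
	return (n);
}

/* Cheap, possibly unreliable check used to restart a STALLED ring. */
static u_int
example_can_drain(struct mp_ring *r)
{

	return (!example_hw_full(r->cookie));
}

/* Allocation, and a producer-side enqueue as in cxgbe_transmit() below: */
	rc = mp_ring_alloc(&sc->ring, 1024, sc, example_drain,
	    example_can_drain, M_DEVBUF, M_WAITOK);
	...
	items[0] = m;
	rc = mp_ring_enqueue(sc->ring, items, 1, 4096);	/* 4096 = budget */

A stalled ring is restarted via mp_ring_check_drainage(), and teardown paths spin on mp_ring_is_idle(), as cxgbe_qflush() and quiesce_txq() do in the t4_main.c hunks below.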
diff --git a/sys/conf/files b/sys/conf/files index 3884c11..9e55f42 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1142,6 +1142,8 @@ dev/cxgb/sys/uipc_mvec.c optional cxgb pci \ compile-with "${NORMAL_C} -I$S/dev/cxgb" dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw \ compile-with "${NORMAL_C} -I$S/dev/cxgb" +dev/cxgbe/t4_mp_ring.c optional cxgbe pci \ + compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_main.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_netmap.c optional cxgbe pci \ diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index ec84bb4..62ff9af 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -152,7 +152,8 @@ enum { CL_METADATA_SIZE = CACHE_LINE_SIZE, SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ - TX_SGL_SEGS = 36, + TX_SGL_SEGS = 39, + TX_SGL_SEGS_TSO = 38, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; @@ -273,6 +274,7 @@ struct port_info { struct timeval last_refreshed; struct port_stats stats; u_int tnl_cong_drops; + u_int tx_parse_error; eventhandler_tag vlan_c; @@ -308,23 +310,9 @@ struct tx_desc { __be64 flit[8]; }; -struct tx_map { - struct mbuf *m; - bus_dmamap_t map; -}; - -/* DMA maps used for tx */ -struct tx_maps { - struct tx_map *maps; - uint32_t map_total; /* # of DMA maps */ - uint32_t map_pidx; /* next map to be used */ - uint32_t map_cidx; /* reclaimed up to this index */ - uint32_t map_avail; /* # of available maps */ -}; - struct tx_sdesc { + struct mbuf *m; /* m_nextpkt linked chain of frames */ uint8_t desc_used; /* # of hardware descriptors used by the WR */ - uint8_t credits; /* NIC txq: # of frames sent out in the WR */ }; @@ -378,16 +366,12 @@ struct sge_iq { enum { EQ_CTRL = 1, EQ_ETH = 2, -#ifdef TCP_OFFLOAD EQ_OFLD = 3, -#endif /* eq flags */ - EQ_TYPEMASK = 7, /* 3 lsbits hold the type */ - EQ_ALLOCATED = (1 << 3), /* firmware resources allocated */ - EQ_DOOMED = (1 << 4), /* about to be destroyed */ - EQ_CRFLUSHED = (1 << 5), /* expecting an update from SGE */ - EQ_STALLED = (1 << 6), /* out of hw descriptors or dmamaps */ + EQ_TYPEMASK = 0x3, /* 2 lsbits hold the type (see above) */ + EQ_ALLOCATED = (1 << 2), /* firmware resources allocated */ + EQ_ENABLED = (1 << 3), /* open for business */ }; /* Listed in order of preference. 
Update t4_sysctls too if you change these */ @@ -402,32 +386,25 @@ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB}; struct sge_eq { unsigned int flags; /* MUST be first */ unsigned int cntxt_id; /* SGE context id for the eq */ - bus_dma_tag_t desc_tag; - bus_dmamap_t desc_map; - char lockname[16]; struct mtx eq_lock; struct tx_desc *desc; /* KVA of descriptor ring */ - bus_addr_t ba; /* bus address of descriptor ring */ - struct sge_qstat *spg; /* status page, for convenience */ uint16_t doorbells; volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ - uint16_t cap; /* max # of desc, for convenience */ - uint16_t avail; /* available descriptors, for convenience */ - uint16_t qsize; /* size (# of entries) of the queue */ + uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer idx (desc idx) */ uint16_t pidx; /* producer idx (desc idx) */ - uint16_t pending; /* # of descriptors used since last doorbell */ + uint16_t equeqidx; /* EQUEQ last requested at this pidx */ + uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t iqid; /* iq that gets egr_update for the eq */ uint8_t tx_chan; /* tx channel used by the eq */ - struct task tx_task; - struct callout tx_callout; - - /* stats */ + volatile u_int equiq; /* EQUIQ outstanding */ - uint32_t egr_update; /* # of SGE_EGR_UPDATE notifications for eq */ - uint32_t unstalled; /* recovered from stall */ + bus_dma_tag_t desc_tag; + bus_dmamap_t desc_map; + bus_addr_t ba; /* bus address of descriptor ring */ + char lockname[16]; }; struct sw_zone_info { @@ -499,18 +476,19 @@ struct sge_fl { struct cluster_layout cll_alt; /* alternate refill zone, layout */ }; +struct mp_ring; + /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ struct ifnet *ifp; /* the interface this txq belongs to */ - bus_dma_tag_t tx_tag; /* tag for transmit buffers */ - struct buf_ring *br; /* tx buffer ring */ + struct mp_ring *r; /* tx software ring */ struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ - struct mbuf *m; /* held up due to temporary resource shortage */ - - struct tx_maps txmaps; + struct sglist *gl; + __be32 cpl_ctrl0; /* for convenience */ + struct task tx_reclaim_task; /* stats for common events first */ uint64_t txcsum; /* # of times hardware assisted with checksum */ @@ -519,13 +497,12 @@ struct sge_txq { uint64_t imm_wrs; /* # of work requests with immediate data */ uint64_t sgl_wrs; /* # of work requests with direct SGL */ uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ - uint64_t txpkts_wrs; /* # of coalesced tx work requests */ - uint64_t txpkts_pkts; /* # of frames in coalesced tx work requests */ + uint64_t txpkts0_wrs; /* # of type0 coalesced tx work requests */ + uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */ + uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ + uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ /* stats for not-that-common events */ - - uint32_t no_dmamap; /* no DMA map to load the mbuf */ - uint32_t no_desc; /* out of hardware descriptors */ } __aligned(CACHE_LINE_SIZE); /* rxq: SGE ingress queue + SGE free list + miscellaneous items */ @@ -574,7 +551,13 @@ struct wrqe { STAILQ_ENTRY(wrqe) link; struct sge_wrq *wrq; int wr_len; - uint64_t wr[] __aligned(16); + char wr[] __aligned(16); +}; + +struct wrq_cookie { + TAILQ_ENTRY(wrq_cookie) link; + int ndesc; + 
int pidx; }; /* @@ -585,17 +568,32 @@ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; + struct task wrq_tx_task; + + /* Tx desc reserved but WR not "committed" yet. */ + TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs; - /* List of WRs held up due to lack of tx descriptors */ + /* List of WRs ready to go out as soon as descriptors are available. */ STAILQ_HEAD(, wrqe) wr_list; + u_int nwr_pending; + u_int ndesc_needed; /* stats for common events first */ - uint64_t tx_wrs; /* # of tx work requests */ + uint64_t tx_wrs_direct; /* # of WRs written directly to desc ring. */ + uint64_t tx_wrs_ss; /* # of WRs copied from scratch space. */ + uint64_t tx_wrs_copied; /* # of WRs queued and copied to desc ring. */ /* stats for not-that-common events */ - uint32_t no_desc; /* out of hardware descriptors */ + /* + * Scratch space for work requests that wrap around after reaching the + * status page, and some information about the last WR that used it. + */ + uint16_t ss_pidx; + uint16_t ss_len; + uint8_t ss[SGE_MAX_WR_LEN]; + } __aligned(CACHE_LINE_SIZE); @@ -744,7 +742,7 @@ struct adapter { struct sge sge; int lro_timeout; - struct taskqueue *tq[NCHAN]; /* taskqueues that flush data out */ + struct taskqueue *tq[NCHAN]; /* General purpose taskqueues */ struct port_info *port[MAX_NPORTS]; uint8_t chan_map[NCHAN]; @@ -978,12 +976,11 @@ static inline int tx_resume_threshold(struct sge_eq *eq) { - return (eq->qsize / 4); + /* not quite the same as qsize / 4, but this will do. */ + return (eq->sidx / 4); } /* t4_main.c */ -void t4_tx_task(void *, int); -void t4_tx_callout(void *); int t4_os_find_pci_capability(struct adapter *, int); int t4_os_pci_save_state(struct adapter *); int t4_os_pci_restore_state(struct adapter *); @@ -1024,16 +1021,15 @@ int t4_setup_adapter_queues(struct adapter *); int t4_teardown_adapter_queues(struct adapter *); int t4_setup_port_queues(struct port_info *); int t4_teardown_port_queues(struct port_info *); -int t4_alloc_tx_maps(struct tx_maps *, bus_dma_tag_t, int, int); -void t4_free_tx_maps(struct tx_maps *, bus_dma_tag_t); void t4_intr_all(void *); void t4_intr(void *); void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); -int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *); void t4_update_fl_bufsize(struct ifnet *); -int can_resume_tx(struct sge_eq *); +int parse_pkt(struct mbuf **); +void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); +void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *); /* t4_tracer.c */ struct t4_tracer; diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 6f7378a..cca1bf3 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -113,16 +113,15 @@ found: int t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) { - struct wrqe *wr; + struct wrq_cookie cookie; struct cpl_l2t_write_req *req; int idx = e->idx + sc->vres.l2t.start; mtx_assert(&e->lock, MA_OWNED); - wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq); - if (wr == NULL) + req = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*req), 16), &cookie); + if (req == NULL) return (ENOMEM); - req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, idx | @@ -132,7 +131,7 @@ t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) req->vlan = htons(e->vlan); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); - t4_wrq_tx(sc, wr); + commit_wrq_wr(&sc->sge.mgmtq, req, &cookie); if
(sync && e->state != L2T_STATE_SWITCHING) e->state = L2T_STATE_SYNC_WRITE; diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 2c384fd..39dc816 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs_values.h" #include "t4_ioctl.h" #include "t4_l2t.h" +#include "t4_mp_ring.h" /* T4 bus driver interface */ static int t4_probe(device_t); @@ -378,7 +379,8 @@ static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct port_info *); static int cxgbe_uninit_synchronized(struct port_info *); static int setup_intr_handlers(struct adapter *); -static void quiesce_eq(struct adapter *, struct sge_eq *); +static void quiesce_txq(struct adapter *, struct sge_txq *); +static void quiesce_wrq(struct adapter *, struct sge_wrq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, @@ -434,7 +436,6 @@ static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); #endif -static inline void txq_start(struct ifnet *, struct sge_txq *); static uint32_t fconf_to_mode(uint32_t); static uint32_t mode_to_fconf(uint32_t); static uint32_t fspec_to_fconf(struct t4_filter_specification *); @@ -1429,67 +1430,36 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; - struct sge_txq *txq = &sc->sge.txq[pi->first_txq]; - struct buf_ring *br; + struct sge_txq *txq; + void *items[1]; int rc; M_ASSERTPKTHDR(m); + MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ if (__predict_false(pi->link_cfg.link_ok == 0)) { m_freem(m); return (ENETDOWN); } - /* check if flowid is set */ - if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) - txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) - + pi->rsrv_noflowq); - br = txq->br; - - if (TXQ_TRYLOCK(txq) == 0) { - struct sge_eq *eq = &txq->eq; - - /* - * It is possible that t4_eth_tx finishes up and releases the - * lock between the TRYLOCK above and the drbr_enqueue here. We - * need to make sure that this mbuf doesn't just sit there in - * the drbr. - */ - - rc = drbr_enqueue(ifp, br, m); - if (rc == 0 && callout_pending(&eq->tx_callout) == 0 && - !(eq->flags & EQ_DOOMED)) - callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); + rc = parse_pkt(&m); + if (__predict_false(rc != 0)) { + MPASS(m == NULL); /* was freed already */ + atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } - /* - * txq->m is the mbuf that is held up due to a temporary shortage of - * resources and it should be put on the wire first. Then what's in - * drbr and finally the mbuf that was just passed in to us. - * - * Return code should indicate the fate of the mbuf that was passed in - * this time. - */ - - TXQ_LOCK_ASSERT_OWNED(txq); - if (drbr_needs_enqueue(ifp, br) || txq->m) { - - /* Queued for transmission. */ - - rc = drbr_enqueue(ifp, br, m); - m = txq->m ? txq->m : drbr_dequeue(ifp, br); - (void) t4_eth_tx(ifp, txq, m); - TXQ_UNLOCK(txq); - return (rc); - } + /* Select a txq. */ + txq = &sc->sge.txq[pi->first_txq]; + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) + + pi->rsrv_noflowq); - /* Direct transmission. 
*/ - rc = t4_eth_tx(ifp, txq, m); - if (rc != 0 && txq->m) - rc = 0; /* held, will be transmitted soon (hopefully) */ + items[0] = m; + rc = mp_ring_enqueue(txq->r, items, 1, 4096); + if (__predict_false(rc != 0)) + m_freem(m); - TXQ_UNLOCK(txq); return (rc); } @@ -1499,17 +1469,17 @@ cxgbe_qflush(struct ifnet *ifp) struct port_info *pi = ifp->if_softc; struct sge_txq *txq; int i; - struct mbuf *m; /* queues do not exist if !PORT_INIT_DONE. */ if (pi->flags & PORT_INIT_DONE) { for_each_txq(pi, i, txq) { TXQ_LOCK(txq); - m_freem(txq->m); - txq->m = NULL; - while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) - m_freem(m); + txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); + while (!mp_ring_is_idle(txq->r)) { + mp_ring_check_drainage(txq->r, 0); + pause("qflush", 1); + } } } if_qflush(ifp); @@ -1564,7 +1534,7 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c) struct sge_txq *txq; for_each_txq(pi, i, txq) - drops += txq->br->br_drops; + drops += counter_u64_fetch(txq->r->drops); } return (drops); @@ -3236,7 +3206,8 @@ cxgbe_init_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; - int rc = 0; + int rc = 0, i; + struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); @@ -3265,6 +3236,17 @@ cxgbe_init_synchronized(struct port_info *pi) } /* + * Can't fail from this point onwards. Review cxgbe_uninit_synchronized + * if this changes. + */ + + for_each_txq(pi, i, txq) { + TXQ_LOCK(txq); + txq->eq.flags |= EQ_ENABLED; + TXQ_UNLOCK(txq); + } + + /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0) { @@ -3297,7 +3279,8 @@ cxgbe_uninit_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; - int rc; + int rc, i; + struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); @@ -3314,6 +3297,12 @@ cxgbe_uninit_synchronized(struct port_info *pi) return (rc); } + for_each_txq(pi, i, txq) { + TXQ_LOCK(txq); + txq->eq.flags &= ~EQ_ENABLED; + TXQ_UNLOCK(txq); + } + clrbit(&sc->open_device_map, pi->port_id); PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; @@ -3543,15 +3532,17 @@ port_full_uninit(struct port_info *pi) if (pi->flags & PORT_INIT_DONE) { - /* Need to quiesce queues. XXX: ctrl queues? */ + /* Need to quiesce queues. */ + + quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(pi, i, txq) { - quiesce_eq(sc, &txq->eq); + quiesce_txq(sc, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { - quiesce_eq(sc, &ofld_txq->eq); + quiesce_wrq(sc, ofld_txq); } #endif @@ -3576,23 +3567,39 @@ port_full_uninit(struct port_info *pi) } static void -quiesce_eq(struct adapter *sc, struct sge_eq *eq) +quiesce_txq(struct adapter *sc, struct sge_txq *txq) { - EQ_LOCK(eq); - eq->flags |= EQ_DOOMED; + struct sge_eq *eq = &txq->eq; + struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; - /* - * Wait for the response to a credit flush if one's - * pending. - */ - while (eq->flags & EQ_CRFLUSHED) - mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0); - EQ_UNLOCK(eq); + (void) sc; /* unused */ + +#ifdef INVARIANTS + TXQ_LOCK(txq); + MPASS((eq->flags & EQ_ENABLED) == 0); + TXQ_UNLOCK(txq); +#endif + + /* Wait for the mp_ring to empty. */ + while (!mp_ring_is_idle(txq->r)) { + mp_ring_check_drainage(txq->r, 0); + pause("rquiesce", 1); + } - callout_drain(&eq->tx_callout); /* XXX: iffy */ - pause("callout", 10); /* Still iffy */ + /* Then wait for the hardware to finish. 
*/ + while (spg->cidx != htobe16(eq->pidx)) + pause("equiesce", 1); - taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task); + /* Finally, wait for the driver to reclaim all descriptors. */ + while (eq->cidx != eq->pidx) + pause("dquiesce", 1); +} + +static void +quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq) +{ + + /* XXXTX */ } static void @@ -4892,6 +4899,9 @@ cxgbe_sysctls(struct port_info *pi) oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD, + &pi->tx_parse_error, 0, + "# of tx packets with invalid length or # of segments"); #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \ @@ -6947,74 +6957,6 @@ sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) } #endif -static inline void -txq_start(struct ifnet *ifp, struct sge_txq *txq) -{ - struct buf_ring *br; - struct mbuf *m; - - TXQ_LOCK_ASSERT_OWNED(txq); - - br = txq->br; - m = txq->m ? txq->m : drbr_dequeue(ifp, br); - if (m) - t4_eth_tx(ifp, txq, m); -} - -void -t4_tx_callout(void *arg) -{ - struct sge_eq *eq = arg; - struct adapter *sc; - - if (EQ_TRYLOCK(eq) == 0) - goto reschedule; - - if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) { - EQ_UNLOCK(eq); -reschedule: - if (__predict_true(!(eq->flags && EQ_DOOMED))) - callout_schedule(&eq->tx_callout, 1); - return; - } - - EQ_LOCK_ASSERT_OWNED(eq); - - if (__predict_true((eq->flags & EQ_DOOMED) == 0)) { - - if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { - struct sge_txq *txq = arg; - struct port_info *pi = txq->ifp->if_softc; - - sc = pi->adapter; - } else { - struct sge_wrq *wrq = arg; - - sc = wrq->adapter; - } - - taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task); - } - - EQ_UNLOCK(eq); -} - -void -t4_tx_task(void *arg, int count) -{ - struct sge_eq *eq = arg; - - EQ_LOCK(eq); - if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { - struct sge_txq *txq = arg; - txq_start(txq->ifp, txq); - } else { - struct sge_wrq *wrq = arg; - t4_wrq_tx_locked(wrq->adapter, wrq, NULL); - } - EQ_UNLOCK(eq); -} - static uint32_t fconf_to_mode(uint32_t fconf) { @@ -7452,9 +7394,9 @@ static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; + struct wrq_cookie cookie; ASSERT_SYNCHRONIZED_OP(sc); @@ -7473,12 +7415,10 @@ set_filter_wr(struct adapter *sc, int fidx) ftid = sc->tids.ftid_base + fidx; - wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); - if (wr == NULL) + fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); + if (fwr == NULL) return (ENOMEM); - - fwr = wrtod(wr); - bzero(fwr, sizeof (*fwr)); + bzero(fwr, sizeof(*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); @@ -7547,7 +7487,7 @@ set_filter_wr(struct adapter *sc, int fidx) f->pending = 1; sc->tids.ftids_in_use++; - t4_wrq_tx(sc, wr); + commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } @@ -7555,22 +7495,21 @@ static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; + struct wrq_cookie cookie; ftid = sc->tids.ftid_base + fidx; - wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); - if (wr == NULL) + fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); + if (fwr == NULL) return (ENOMEM); - fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, 
fwr, sc->sge.fwq.abs_id); f->pending = 1; - t4_wrq_tx(sc, wr); + commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } @@ -8170,6 +8109,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); + pi->tx_parse_error = 0; if (pi->flags & PORT_INIT_DONE) { struct sge_rxq *rxq; @@ -8192,24 +8132,24 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; - txq->txpkts_wrs = 0; - txq->txpkts_pkts = 0; - txq->br->br_drops = 0; - txq->no_dmamap = 0; - txq->no_desc = 0; + txq->txpkts0_wrs = 0; + txq->txpkts1_wrs = 0; + txq->txpkts0_pkts = 0; + txq->txpkts1_pkts = 0; + mp_ring_reset_stats(txq->r); } #ifdef TCP_OFFLOAD /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(pi, i, wrq) { - wrq->tx_wrs = 0; - wrq->no_desc = 0; + wrq->tx_wrs_direct = 0; + wrq->tx_wrs_copied = 0; } #endif wrq = &sc->sge.ctrlq[pi->port_id]; - wrq->tx_wrs = 0; - wrq->no_desc = 0; + wrq->tx_wrs_direct = 0; + wrq->tx_wrs_copied = 0; } break; } diff --git a/sys/dev/cxgbe/t4_mp_ring.c b/sys/dev/cxgbe/t4_mp_ring.c new file mode 100644 index 0000000..ef09f01 --- /dev/null +++ b/sys/dev/cxgbe/t4_mp_ring.c @@ -0,0 +1,364 @@ +/*- + * Copyright (c) 2014 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <machine/cpu.h> + +#include "t4_mp_ring.h" + +union ring_state { + struct { + uint16_t pidx_head; + uint16_t pidx_tail; + uint16_t cidx; + uint16_t flags; + }; + uint64_t state; +}; + +enum { + IDLE = 0, /* consumer ran to completion, nothing more to do. */ + BUSY, /* consumer is running already, or will be shortly. */ + STALLED, /* consumer stopped due to lack of resources. */ + ABDICATED, /* consumer stopped even though there was work to be + done because it wants another thread to take over. 
*/ +}; + +static inline uint16_t +space_available(struct mp_ring *r, union ring_state s) +{ + uint16_t x = r->size - 1; + + if (s.cidx == s.pidx_head) + return (x); + else if (s.cidx > s.pidx_head) + return (s.cidx - s.pidx_head - 1); + else + return (x - s.pidx_head + s.cidx); +} + +static inline uint16_t +increment_idx(struct mp_ring *r, uint16_t idx, uint16_t n) +{ + int x = r->size - idx; + + MPASS(x > 0); + return (x > n ? idx + n : n - x); +} + +/* Consumer is about to update the ring's state to s */ +static inline uint16_t +state_to_flags(union ring_state s, int abdicate) +{ + + if (s.cidx == s.pidx_tail) + return (IDLE); + else if (abdicate && s.pidx_tail != s.pidx_head) + return (ABDICATED); + + return (BUSY); +} + +/* + * Caller passes in a state, with a guarantee that there is work to do and that + * all items up to the pidx_tail in the state are visible. + */ +static void +drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget) +{ + union ring_state ns; + int n, pending, total; + uint16_t cidx = os.cidx; + uint16_t pidx = os.pidx_tail; + + MPASS(os.flags == BUSY); + MPASS(cidx != pidx); + + if (prev == IDLE) + counter_u64_add(r->starts, 1); + pending = 0; + total = 0; + + while (cidx != pidx) { + + /* Items from cidx to pidx are available for consumption. */ + n = r->drain(r, cidx, pidx); + if (n == 0) { + critical_enter(); + do { + os.state = ns.state = r->state; + ns.cidx = cidx; + ns.flags = STALLED; + } while (atomic_cmpset_64(&r->state, os.state, + ns.state) == 0); + critical_exit(); + if (prev != STALLED) + counter_u64_add(r->stalls, 1); + else if (total > 0) { + counter_u64_add(r->restarts, 1); + counter_u64_add(r->stalls, 1); + } + break; + } + cidx = increment_idx(r, cidx, n); + pending += n; + total += n; + + /* + * We update the cidx only if we've caught up with the pidx, the + * real cidx is getting too far ahead of the one visible to + * everyone else, or we have exceeded our budget. + */ + if (cidx != pidx && pending < 64 && total < budget) + continue; + critical_enter(); + do { + os.state = ns.state = r->state; + ns.cidx = cidx; + ns.flags = state_to_flags(ns, total >= budget); + } while (atomic_cmpset_acq_64(&r->state, os.state, ns.state) == 0); + critical_exit(); + + if (ns.flags == ABDICATED) + counter_u64_add(r->abdications, 1); + if (ns.flags != BUSY) { + /* Wrong loop exit if we're going to stall. */ + MPASS(ns.flags != STALLED); + if (prev == STALLED) { + MPASS(total > 0); + counter_u64_add(r->restarts, 1); + } + break; + } + + /* + * The acquire style atomic above guarantees visibility of items + * associated with any pidx change that we notice here. 
+ */ + pidx = ns.pidx_tail; + pending = 0; + } +} + +int +mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain, + ring_can_drain_t can_drain, struct malloc_type *mt, int flags) +{ + struct mp_ring *r; + + /* All idx are 16b so size can be 65536 at most */ + if (pr == NULL || size < 2 || size > 65536 || drain == NULL || + can_drain == NULL) + return (EINVAL); + *pr = NULL; + flags &= M_NOWAIT | M_WAITOK; + MPASS(flags != 0); + + r = malloc(__offsetof(struct mp_ring, items[size]), mt, flags | M_ZERO); + if (r == NULL) + return (ENOMEM); + r->size = size; + r->cookie = cookie; + r->mt = mt; + r->drain = drain; + r->can_drain = can_drain; + r->enqueues = counter_u64_alloc(flags); + r->drops = counter_u64_alloc(flags); + r->starts = counter_u64_alloc(flags); + r->stalls = counter_u64_alloc(flags); + r->restarts = counter_u64_alloc(flags); + r->abdications = counter_u64_alloc(flags); + if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL || + r->stalls == NULL || r->restarts == NULL || + r->abdications == NULL) { + mp_ring_free(r); + return (ENOMEM); + } + + *pr = r; + return (0); +} + +void +mp_ring_free(struct mp_ring *r) +{ + + if (r == NULL) + return; + + if (r->enqueues != NULL) + counter_u64_free(r->enqueues); + if (r->drops != NULL) + counter_u64_free(r->drops); + if (r->starts != NULL) + counter_u64_free(r->starts); + if (r->stalls != NULL) + counter_u64_free(r->stalls); + if (r->restarts != NULL) + counter_u64_free(r->restarts); + if (r->abdications != NULL) + counter_u64_free(r->abdications); + + free(r, r->mt); +} + +/* + * Enqueue n items and maybe drain the ring for some time. + * + * Returns an errno. + */ +int +mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget) +{ + union ring_state os, ns; + uint16_t pidx_start, pidx_stop; + int i; + + MPASS(items != NULL); + MPASS(n > 0); + + /* + * Reserve room for the new items. Our reservation, if successful, is + * from 'pidx_start' to 'pidx_stop'. + */ + for (;;) { + os.state = r->state; + if (n >= space_available(r, os)) { + counter_u64_add(r->drops, n); + MPASS(os.flags != IDLE); + if (os.flags == STALLED) + mp_ring_check_drainage(r, 0); + return (ENOBUFS); + } + ns.state = os.state; + ns.pidx_head = increment_idx(r, os.pidx_head, n); + critical_enter(); + if (atomic_cmpset_64(&r->state, os.state, ns.state)) + break; + critical_exit(); + cpu_spinwait(); + } + pidx_start = os.pidx_head; + pidx_stop = ns.pidx_head; + + /* + * Wait for other producers who got in ahead of us to enqueue their + * items, one producer at a time. It is our turn when the ring's + * pidx_tail reaches the beginning of our reservation (pidx_start). + */ + while (ns.pidx_tail != pidx_start) { + cpu_spinwait(); + ns.state = r->state; + } + + /* Now it is our turn to fill up the area we reserved earlier. */ + i = pidx_start; + do { + r->items[i] = *items++; + if (__predict_false(++i == r->size)) + i = 0; + } while (i != pidx_stop); + + /* + * Update the ring's pidx_tail. The release style atomic guarantees + * that the items are visible to any thread that sees the updated pidx. + */ + do { + os.state = ns.state = r->state; + ns.pidx_tail = pidx_stop; + ns.flags = BUSY; + } while (atomic_cmpset_rel_64(&r->state, os.state, ns.state) == 0); + critical_exit(); + counter_u64_add(r->enqueues, n); + + /* + * Turn into a consumer if some other thread isn't active as a consumer + * already.
+ */ + if (os.flags != BUSY) + drain_ring(r, ns, os.flags, budget); + + return (0); +} + +void +mp_ring_check_drainage(struct mp_ring *r, int budget) +{ + union ring_state os, ns; + + os.state = r->state; + if (os.flags != STALLED || os.pidx_head != os.pidx_tail || + r->can_drain(r) == 0) + return; + + MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */ + ns.state = os.state; + ns.flags = BUSY; + + /* + * The acquire style atomic guarantees visibility of items associated + * with the pidx that we read here. + */ + if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state)) + return; + + drain_ring(r, ns, os.flags, budget); +} + +void +mp_ring_reset_stats(struct mp_ring *r) +{ + + counter_u64_zero(r->enqueues); + counter_u64_zero(r->drops); + counter_u64_zero(r->starts); + counter_u64_zero(r->stalls); + counter_u64_zero(r->restarts); + counter_u64_zero(r->abdications); +} + +int +mp_ring_is_idle(struct mp_ring *r) +{ + union ring_state s; + + s.state = r->state; + if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx && + s.flags == IDLE) + return (1); + + return (0); +} diff --git a/sys/dev/cxgbe/t4_mp_ring.h b/sys/dev/cxgbe/t4_mp_ring.h new file mode 100644 index 0000000..c9ee346 --- /dev/null +++ b/sys/dev/cxgbe/t4_mp_ring.h @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2014 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + +#ifndef __CXGBE_MP_RING_H +#define __CXGBE_MP_RING_H + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct mp_ring; +typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int); +typedef u_int (*ring_can_drain_t)(struct mp_ring *); + +struct mp_ring { + volatile uint64_t state __aligned(CACHE_LINE_SIZE); + + int size __aligned(CACHE_LINE_SIZE); + void * cookie; + struct malloc_type * mt; + ring_drain_t drain; + ring_can_drain_t can_drain; /* cheap, may be unreliable */ + counter_u64_t enqueues; + counter_u64_t drops; + counter_u64_t starts; + counter_u64_t stalls; + counter_u64_t restarts; /* recovered after stalling */ + counter_u64_t abdications; + + void * volatile items[] __aligned(CACHE_LINE_SIZE); +}; + +int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t, + ring_can_drain_t, struct malloc_type *, int); +void mp_ring_free(struct mp_ring *); +int mp_ring_enqueue(struct mp_ring *, void **, int, int); +void mp_ring_check_drainage(struct mp_ring *, int); +void mp_ring_reset_stats(struct mp_ring *); +int mp_ring_is_idle(struct mp_ring *); + +#endif diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 96e22cb..026b4ce 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -36,12 +36,12 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/kernel.h> -#include <sys/kdb.h> #include <sys/malloc.h> #include <sys/queue.h> #include <sys/sbuf.h> #include <sys/taskqueue.h> #include <sys/time.h> +#include <sys/sglist.h> #include <sys/sysctl.h> #include <sys/smp.h> #include <sys/counter.h> @@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" +#include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) @@ -147,19 +148,17 @@ TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); static int safest_rx_cluster = PAGE_SIZE; TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); -/* Used to track coalesced tx work request */ struct txpkts { - uint64_t *flitp; /* ptr to flit where next pkt should start */ - uint8_t npkt; /* # of packets in this work request */ - uint8_t nflits; /* # of flits used by this work request */ - uint16_t plen; /* total payload (sum of all packets) */ + u_int wr_type; /* type 0 or type 1 */ + u_int npkt; /* # of packets in this work request */ + u_int plen; /* total payload (sum of all packets) */ + u_int len16; /* # of 16B pieces used by this work request */ }; /* A packet's SGL. This + m_pkthdr has all info needed for tx */ struct sgl { - int nsegs; /* # of segments in the SGL, 0 means imm. 
tx */ - int nflits; /* # of flits needed for the SGL */ - bus_dma_segment_t seg[TX_SGL_SEGS]; + struct sglist sg; + struct sglist_seg seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); @@ -221,26 +220,31 @@ static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); -static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int); -static int free_pkt_sgl(struct sge_txq *, struct sgl *); -static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *, - struct sgl *); -static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *, - struct mbuf *, struct sgl *); -static void write_txpkts_wr(struct sge_txq *, struct txpkts *); -static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *, - struct txpkts *, struct mbuf *, struct sgl *); -static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *); +static inline void get_pkt_gl(struct mbuf *, struct sglist *); +static inline u_int txpkt_len16(u_int, u_int); +static inline u_int txpkts0_len16(u_int); +static inline u_int txpkts1_len16(void); +static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *, + struct mbuf *, u_int); +static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); +static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); +static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *, + struct mbuf *, const struct txpkts *, u_int); +static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); -static inline void ring_eq_db(struct adapter *, struct sge_eq *); -static inline int reclaimable(struct sge_eq *); -static int reclaim_tx_descs(struct sge_txq *, int, int); -static void write_eqflush_wr(struct sge_eq *); -static __be64 get_flit(bus_dma_segment_t *, int, int); +static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); +static inline uint16_t read_hw_cidx(struct sge_eq *); +static inline u_int reclaimable_tx_desc(struct sge_eq *); +static inline u_int total_available_tx_desc(struct sge_eq *); +static u_int reclaim_tx_descs(struct sge_txq *, u_int); +static void tx_reclaim(void *, int); +static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); +static void wrq_tx_drain(void *, int); +static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); @@ -1785,327 +1789,679 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) } /* + * Must drain the wrq or make sure that someone else will. 
+ */ +static void +wrq_tx_drain(void *arg, int n) +{ + struct sge_wrq *wrq = arg; + struct sge_eq *eq = &wrq->eq; + + EQ_LOCK(eq); + if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) + drain_wrq_wr_list(wrq->adapter, wrq); + EQ_UNLOCK(eq); +} + +static void +drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) +{ + struct sge_eq *eq = &wrq->eq; + u_int available, dbdiff; /* # of hardware descriptors */ + u_int n; + struct wrqe *wr; + struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ + + EQ_LOCK_ASSERT_OWNED(eq); + MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); + wr = STAILQ_FIRST(&wrq->wr_list); + MPASS(wr != NULL); /* Must be called with something useful to do */ + dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); + + do { + eq->cidx = read_hw_cidx(eq); + if (eq->pidx == eq->cidx) + available = eq->sidx - 1; + else + available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; + + MPASS(wr->wrq == wrq); + n = howmany(wr->wr_len, EQ_ESIZE); + if (available < n) + return; + + dst = (void *)&eq->desc[eq->pidx]; + if (__predict_true(eq->sidx - eq->pidx > n)) { + /* Won't wrap, won't end exactly at the status page. */ + bcopy(&wr->wr[0], dst, wr->wr_len); + eq->pidx += n; + } else { + int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; + + bcopy(&wr->wr[0], dst, first_portion); + if (wr->wr_len > first_portion) { + bcopy(&wr->wr[first_portion], &eq->desc[0], + wr->wr_len - first_portion); + } + eq->pidx = n - (eq->sidx - eq->pidx); + } + + if (available < eq->sidx / 4 && + atomic_cmpset_int(&eq->equiq, 0, 1)) { + dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | + F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { + dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } + + dbdiff += n; + if (dbdiff >= 16) { + ring_eq_db(sc, eq, dbdiff); + dbdiff = 0; + } + + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); + MPASS(wrq->nwr_pending > 0); + wrq->nwr_pending--; + MPASS(wrq->ndesc_needed >= n); + wrq->ndesc_needed -= n; + } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); + + if (dbdiff) + ring_eq_db(sc, eq, dbdiff); +} + +/* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { +#ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; - int can_reclaim; - caddr_t dst; +#endif + + EQ_LOCK_ASSERT_OWNED(eq); + MPASS(wr != NULL); + MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); + MPASS((wr->wr_len & 0x7) == 0); - TXQ_LOCK_ASSERT_OWNED(wrq); + STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); + wrq->nwr_pending++; + wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); + + if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) + return; /* commit_wrq_wr will drain wr_list as well. */ + + drain_wrq_wr_list(sc, wrq); + + /* Doorbell must have caught up to the pidx. 
*/ + MPASS(eq->pidx == eq->dbidx); +} + +void +t4_update_fl_bufsize(struct ifnet *ifp) +{ + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + struct sge_rxq *rxq; #ifdef TCP_OFFLOAD - KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD || - (eq->flags & EQ_TYPEMASK) == EQ_CTRL, - ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); -#else - KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL, - ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); + struct sge_ofld_rxq *ofld_rxq; #endif + struct sge_fl *fl; + int i, maxp, mtu = ifp->if_mtu; - if (__predict_true(wr != NULL)) - STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); + maxp = mtu_to_max_payload(sc, mtu, 0); + for_each_rxq(pi, i, rxq) { + fl = &rxq->fl; - can_reclaim = reclaimable(eq); - if (__predict_false(eq->flags & EQ_STALLED)) { - if (eq->avail + can_reclaim < tx_resume_threshold(eq)) - return; - eq->flags &= ~EQ_STALLED; - eq->unstalled++; + FL_LOCK(fl); + find_best_refill_source(sc, fl, maxp); + FL_UNLOCK(fl); } - eq->cidx += can_reclaim; - eq->avail += can_reclaim; - if (__predict_false(eq->cidx >= eq->cap)) - eq->cidx -= eq->cap; +#ifdef TCP_OFFLOAD + maxp = mtu_to_max_payload(sc, mtu, 1); + for_each_ofld_rxq(pi, i, ofld_rxq) { + fl = &ofld_rxq->fl; - while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { - int ndesc; + FL_LOCK(fl); + find_best_refill_source(sc, fl, maxp); + FL_UNLOCK(fl); + } +#endif +} - if (__predict_false(wr->wr_len < 0 || - wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) { +static inline int +mbuf_nsegs(struct mbuf *m) +{ -#ifdef INVARIANTS - panic("%s: work request with length %d", __func__, - wr->wr_len); -#endif -#ifdef KDB - kdb_backtrace(); -#endif - log(LOG_ERR, "%s: %s work request with length %d", - device_get_nameunit(sc->dev), __func__, wr->wr_len); - STAILQ_REMOVE_HEAD(&wrq->wr_list, link); - free_wrqe(wr); - continue; - } + M_ASSERTPKTHDR(m); + KASSERT(m->m_pkthdr.l5hlen > 0, + ("%s: mbuf %p missing information on # of segments.", __func__, m)); - ndesc = howmany(wr->wr_len, EQ_ESIZE); - if (eq->avail < ndesc) { - wrq->no_desc++; - break; - } + return (m->m_pkthdr.l5hlen); +} - dst = (void *)&eq->desc[eq->pidx]; - copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len); +static inline void +set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) +{ - eq->pidx += ndesc; - eq->avail -= ndesc; - if (__predict_false(eq->pidx >= eq->cap)) - eq->pidx -= eq->cap; + M_ASSERTPKTHDR(m); + m->m_pkthdr.l5hlen = nsegs; +} - eq->pending += ndesc; - if (eq->pending >= 8) - ring_eq_db(sc, eq); +static inline int +mbuf_len16(struct mbuf *m) +{ + int n; - wrq->tx_wrs++; - STAILQ_REMOVE_HEAD(&wrq->wr_list, link); - free_wrqe(wr); + M_ASSERTPKTHDR(m); + n = m->m_pkthdr.PH_loc.eight[0]; + MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); - if (eq->avail < 8) { - can_reclaim = reclaimable(eq); - eq->cidx += can_reclaim; - eq->avail += can_reclaim; - if (__predict_false(eq->cidx >= eq->cap)) - eq->cidx -= eq->cap; - } - } + return (n); +} + +static inline void +set_mbuf_len16(struct mbuf *m, uint8_t len16) +{ - if (eq->pending) - ring_eq_db(sc, eq); + M_ASSERTPKTHDR(m); + m->m_pkthdr.PH_loc.eight[0] = len16; +} + +static inline int +needs_tso(struct mbuf *m) +{ - if (wr != NULL) { - eq->flags |= EQ_STALLED; - if (callout_pending(&eq->tx_callout) == 0) - callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); + M_ASSERTPKTHDR(m); + + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + KASSERT(m->m_pkthdr.tso_segsz > 0, + ("%s: TSO requested in mbuf %p but MSS not provided", + __func__, m)); + return (1); } + + return (0); } -/* Per-packet 
header in a coalesced tx WR, before the SGL starts (in flits) */ -#define TXPKTS_PKT_HDR ((\ - sizeof(struct ulp_txpkt) + \ - sizeof(struct ulptx_idata) + \ - sizeof(struct cpl_tx_pkt_core) \ - ) / 8) - -/* Header of a coalesced tx WR, before SGL of first packet (in flits) */ -#define TXPKTS_WR_HDR (\ - sizeof(struct fw_eth_tx_pkts_wr) / 8 + \ - TXPKTS_PKT_HDR) - -/* Header of a tx WR, before SGL of first packet (in flits) */ -#define TXPKT_WR_HDR ((\ - sizeof(struct fw_eth_tx_pkt_wr) + \ - sizeof(struct cpl_tx_pkt_core) \ - ) / 8 ) - -/* Header of a tx LSO WR, before SGL of first packet (in flits) */ -#define TXPKT_LSO_WR_HDR ((\ - sizeof(struct fw_eth_tx_pkt_wr) + \ - sizeof(struct cpl_tx_pkt_lso_core) + \ - sizeof(struct cpl_tx_pkt_core) \ - ) / 8 ) +static inline int +needs_l3_csum(struct mbuf *m) +{ + + M_ASSERTPKTHDR(m); -int -t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m) + if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) + return (1); + return (0); +} + +static inline int +needs_l4_csum(struct mbuf *m) { - struct port_info *pi = (void *)ifp->if_softc; - struct adapter *sc = pi->adapter; - struct sge_eq *eq = &txq->eq; - struct buf_ring *br = txq->br; - struct mbuf *next; - int rc, coalescing, can_reclaim; - struct txpkts txpkts; - struct sgl sgl; - TXQ_LOCK_ASSERT_OWNED(txq); - KASSERT(m, ("%s: called with nothing to do.", __func__)); - KASSERT((eq->flags & EQ_TYPEMASK) == EQ_ETH, - ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); + M_ASSERTPKTHDR(m); - prefetch(&eq->desc[eq->pidx]); - prefetch(&txq->sdesc[eq->pidx]); + if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | + CSUM_TCP_IPV6 | CSUM_TSO)) + return (1); + return (0); +} - txpkts.npkt = 0;/* indicates there's nothing in txpkts */ - coalescing = 0; +static inline int +needs_vlan_insertion(struct mbuf *m) +{ - can_reclaim = reclaimable(eq); - if (__predict_false(eq->flags & EQ_STALLED)) { - if (eq->avail + can_reclaim < tx_resume_threshold(eq)) { - txq->m = m; - return (0); - } - eq->flags &= ~EQ_STALLED; - eq->unstalled++; - } + M_ASSERTPKTHDR(m); - if (__predict_false(eq->flags & EQ_DOOMED)) { - m_freem(m); - while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) - m_freem(m); - return (ENETDOWN); + if (m->m_flags & M_VLANTAG) { + KASSERT(m->m_pkthdr.ether_vtag != 0, + ("%s: HWVLAN requested in mbuf %p but tag not provided", + __func__, m)); + return (1); } + return (0); +} - if (eq->avail < 8 && can_reclaim) - reclaim_tx_descs(txq, can_reclaim, 32); +static void * +m_advance(struct mbuf **pm, int *poffset, int len) +{ + struct mbuf *m = *pm; + int offset = *poffset; + uintptr_t p = 0; - for (; m; m = next ? 
next : drbr_dequeue(ifp, br)) { + MPASS(len > 0); - if (eq->avail < 8) + while (len) { + if (offset + len < m->m_len) { + offset += len; + p = mtod(m, uintptr_t) + offset; break; + } + len -= m->m_len - offset; + m = m->m_next; + offset = 0; + MPASS(m != NULL); + } + *poffset = offset; + *pm = m; + return ((void *)p); +} - next = m->m_nextpkt; - m->m_nextpkt = NULL; +static inline int +same_paddr(char *a, char *b) +{ - if (next || buf_ring_peek(br)) - coalescing = 1; + if (a == b) + return (1); + else if (a != NULL && b != NULL) { + vm_offset_t x = (vm_offset_t)a; + vm_offset_t y = (vm_offset_t)b; - rc = get_pkt_sgl(txq, &m, &sgl, coalescing); - if (rc != 0) { - if (rc == ENOMEM) { + if ((x & PAGE_MASK) == (y & PAGE_MASK) && + pmap_kextract(x) == pmap_kextract(y)) + return (1); + } - /* Short of resources, suspend tx */ + return (0); +} - m->m_nextpkt = next; - break; - } +/* + * Can deal with empty mbufs in the chain that have m_len = 0, but the chain + * must have at least one mbuf that's not empty. + */ +static inline int +count_mbuf_nsegs(struct mbuf *m) +{ + char *prev_end, *start; + int len, nsegs; - /* - * Unrecoverable error for this packet, throw it away - * and move on to the next. get_pkt_sgl may already - * have freed m (it will be NULL in that case and the - * m_freem here is still safe). - */ + MPASS(m != NULL); - m_freem(m); + nsegs = 0; + prev_end = NULL; + for (; m; m = m->m_next) { + + len = m->m_len; + if (__predict_false(len == 0)) continue; - } + start = mtod(m, char *); - if (coalescing && - add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) { + nsegs += sglist_count(start, len); + if (same_paddr(prev_end, start)) + nsegs--; + prev_end = start + len; + } - /* Successfully absorbed into txpkts */ + MPASS(nsegs > 0); + return (nsegs); +} - write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl); - goto doorbell; +/* + * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: + * a) caller can assume it's been freed if this function returns with an error. + * b) it may get defragged up if the gather list is too long for the hardware. + */ +int +parse_pkt(struct mbuf **mp) +{ + struct mbuf *m0 = *mp, *m; + int rc, nsegs, defragged = 0, offset; + struct ether_header *eh; + void *l3hdr; +#if defined(INET) || defined(INET6) + struct tcphdr *tcp; +#endif + uint16_t eh_type; + + M_ASSERTPKTHDR(m0); + if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { + rc = EINVAL; +fail: + m_freem(m0); + *mp = NULL; + return (rc); + } +restart: + /* + * First count the number of gather list segments in the payload. + * Defrag the mbuf if nsegs exceeds the hardware limit. + */ + M_ASSERTPKTHDR(m0); + MPASS(m0->m_pkthdr.len > 0); + nsegs = count_mbuf_nsegs(m0); + if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { + if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { + rc = EFBIG; + goto fail; } + *mp = m0 = m; /* update caller's copy after defrag */ + goto restart; + } - /* - * We weren't coalescing to begin with, or current frame could - * not be coalesced (add_to_txpkts flushes txpkts if a frame - * given to it can't be coalesced). Either way there should be - * nothing in txpkts. - */ - KASSERT(txpkts.npkt == 0, - ("%s: txpkts not empty: %d", __func__, txpkts.npkt)); + if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { + m0 = m_pullup(m0, m0->m_pkthdr.len); + if (m0 == NULL) { + /* Should have left well enough alone. 
*/ + rc = EFBIG; + goto fail; + } + *mp = m0; /* update caller's copy after pullup */ + goto restart; + } + set_mbuf_nsegs(m0, nsegs); + set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); - /* We're sending out individual packets now */ - coalescing = 0; + if (!needs_tso(m0)) + return (0); - if (eq->avail < 8) - reclaim_tx_descs(txq, 0, 8); - rc = write_txpkt_wr(pi, txq, m, &sgl); - if (rc != 0) { + m = m0; + eh = mtod(m, struct ether_header *); + eh_type = ntohs(eh->ether_type); + if (eh_type == ETHERTYPE_VLAN) { + struct ether_vlan_header *evh = (void *)eh; - /* Short of hardware descriptors, suspend tx */ + eh_type = ntohs(evh->evl_proto); + m0->m_pkthdr.l2hlen = sizeof(*evh); + } else + m0->m_pkthdr.l2hlen = sizeof(*eh); - /* - * This is an unlikely but expensive failure. We've - * done all the hard work (DMA mappings etc.) and now we - * can't send out the packet. What's worse, we have to - * spend even more time freeing up everything in sgl. - */ - txq->no_desc++; - free_pkt_sgl(txq, &sgl); + offset = 0; + l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); - m->m_nextpkt = next; - break; - } + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6 = l3hdr; - ETHER_BPF_MTAP(ifp, m); - if (sgl.nsegs == 0) - m_freem(m); -doorbell: - if (eq->pending >= 8) - ring_eq_db(sc, eq); + MPASS(ip6->ip6_nxt == IPPROTO_TCP); - can_reclaim = reclaimable(eq); - if (can_reclaim >= 32) - reclaim_tx_descs(txq, can_reclaim, 64); + m0->m_pkthdr.l3hlen = sizeof(*ip6); + break; } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip = l3hdr; - if (txpkts.npkt > 0) - write_txpkts_wr(txq, &txpkts); + m0->m_pkthdr.l3hlen = ip->ip_hl * 4; + break; + } +#endif + default: + panic("%s: ethertype 0x%04x unknown. if_cxgbe must be compiled" + " with the same INET/INET6 options as the kernel.", + __func__, eh_type); + } - /* - * m not NULL means there was an error but we haven't thrown it away. - * This can happen when we're short of tx descriptors (no_desc) or maybe - * even DMA maps (no_dmamap). Either way, a credit flush and reclaim - * will get things going again. - */ - if (m && !(eq->flags & EQ_CRFLUSHED)) { - struct tx_sdesc *txsd = &txq->sdesc[eq->pidx]; +#if defined(INET) || defined(INET6) + tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); + m0->m_pkthdr.l4hlen = tcp->th_off * 4; +#endif + MPASS(m0 == *mp); + return (0); +} - /* - * If EQ_CRFLUSHED is not set then we know we have at least one - * available descriptor because any WR that reduces eq->avail to - * 0 also sets EQ_CRFLUSHED. 
- */ - KASSERT(eq->avail > 0, ("%s: no space for eqflush.", __func__)); +void * +start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) +{ + struct sge_eq *eq = &wrq->eq; + struct adapter *sc = wrq->adapter; + int ndesc, available; + struct wrqe *wr; + void *w; - txsd->desc_used = 1; - txsd->credits = 0; - write_eqflush_wr(eq); - } - txq->m = m; + MPASS(len16 > 0); + ndesc = howmany(len16, EQ_ESIZE / 16); + MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); - if (eq->pending) - ring_eq_db(sc, eq); + EQ_LOCK(eq); - reclaim_tx_descs(txq, 0, 128); + if (!STAILQ_EMPTY(&wrq->wr_list)) + drain_wrq_wr_list(sc, wrq); - if (eq->flags & EQ_STALLED && callout_pending(&eq->tx_callout) == 0) - callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); + if (!STAILQ_EMPTY(&wrq->wr_list)) { +slowpath: + EQ_UNLOCK(eq); + wr = alloc_wrqe(len16 * 16, wrq); + if (__predict_false(wr == NULL)) + return (NULL); + cookie->pidx = -1; + cookie->ndesc = ndesc; + return (&wr->wr); + } - return (0); + eq->cidx = read_hw_cidx(eq); + if (eq->pidx == eq->cidx) + available = eq->sidx - 1; + else + available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; + if (available < ndesc) + goto slowpath; + + cookie->pidx = eq->pidx; + cookie->ndesc = ndesc; + TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); + + w = &eq->desc[eq->pidx]; + IDXINCR(eq->pidx, ndesc, eq->sidx); + if (__predict_false(eq->pidx < ndesc - 1)) { + w = &wrq->ss[0]; + wrq->ss_pidx = cookie->pidx; + wrq->ss_len = len16 * 16; + } + + EQ_UNLOCK(eq); + + return (w); } void -t4_update_fl_bufsize(struct ifnet *ifp) +commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { - struct port_info *pi = ifp->if_softc; - struct adapter *sc = pi->adapter; - struct sge_rxq *rxq; -#ifdef TCP_OFFLOAD - struct sge_ofld_rxq *ofld_rxq; -#endif - struct sge_fl *fl; - int i, maxp, mtu = ifp->if_mtu; + struct sge_eq *eq = &wrq->eq; + struct adapter *sc = wrq->adapter; + int ndesc, pidx; + struct wrq_cookie *prev, *next; - maxp = mtu_to_max_payload(sc, mtu, 0); - for_each_rxq(pi, i, rxq) { - fl = &rxq->fl; + if (cookie->pidx == -1) { + struct wrqe *wr = __containerof(w, struct wrqe, wr); - FL_LOCK(fl); - find_best_refill_source(sc, fl, maxp); - FL_UNLOCK(fl); + t4_wrq_tx(sc, wr); + return; } -#ifdef TCP_OFFLOAD - maxp = mtu_to_max_payload(sc, mtu, 1); - for_each_ofld_rxq(pi, i, ofld_rxq) { - fl = &ofld_rxq->fl; - FL_LOCK(fl); - find_best_refill_source(sc, fl, maxp); - FL_UNLOCK(fl); + ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ + pidx = cookie->pidx; + MPASS(pidx >= 0 && pidx < eq->sidx); + if (__predict_false(w == &wrq->ss[0])) { + int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; + + MPASS(wrq->ss_len > n); /* WR had better wrap around. 
*/ + bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); + bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); + wrq->tx_wrs_ss++; + } else + wrq->tx_wrs_direct++; + + EQ_LOCK(eq); + prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); + next = TAILQ_NEXT(cookie, link); + if (prev == NULL) { + MPASS(pidx == eq->dbidx); + if (next == NULL || ndesc >= 16) + ring_eq_db(wrq->adapter, eq, ndesc); + else { + MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); + next->pidx = pidx; + next->ndesc += ndesc; + } + } else { + MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); + prev->ndesc += ndesc; + } + TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); + + if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) + drain_wrq_wr_list(sc, wrq); + +#ifdef INVARIANTS + if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { + /* Doorbell must have caught up to the pidx. */ + MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif + EQ_UNLOCK(eq); } -int -can_resume_tx(struct sge_eq *eq) +static u_int +can_resume_eth_tx(struct mp_ring *r) +{ + struct sge_eq *eq = r->cookie; + + return (total_available_tx_desc(eq) > eq->sidx / 8); +} + +static inline int +cannot_use_txpkts(struct mbuf *m) +{ + /* maybe put a GL limit too, to avoid silliness? */ + + return (needs_tso(m)); +} + +/* + * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to + * be consumed. Return the actual number consumed. 0 indicates a stall. + */ +static u_int +eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) { + struct sge_txq *txq = r->cookie; + struct sge_eq *eq = &txq->eq; + struct ifnet *ifp = txq->ifp; + struct port_info *pi = (void *)ifp->if_softc; + struct adapter *sc = pi->adapter; + u_int total, remaining; /* # of packets */ + u_int available, dbdiff; /* # of hardware descriptors */ + u_int n, next_cidx; + struct mbuf *m0, *tail; + struct txpkts txp; + struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ + + remaining = IDXDIFF(pidx, cidx, r->size); + MPASS(remaining > 0); /* Must not be called without work to do. */ + total = 0; + + TXQ_LOCK(txq); + if (__predict_false((eq->flags & EQ_ENABLED) == 0)) { + while (cidx != pidx) { + m0 = r->items[cidx]; + m_freem(m0); + if (++cidx == r->size) + cidx = 0; + } + reclaim_tx_descs(txq, 2048); + total = remaining; + goto done; + } + + /* How many hardware descriptors do we have readily available. */ + if (eq->pidx == eq->cidx) + available = eq->sidx - 1; + else + available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; + dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); + + while (remaining > 0) { + + m0 = r->items[cidx]; + M_ASSERTPKTHDR(m0); + MPASS(m0->m_nextpkt == NULL); + + if (available < SGE_MAX_WR_NDESC) { + available += reclaim_tx_descs(txq, 64); + if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) + break; /* out of descriptors */ + } - return (eq->avail + reclaimable(eq) >= tx_resume_threshold(eq)); + next_cidx = cidx + 1; + if (__predict_false(next_cidx == r->size)) + next_cidx = 0; + + wr = (void *)&eq->desc[eq->pidx]; + if (remaining > 1 && + try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { + + /* pkts at cidx, next_cidx should both be in txp. 
*/ + MPASS(txp.npkt == 2); + tail = r->items[next_cidx]; + MPASS(tail->m_nextpkt == NULL); + ETHER_BPF_MTAP(ifp, m0); + ETHER_BPF_MTAP(ifp, tail); + m0->m_nextpkt = tail; + + if (__predict_false(++next_cidx == r->size)) + next_cidx = 0; + + while (next_cidx != pidx) { + if (add_to_txpkts(r->items[next_cidx], &txp, + available) != 0) + break; + tail->m_nextpkt = r->items[next_cidx]; + tail = tail->m_nextpkt; + ETHER_BPF_MTAP(ifp, tail); + if (__predict_false(++next_cidx == r->size)) + next_cidx = 0; + } + + n = write_txpkts_wr(txq, wr, m0, &txp, available); + total += txp.npkt; + remaining -= txp.npkt; + } else { + total++; + remaining--; + n = write_txpkt_wr(txq, (void *)wr, m0, available); + ETHER_BPF_MTAP(ifp, m0); + } + MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC); + + available -= n; + dbdiff += n; + IDXINCR(eq->pidx, n, eq->sidx); + + if (total_available_tx_desc(eq) < eq->sidx / 4 && + atomic_cmpset_int(&eq->equiq, 0, 1)) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | + F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } + + if (dbdiff >= 16 && remaining >= 4) { + ring_eq_db(sc, eq, dbdiff); + available += reclaim_tx_descs(txq, 4 * dbdiff); + dbdiff = 0; + } + + cidx = next_cidx; + } + if (dbdiff != 0) { + ring_eq_db(sc, eq, dbdiff); + reclaim_tx_descs(txq, 32); + } +done: + TXQ_UNLOCK(txq); + + return (total); } static inline void @@ -2155,11 +2511,8 @@ init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; - eq->qsize = qsize; + eq->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); - - TASK_INIT(&eq->tx_task, 0, t4_tx_task, eq); - callout_init(&eq->tx_callout, CALLOUT_MPSAFE); } static int @@ -2848,6 +3201,7 @@ ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; + int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); @@ -2856,17 +3210,16 @@ ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); - c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); /* XXX */ + c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = - htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | + htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | - V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_CTRL_CMD_EQSIZE(eq->qsize)); + V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); @@ -2892,6 +3245,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; + int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); @@ -2900,15 +3254,15 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); - c.autoequiqe_to_viid = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid)); + c.autoequiqe_to_viid = 
htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | + F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(pi->viid)); c.fetchszm_to_iqid = - htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | + htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | - V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | - V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize)); + V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | + V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); @@ -2935,6 +3289,7 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; + int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); @@ -2944,14 +3299,13 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = - htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | + htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | - V_FW_EQ_OFLD_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_OFLD_CMD_EQSIZE(eq->qsize)); + V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); @@ -2976,21 +3330,20 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) static int alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { - int rc; + int rc, qsize; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); - len = eq->qsize * EQ_ESIZE; + qsize = eq->sidx + spg_len / EQ_ESIZE; + len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); - eq->cap = eq->qsize - spg_len / EQ_ESIZE; - eq->spg = (void *)&eq->desc[eq->cap]; - eq->avail = eq->cap - 1; /* one less to avoid cidx = pidx */ eq->pidx = eq->cidx = 0; + eq->equeqidx = eq->dbidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { @@ -3018,8 +3371,6 @@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) eq->flags & EQ_TYPEMASK, rc); } - eq->tx_callout.c_cpu = eq->cntxt_id % mp_ncpus; - if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { @@ -3101,7 +3452,11 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, return (rc); wrq->adapter = sc; + TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); + TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); + wrq->nwr_pending = 0; + wrq->ndesc_needed = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); @@ -3111,13 +3466,10 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); - SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs", CTLFLAG_RD, - &wrq->tx_wrs, "# of work requests"); - SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD, - 
&wrq->no_desc, 0, - "# of times queue ran out of hardware descriptors"); - SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD, - &wrq->eq.unstalled, 0, "# of times queue recovered after stall"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, + &wrq->tx_wrs_direct, "# of work requests (direct)"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, + &wrq->tx_wrs_copied, "# of work requests (copied)"); return (rc); } @@ -3145,37 +3497,30 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx, char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); - rc = alloc_eq(sc, pi, eq); - if (rc) - return (rc); - - txq->ifp = pi->ifp; - - txq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE, - M_ZERO | M_WAITOK); - txq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock); - - rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR, - BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS, - BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &txq->tx_tag); + rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, + M_CXGBE, M_WAITOK); if (rc != 0) { - device_printf(sc->dev, - "failed to create tx DMA tag: %d\n", rc); + device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); } - /* - * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE - * limit for any WR). txq->no_dmamap events shouldn't occur if maps is - * sized for the worst case. - */ - rc = t4_alloc_tx_maps(&txq->txmaps, txq->tx_tag, eq->qsize * 10 / 8, - M_WAITOK); + rc = alloc_eq(sc, pi, eq); if (rc != 0) { - device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc); + mp_ring_free(txq->r); + txq->r = NULL; return (rc); } + /* Can't fail after this point. 
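+	 * Everything below is either an M_WAITOK allocation or plain
+	 * initialization, so no unwind path is needed once the mp_ring and
+	 * the hardware eq have been set up.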
*/ + + TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); + txq->ifp = pi->ifp; + txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); + txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | + V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf)); + txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, + M_ZERO | M_WAITOK); + snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); @@ -3203,23 +3548,39 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); - SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD, - &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)"); - SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD, - &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests"); - - SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "br_drops", CTLFLAG_RD, - &txq->br->br_drops, "# of drops in the buf_ring for this queue"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD, - &txq->no_dmamap, 0, "# of times txq ran out of DMA maps"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD, - &txq->no_desc, 0, "# of times txq ran out of hardware descriptors"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD, - &eq->egr_update, 0, "egress update notifications from the SGE"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD, - &eq->unstalled, 0, "# of times txq recovered after stall"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_wrs", + CTLFLAG_RD, &txq->txpkts0_wrs, + "# of txpkts (type 0) work requests"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_wrs", + CTLFLAG_RD, &txq->txpkts1_wrs, + "# of txpkts (type 1) work requests"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_pkts", + CTLFLAG_RD, &txq->txpkts0_pkts, + "# of frames tx'd using type0 txpkts work requests"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_pkts", + CTLFLAG_RD, &txq->txpkts1_pkts, + "# of frames tx'd using type1 txpkts work requests"); + + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_enqueues", + CTLFLAG_RD, &txq->r->enqueues, + "# of enqueues to the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_drops", + CTLFLAG_RD, &txq->r->drops, + "# of drops in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_starts", + CTLFLAG_RD, &txq->r->starts, + "# of normal consumer starts in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_stalls", + CTLFLAG_RD, &txq->r->stalls, + "# of consumer stalls in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_restarts", + CTLFLAG_RD, &txq->r->restarts, + "# of consumer restarts in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_abdications", + CTLFLAG_RD, &txq->r->abdications, + "# of consumer abdications in the mp_ring for this queue"); - return (rc); + return (0); } static int @@ -3233,15 +3594,9 @@ free_txq(struct port_info *pi, struct sge_txq *txq) if (rc) return (rc); + sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); - - if (txq->txmaps.maps) - t4_free_tx_maps(&txq->txmaps, txq->tx_tag); - - buf_ring_free(txq->br, M_CXGBE); - - if 
(txq->tx_tag) - bus_dma_tag_destroy(txq->tx_tag); + mp_ring_free(txq->r); bzero(txq, sizeof(*txq)); return (0); @@ -3466,293 +3821,159 @@ free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) fl->sdesc = NULL; } -int -t4_alloc_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag, int count, - int flags) +static inline void +get_pkt_gl(struct mbuf *m, struct sglist *gl) { - struct tx_map *txm; - int i, rc; - - txmaps->map_total = txmaps->map_avail = count; - txmaps->map_cidx = txmaps->map_pidx = 0; - - txmaps->maps = malloc(count * sizeof(struct tx_map), M_CXGBE, - M_ZERO | flags); + int rc; - txm = txmaps->maps; - for (i = 0; i < count; i++, txm++) { - rc = bus_dmamap_create(tx_tag, 0, &txm->map); - if (rc != 0) - goto failed; - } + M_ASSERTPKTHDR(m); - return (0); -failed: - while (--i >= 0) { - txm--; - bus_dmamap_destroy(tx_tag, txm->map); + sglist_reset(gl); + rc = sglist_append_mbuf(gl, m); + if (__predict_false(rc != 0)) { + panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " + "with %d.", __func__, m, mbuf_nsegs(m), rc); } - KASSERT(txm == txmaps->maps, ("%s: EDOOFUS", __func__)); - free(txmaps->maps, M_CXGBE); - txmaps->maps = NULL; - - return (rc); + KASSERT(gl->sg_nseg == mbuf_nsegs(m), + ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, + mbuf_nsegs(m), gl->sg_nseg)); + KASSERT(gl->sg_nseg > 0 && + gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), + ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, + gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); } -void -t4_free_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag) +/* + * len16 for a txpkt WR with a GL. Includes the firmware work request header. + */ +static inline u_int +txpkt_len16(u_int nsegs, u_int tso) { - struct tx_map *txm; - int i; + u_int n; - txm = txmaps->maps; - for (i = 0; i < txmaps->map_total; i++, txm++) { - - if (txm->m) { - bus_dmamap_unload(tx_tag, txm->map); - m_freem(txm->m); - txm->m = NULL; - } + MPASS(nsegs > 0); - bus_dmamap_destroy(tx_tag, txm->map); - } + nsegs--; /* first segment is part of ulptx_sgl */ + n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); + if (tso) + n += sizeof(struct cpl_tx_pkt_lso_core); - free(txmaps->maps, M_CXGBE); - txmaps->maps = NULL; + return (howmany(n, 16)); } /* - * We'll do immediate data tx for non-TSO, but only when not coalescing. We're - * willing to use upto 2 hardware descriptors which means a maximum of 96 bytes - * of immediate data. - */ -#define IMM_LEN ( \ - 2 * EQ_ESIZE \ - - sizeof(struct fw_eth_tx_pkt_wr) \ - - sizeof(struct cpl_tx_pkt_core)) - -/* - * Returns non-zero on failure, no need to cleanup anything in that case. - * - * Note 1: We always try to defrag the mbuf if required and return EFBIG only - * if the resulting chain still won't fit in a tx descriptor. - * - * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf - * does not have the TCP header in it. + * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work + * request header. 
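+ * For example, assuming the usual t4 header sizes (8-byte ulp_txpkt and
+ * ulptx_idata, 16-byte cpl_tx_pkt_core and ulptx_sgl), a 2-segment packet
+ * works out to 8 + 8 + 16 + 16 + 8 * ((3 * 1) / 2 + 1) = 64 bytes,
+ * i.e. txpkts0_len16(2) == 4.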
*/ -static int -get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl, - int sgl_only) +static inline u_int +txpkts0_len16(u_int nsegs) { - struct mbuf *m = *fp; - struct tx_maps *txmaps; - struct tx_map *txm; - int rc, defragged = 0, n; - - TXQ_LOCK_ASSERT_OWNED(txq); - - if (m->m_pkthdr.tso_segsz) - sgl_only = 1; /* Do not allow immediate data with LSO */ + u_int n; -start: sgl->nsegs = 0; + MPASS(nsegs > 0); - if (m->m_pkthdr.len <= IMM_LEN && !sgl_only) - return (0); /* nsegs = 0 tells caller to use imm. tx */ - - txmaps = &txq->txmaps; - if (txmaps->map_avail == 0) { - txq->no_dmamap++; - return (ENOMEM); - } - txm = &txmaps->maps[txmaps->map_pidx]; + nsegs--; /* first segment is part of ulptx_sgl */ + n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); - if (m->m_pkthdr.tso_segsz && m->m_len < 50) { - *fp = m_pullup(m, 50); - m = *fp; - if (m == NULL) - return (ENOBUFS); - } - - rc = bus_dmamap_load_mbuf_sg(txq->tx_tag, txm->map, m, sgl->seg, - &sgl->nsegs, BUS_DMA_NOWAIT); - if (rc == EFBIG && defragged == 0) { - m = m_defrag(m, M_NOWAIT); - if (m == NULL) - return (EFBIG); - - defragged = 1; - *fp = m; - goto start; - } - if (rc != 0) - return (rc); - - txm->m = m; - txmaps->map_avail--; - if (++txmaps->map_pidx == txmaps->map_total) - txmaps->map_pidx = 0; - - KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS, - ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs)); - - /* - * Store the # of flits required to hold this frame's SGL in nflits. An - * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by - * multiple (len0 + len1, addr0, addr1) tuples. If addr1 is not used - * then len1 must be set to 0. - */ - n = sgl->nsegs - 1; - sgl->nflits = (3 * n) / 2 + (n & 1) + 2; - - return (0); + return (howmany(n, 16)); } - /* - * Releases all the txq resources used up in the specified sgl. + * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work + * request header. */ -static int -free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl) +static inline u_int +txpkts1_len16(void) { - struct tx_maps *txmaps; - struct tx_map *txm; + u_int n; - TXQ_LOCK_ASSERT_OWNED(txq); - - if (sgl->nsegs == 0) - return (0); /* didn't use any map */ + n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); - txmaps = &txq->txmaps; - - /* 1 pkt uses exactly 1 map, back it out */ + return (howmany(n, 16)); +} - txmaps->map_avail++; - if (txmaps->map_pidx > 0) - txmaps->map_pidx--; - else - txmaps->map_pidx = txmaps->map_total - 1; +static inline u_int +imm_payload(u_int ndesc) +{ + u_int n; - txm = &txmaps->maps[txmaps->map_pidx]; - bus_dmamap_unload(txq->tx_tag, txm->map); - txm->m = NULL; + n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - + sizeof(struct cpl_tx_pkt_core); - return (0); + return (n); } -static int -write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, - struct sgl *sgl) +/* + * Write a txpkt WR for this packet to the hardware descriptors, update the + * software descriptor, and advance the pidx. It is guaranteed that enough + * descriptors are available. + * + * The return value is the # of hardware descriptors used. 
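+ *
+ * Small frames go out as immediate data: with the 16-byte
+ * fw_eth_tx_pkt_wr and 16-byte cpl_tx_pkt_core, imm_payload(2) is
+ * 2 * EQ_ESIZE - 32 = 96 bytes, the same limit as the old IMM_LEN.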
+ */ +static u_int +write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr, + struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; - struct fw_eth_tx_pkt_wr *wr; + struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; - int nflits, ndesc, pktlen; - struct tx_sdesc *txsd; + int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); + M_ASSERTPKTHDR(m0); + MPASS(available > 0 && available < eq->sidx); - pktlen = m->m_pkthdr.len; - - /* - * Do we have enough flits to send this frame out? - */ + len16 = mbuf_len16(m0); + nsegs = mbuf_nsegs(m0); + pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); - if (m->m_pkthdr.tso_segsz) { - nflits = TXPKT_LSO_WR_HDR; + if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); - } else - nflits = TXPKT_WR_HDR; - if (sgl->nsegs > 0) - nflits += sgl->nflits; - else { - nflits += howmany(pktlen, 8); + else if (pktlen <= imm_payload(2) && available >= 2) { + /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; + len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); + nsegs = 0; } - ndesc = howmany(nflits, 8); - if (ndesc > eq->avail) - return (ENOMEM); + ndesc = howmany(len16, EQ_ESIZE / 16); + MPASS(ndesc <= available); /* Firmware work request header */ - wr = (void *)&eq->desc[eq->pidx]; + MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); - ctrl = V_FW_WR_LEN16(howmany(nflits, 2)); - if (eq->avail == ndesc) { - if (!(eq->flags & EQ_CRFLUSHED)) { - ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; - eq->flags |= EQ_CRFLUSHED; - } - eq->flags |= EQ_STALLED; - } + ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; - if (m->m_pkthdr.tso_segsz) { + if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); - struct ether_header *eh; - void *l3hdr; -#if defined(INET) || defined(INET6) - struct tcphdr *tcp; -#endif - uint16_t eh_type; - - ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | - F_LSO_LAST_SLICE; - eh = mtod(m, struct ether_header *); - eh_type = ntohs(eh->ether_type); - if (eh_type == ETHERTYPE_VLAN) { - struct ether_vlan_header *evh = (void *)eh; + KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && + m0->m_pkthdr.l4hlen > 0, + ("%s: mbuf %p needs TSO but missing header lengths", + __func__, m0)); + ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | + F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) + | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); + if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); - l3hdr = evh + 1; - eh_type = ntohs(evh->evl_proto); - } else - l3hdr = eh + 1; - - switch (eh_type) { -#ifdef INET6 - case ETHERTYPE_IPV6: - { - struct ip6_hdr *ip6 = l3hdr; - - /* - * XXX-BZ For now we do not pretend to support - * IPv6 extension headers. 
- */ - KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO " - "with ip6_nxt != TCP: %u", __func__, ip6->ip6_nxt)); - tcp = (struct tcphdr *)(ip6 + 1); + if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; - ctrl |= V_LSO_IPHDR_LEN(sizeof(*ip6) >> 2) | - V_LSO_TCPHDR_LEN(tcp->th_off); - break; - } -#endif -#ifdef INET - case ETHERTYPE_IP: - { - struct ip *ip = l3hdr; - - tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4); - ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) | - V_LSO_TCPHDR_LEN(tcp->th_off); - break; - } -#endif - default: - panic("%s: CSUM_TSO but no supported IP version " - "(0x%04x)", __func__, eh_type); - } lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); - lso->mss = htobe16(m->m_pkthdr.tso_segsz); + lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); @@ -3764,48 +3985,36 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, /* Checksum offload */ ctrl1 = 0; - if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))) + if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; - if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | - CSUM_TCP_IPV6 | CSUM_TSO))) + if (needs_l4_csum(m0) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; - if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | + if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ - if (m->m_flags & M_VLANTAG) { - ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); + if (needs_vlan_insertion(m0)) { + ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ - cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | - V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); + cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); - /* Software descriptor */ - txsd = &txq->sdesc[eq->pidx]; - txsd->desc_used = ndesc; - - eq->pending += ndesc; - eq->avail -= ndesc; - eq->pidx += ndesc; - if (eq->pidx >= eq->cap) - eq->pidx -= eq->cap; - /* SGL */ dst = (void *)(cpl + 1); - if (sgl->nsegs > 0) { - txsd->credits = 1; + if (nsegs > 0) { + + write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; - write_sgl_to_txd(eq, sgl, &dst); } else { - txsd->credits = 0; - txq->imm_wrs++; - for (; m; m = m->m_next) { + struct mbuf *m; + + for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; @@ -3814,245 +4023,225 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif - + txq->imm_wrs++; } txq->txpkt_wrs++; - return (0); + + txsd = &txq->sdesc[eq->pidx]; + txsd->m = m0; + txsd->desc_used = ndesc; + + return (ndesc); } -/* - * Returns 0 to indicate that m has been accepted into a coalesced tx work - * request. It has either been folded into txpkts or txpkts was flushed and m - * has started a new coalesced work request (as the first frame in a fresh - * txpkts). - * - * Returns non-zero to indicate a failure - caller is responsible for - * transmitting m, if there was anything in txpkts it has been flushed. 
- */ static int -add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts, - struct mbuf *m, struct sgl *sgl) +try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) { - struct sge_eq *eq = &txq->eq; - int can_coalesce; - struct tx_sdesc *txsd; - int flits; - - TXQ_LOCK_ASSERT_OWNED(txq); + u_int needed, nsegs1, nsegs2, l1, l2; - KASSERT(sgl->nsegs, ("%s: can't coalesce imm data", __func__)); + if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) + return (1); - if (txpkts->npkt > 0) { - flits = TXPKTS_PKT_HDR + sgl->nflits; - can_coalesce = m->m_pkthdr.tso_segsz == 0 && - txpkts->nflits + flits <= TX_WR_FLITS && - txpkts->nflits + flits <= eq->avail * 8 && - txpkts->plen + m->m_pkthdr.len < 65536; + nsegs1 = mbuf_nsegs(m); + nsegs2 = mbuf_nsegs(n); + if (nsegs1 + nsegs2 == 2) { + txp->wr_type = 1; + l1 = l2 = txpkts1_len16(); + } else { + txp->wr_type = 0; + l1 = txpkts0_len16(nsegs1); + l2 = txpkts0_len16(nsegs2); + } + txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; + needed = howmany(txp->len16, EQ_ESIZE / 16); + if (needed > SGE_MAX_WR_NDESC || needed > available) + return (1); - if (can_coalesce) { - txpkts->npkt++; - txpkts->nflits += flits; - txpkts->plen += m->m_pkthdr.len; + txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; + if (txp->plen > 65535) + return (1); - txsd = &txq->sdesc[eq->pidx]; - txsd->credits++; + txp->npkt = 2; + set_mbuf_len16(m, l1); + set_mbuf_len16(n, l2); - return (0); - } - - /* - * Couldn't coalesce m into txpkts. The first order of business - * is to send txpkts on its way. Then we'll revisit m. - */ - write_txpkts_wr(txq, txpkts); - } + return (0); +} - /* - * Check if we can start a new coalesced tx work request with m as - * the first packet in it. - */ +static int +add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) +{ + u_int plen, len16, needed, nsegs; - KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__)); + MPASS(txp->wr_type == 0 || txp->wr_type == 1); - flits = TXPKTS_WR_HDR + sgl->nflits; - can_coalesce = m->m_pkthdr.tso_segsz == 0 && - flits <= eq->avail * 8 && flits <= TX_WR_FLITS; + nsegs = mbuf_nsegs(m); + if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1)) + return (1); - if (can_coalesce == 0) - return (EINVAL); + plen = txp->plen + m->m_pkthdr.len; + if (plen > 65535) + return (1); - /* - * Start a fresh coalesced tx WR with m as the first frame in it. - */ - txpkts->npkt = 1; - txpkts->nflits = flits; - txpkts->flitp = &eq->desc[eq->pidx].flit[2]; - txpkts->plen = m->m_pkthdr.len; + if (txp->wr_type == 0) + len16 = txpkts0_len16(nsegs); + else + len16 = txpkts1_len16(); + needed = howmany(txp->len16 + len16, EQ_ESIZE / 16); + if (needed > SGE_MAX_WR_NDESC || needed > available) + return (1); - txsd = &txq->sdesc[eq->pidx]; - txsd->credits = 1; + txp->npkt++; + txp->plen = plen; + txp->len16 += len16; + set_mbuf_len16(m, len16); return (0); } /* - * Note that write_txpkts_wr can never run out of hardware descriptors (but - * write_txpkt_wr can). add_to_txpkts ensures that a frame is accepted for - * coalescing only if sufficient hardware descriptors are available. + * Write a txpkts WR for the packets in txp to the hardware descriptors, update + * the software descriptor, and advance the pidx. It is guaranteed that enough + * descriptors are available. + * + * The return value is the # of hardware descriptors used. 
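+ *
+ * A type 0 WR carries a ulp_txpkt/ulptx_idata pair ahead of each frame's
+ * CPL and may mix multi-segment frames; a type 1 WR omits that overhead
+ * but requires exactly one DMA segment per frame, which is what
+ * try_txpkts() and add_to_txpkts() enforce.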
*/ -static void -write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts) +static u_int +write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr, + struct mbuf *m0, const struct txpkts *txp, u_int available) { struct sge_eq *eq = &txq->eq; - struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; + struct cpl_tx_pkt_core *cpl; uint32_t ctrl; - int ndesc; + uint64_t ctrl1; + int ndesc, checkwrap; + struct mbuf *m; + void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); + MPASS(txp->npkt > 0); + MPASS(txp->plen < 65536); + MPASS(m0 != NULL); + MPASS(m0->m_nextpkt != NULL); + MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); + MPASS(available > 0 && available < eq->sidx); - ndesc = howmany(txpkts->nflits, 8); + ndesc = howmany(txp->len16, EQ_ESIZE / 16); + MPASS(ndesc <= available); - wr = (void *)&eq->desc[eq->pidx]; + MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); - ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2)); - if (eq->avail == ndesc) { - if (!(eq->flags & EQ_CRFLUSHED)) { - ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; - eq->flags |= EQ_CRFLUSHED; - } - eq->flags |= EQ_STALLED; - } + ctrl = V_FW_WR_LEN16(txp->len16); wr->equiq_to_len16 = htobe32(ctrl); - wr->plen = htobe16(txpkts->plen); - wr->npkt = txpkts->npkt; - wr->r3 = wr->type = 0; - - /* Everything else already written */ - - txsd = &txq->sdesc[eq->pidx]; - txsd->desc_used = ndesc; - - KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__)); - - eq->pending += ndesc; - eq->avail -= ndesc; - eq->pidx += ndesc; - if (eq->pidx >= eq->cap) - eq->pidx -= eq->cap; + wr->plen = htobe16(txp->plen); + wr->npkt = txp->npkt; + wr->r3 = 0; + wr->type = txp->wr_type; + flitp = wr + 1; - txq->txpkts_pkts += txpkts->npkt; - txq->txpkts_wrs++; - txpkts->npkt = 0; /* emptied */ -} + /* + * At this point we are 16B into a hardware descriptor. If checkwrap is + * set then we know the WR is going to wrap around somewhere. We'll + * check for that at appropriate points. 
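+	 * (The same wrap test appears in write_txpkt_wr(), which passes
+	 * eq->sidx - ndesc < eq->pidx straight to write_gl_to_txd().)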
+ */ + checkwrap = eq->sidx - ndesc < eq->pidx; + for (m = m0; m != NULL; m = m->m_nextpkt) { + if (txp->wr_type == 0) { + struct ulp_txpkt *ulpmc; + struct ulptx_idata *ulpsc; + + /* ULP master command */ + ulpmc = flitp; + ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | + V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); + ulpmc->len = htobe32(mbuf_len16(m)); + + /* ULP subcommand */ + ulpsc = (void *)(ulpmc + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | + F_ULP_TX_SC_MORE); + ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); + + cpl = (void *)(ulpsc + 1); + if (checkwrap && + (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) + cpl = (void *)&eq->desc[0]; + txq->txpkts0_pkts += txp->npkt; + txq->txpkts0_wrs++; + } else { + cpl = flitp; + txq->txpkts1_pkts += txp->npkt; + txq->txpkts1_wrs++; + } -static inline void -write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, - struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl) -{ - struct ulp_txpkt *ulpmc; - struct ulptx_idata *ulpsc; - struct cpl_tx_pkt_core *cpl; - struct sge_eq *eq = &txq->eq; - uintptr_t flitp, start, end; - uint64_t ctrl; - caddr_t dst; + /* Checksum offload */ + ctrl1 = 0; + if (needs_l3_csum(m) == 0) + ctrl1 |= F_TXPKT_IPCSUM_DIS; + if (needs_l4_csum(m) == 0) + ctrl1 |= F_TXPKT_L4CSUM_DIS; + if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | + CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) + txq->txcsum++; /* some hardware assistance provided */ + + /* VLAN tag insertion */ + if (needs_vlan_insertion(m)) { + ctrl1 |= F_TXPKT_VLAN_VLD | + V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); + txq->vlan_insertion++; + } - KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__)); + /* CPL header */ + cpl->ctrl0 = txq->cpl_ctrl0; + cpl->pack = 0; + cpl->len = htobe16(m->m_pkthdr.len); + cpl->ctrl1 = htobe64(ctrl1); - start = (uintptr_t)eq->desc; - end = (uintptr_t)eq->spg; + flitp = cpl + 1; + if (checkwrap && + (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) + flitp = (void *)&eq->desc[0]; - /* Checksum offload */ - ctrl = 0; - if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))) - ctrl |= F_TXPKT_IPCSUM_DIS; - if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | - CSUM_TCP_IPV6 | CSUM_TSO))) - ctrl |= F_TXPKT_L4CSUM_DIS; - if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | - CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) - txq->txcsum++; /* some hardware assistance provided */ + write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); - /* VLAN tag insertion */ - if (m->m_flags & M_VLANTAG) { - ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); - txq->vlan_insertion++; } - /* - * The previous packet's SGL must have ended at a 16 byte boundary (this - * is required by the firmware/hardware). It follows that flitp cannot - * wrap around between the ULPTX master command and ULPTX subcommand (8 - * bytes each), and that it can not wrap around in the middle of the - * cpl_tx_pkt_core either. 
- */ - flitp = (uintptr_t)txpkts->flitp; - KASSERT((flitp & 0xf) == 0, - ("%s: last SGL did not end at 16 byte boundary: %p", - __func__, txpkts->flitp)); - - /* ULP master command */ - ulpmc = (void *)flitp; - ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | - V_ULP_TXPKT_FID(eq->iqid)); - ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) + - sizeof(*cpl) + 8 * sgl->nflits, 16)); - - /* ULP subcommand */ - ulpsc = (void *)(ulpmc + 1); - ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) | - F_ULP_TX_SC_MORE); - ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); - - flitp += sizeof(*ulpmc) + sizeof(*ulpsc); - if (flitp == end) - flitp = start; - - /* CPL_TX_PKT */ - cpl = (void *)flitp; - cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | - V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); - cpl->pack = 0; - cpl->len = htobe16(m->m_pkthdr.len); - cpl->ctrl1 = htobe64(ctrl); - - flitp += sizeof(*cpl); - if (flitp == end) - flitp = start; - - /* SGL for this frame */ - dst = (caddr_t)flitp; - txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst); - txpkts->flitp = (void *)dst; + txsd = &txq->sdesc[eq->pidx]; + txsd->m = m0; + txsd->desc_used = ndesc; - KASSERT(((uintptr_t)dst & 0xf) == 0, - ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst)); + return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will - * add a 0 filled flit at the end. It returns 1 in that case. + * add a 0 filled flit at the end. */ -static int -write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) +static void +write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { - __be64 *flitp, *end; + struct sge_eq *eq = &txq->eq; + struct sglist *gl = txq->gl; + struct sglist_seg *seg; + __be64 *flitp, *wrap; struct ulptx_sgl *usgl; - bus_dma_segment_t *seg; - int i, padded; - - KASSERT(sgl->nsegs > 0 && sgl->nflits > 0, - ("%s: bad SGL - nsegs=%d, nflits=%d", - __func__, sgl->nsegs, sgl->nflits)); + int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); + MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); + MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); + get_pkt_gl(m, gl); + nsegs = gl->sg_nseg; + MPASS(nsegs > 0); + + nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); - end = flitp + sgl->nflits; - seg = &sgl->seg[0]; + wrap = (__be64 *)(&eq->desc[eq->sidx]); + seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* @@ -4062,58 +4251,60 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | - V_ULPTX_NSGE(sgl->nsegs)); - usgl->len0 = htobe32(seg->ds_len); - usgl->addr0 = htobe64(seg->ds_addr); + V_ULPTX_NSGE(nsegs)); + usgl->len0 = htobe32(seg->ss_len); + usgl->addr0 = htobe64(seg->ss_paddr); seg++; - if ((uintptr_t)end <= (uintptr_t)eq->spg) { + if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ - for (i = 0; i < sgl->nsegs - 1; i++, seg++) { - usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len); - usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr); + for (i = 0; i < nsegs - 1; i++, seg++) { + usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); + usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); + flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the 
rest flit by flit */ flitp = (void *)(usgl + 1); - for (i = 0; i < sgl->nflits - 2; i++) { - if ((uintptr_t)flitp == (uintptr_t)eq->spg) + for (i = 0; i < nflits - 2; i++) { + if (flitp == wrap) flitp = (void *)eq->desc; - *flitp++ = get_flit(seg, sgl->nsegs - 1, i); + *flitp++ = get_flit(seg, nsegs - 1, i); } - end = flitp; } - if ((uintptr_t)end & 0xf) { - *(uint64_t *)end = 0; - end++; - padded = 1; - } else - padded = 0; + if (nflits & 1) { + MPASS(((uintptr_t)flitp) & 0xf); + *flitp++ = 0; + } - if ((uintptr_t)end == (uintptr_t)eq->spg) + MPASS((((uintptr_t)flitp) & 0xf) == 0); + if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else - *to = (void *)end; - - return (padded); + *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { - if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) { + + MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); + MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); + + if (__predict_true((uintptr_t)(*to) + len <= + (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { - int portion = (uintptr_t)eq->spg - (uintptr_t)(*to); + int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; @@ -4124,21 +4315,21 @@ copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) } static inline void -ring_eq_db(struct adapter *sc, struct sge_eq *eq) +ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { - u_int db, pending; + u_int db; + + MPASS(n > 0); db = eq->doorbells; - pending = eq->pending; - if (pending > 1) + if (n > 1) clrbit(&db, DOORBELL_WCWR); - eq->pending = 0; wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: - *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending)); - return; + *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); + break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; @@ -4149,69 +4340,84 @@ ring_eq_db(struct adapter *sc, struct sge_eq *eq) * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ - KASSERT(eq->udb_qid == 0 && pending == 1, + KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", - __func__, eq->doorbells, pending, eq->pidx, eq)); + __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); - i = eq->pidx ? 
eq->pidx - 1 : eq->cap - 1; + i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); - return; + break; } case DOORBELL_UDBWC: - *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending)); + *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); - return; + break; case DOORBELL_KDB: t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), - V_QID(eq->cntxt_id) | V_PIDX(pending)); - return; + V_QID(eq->cntxt_id) | V_PIDX(n)); + break; } + + IDXINCR(eq->dbidx, n, eq->sidx); } -static inline int -reclaimable(struct sge_eq *eq) +static inline u_int +reclaimable_tx_desc(struct sge_eq *eq) { - unsigned int cidx; + uint16_t hw_cidx; - cidx = eq->spg->cidx; /* stable snapshot */ - cidx = be16toh(cidx); + hw_cidx = read_hw_cidx(eq); + return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); +} + +static inline u_int +total_available_tx_desc(struct sge_eq *eq) +{ + uint16_t hw_cidx, pidx; + + hw_cidx = read_hw_cidx(eq); + pidx = eq->pidx; - if (cidx >= eq->cidx) - return (cidx - eq->cidx); + if (pidx == hw_cidx) + return (eq->sidx - 1); else - return (cidx + eq->cap - eq->cidx); + return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); +} + +static inline uint16_t +read_hw_cidx(struct sge_eq *eq) +{ + struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; + uint16_t cidx = spg->cidx; /* stable snapshot */ + + return (be16toh(cidx)); } /* - * There are "can_reclaim" tx descriptors ready to be reclaimed. Reclaim as - * many as possible but stop when there are around "n" mbufs to free. - * - * The actual number reclaimed is provided as the return value. + * Reclaim 'n' descriptors approximately. */ -static int -reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n) +static u_int +reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; - struct tx_maps *txmaps; - struct tx_map *txm; - unsigned int reclaimed, maps; struct sge_eq *eq = &txq->eq; + u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); + MPASS(n > 0); - if (can_reclaim == 0) - can_reclaim = reclaimable(eq); - - maps = reclaimed = 0; - while (can_reclaim && maps < n) { + reclaimed = 0; + can_reclaim = reclaimable_tx_desc(eq); + while (can_reclaim && reclaimed < n) { int ndesc; + struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; @@ -4221,73 +4427,37 @@ reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n) ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); - maps += txsd->credits; - + for (m = txsd->m; m != NULL; m = nextpkt) { + nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + } reclaimed += ndesc; can_reclaim -= ndesc; - - eq->cidx += ndesc; - if (__predict_false(eq->cidx >= eq->cap)) - eq->cidx -= eq->cap; - } - - txmaps = &txq->txmaps; - txm = &txmaps->maps[txmaps->map_cidx]; - if (maps) - prefetch(txm->m); - - eq->avail += reclaimed; - KASSERT(eq->avail < eq->cap, /* avail tops out at (cap - 1) */ - ("%s: too many descriptors available", __func__)); - - txmaps->map_avail += maps; - KASSERT(txmaps->map_avail <= txmaps->map_total, - ("%s: too many maps available", __func__)); - - while (maps--) { - struct tx_map *next; - - next = txm + 1; - if (__predict_false(txmaps->map_cidx + 1 == txmaps->map_total)) - next = txmaps->maps; - prefetch(next->m); - - bus_dmamap_unload(txq->tx_tag, txm->map); - m_freem(txm->m); - txm->m = NULL; - - txm = next; - if (__predict_false(++txmaps->map_cidx == txmaps->map_total)) - txmaps->map_cidx = 0; + IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void 
-write_eqflush_wr(struct sge_eq *eq)
+tx_reclaim(void *arg, int n)
 {
-	struct fw_eq_flush_wr *wr;
+	struct sge_txq *txq = arg;
+	struct sge_eq *eq = &txq->eq;

-	EQ_LOCK_ASSERT_OWNED(eq);
-	KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__));
-	KASSERT(!(eq->flags & EQ_CRFLUSHED), ("%s: flushed already", __func__));
-
-	wr = (void *)&eq->desc[eq->pidx];
-	bzero(wr, sizeof(*wr));
-	wr->opcode = FW_EQ_FLUSH_WR;
-	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) |
-	    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
-
-	eq->flags |= (EQ_CRFLUSHED | EQ_STALLED);
-	eq->pending++;
-	eq->avail--;
-	if (++eq->pidx == eq->cap)
-		eq->pidx = 0;
+	do {
+		if (TXQ_TRYLOCK(txq) == 0)
+			break;
+		n = reclaim_tx_descs(txq, 32);
+		if (eq->cidx == eq->pidx)
+			eq->equeqidx = eq->pidx;
+		TXQ_UNLOCK(txq);
+	} while (n > 0);
 }

 static __be64
-get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
+get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;

@@ -4295,16 +4465,16 @@
 	case 0: {
 		__be64 rc;

-		rc = htobe32(sgl[i].ds_len);
+		rc = htobe32(segs[i].ss_len);
 		if (i + 1 < nsegs)
-			rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32;
+			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;

 		return (rc);
 	}
 	case 1:
-		return htobe64(sgl[i].ds_addr);
+		return (htobe64(segs[i].ss_paddr));
 	case 2:
-		return htobe64(sgl[i + 1].ds_addr);
+		return (htobe64(segs[i + 1].ss_paddr));
 	}

 	return (0);
@@ -4499,6 +4669,27 @@ add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 	mtx_unlock(&sc->sfl_lock);
 }

+static void
+handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
+{
+	struct sge_wrq *wrq = (void *)eq;
+
+	atomic_readandclear_int(&eq->equiq);
+	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
+}
+
+static void
+handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
+{
+	struct sge_txq *txq = (void *)eq;
+
+	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
+
+	atomic_readandclear_int(&eq->equiq);
+	mp_ring_check_drainage(txq->r, 0);
+	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
+}
+
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
@@ -4508,22 +4699,15 @@ handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
+	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
+	    &handle_wrq_egr_update, &handle_eth_egr_update,
+	    &handle_wrq_egr_update};

 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));

 	eq = s->eqmap[qid - s->eq_start];
-	EQ_LOCK(eq);
-	KASSERT(eq->flags & EQ_CRFLUSHED,
-	    ("%s: unsolicited egress update", __func__));
-	eq->flags &= ~EQ_CRFLUSHED;
-	eq->egr_update++;
-
-	if (__predict_false(eq->flags & EQ_DOOMED))
-		wakeup_one(eq);
-	else if (eq->flags & EQ_STALLED && can_resume_tx(eq))
-		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
-	EQ_UNLOCK(eq);
+	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);

 	return (0);
 }
diff --git a/sys/modules/cxgbe/if_cxgbe/Makefile b/sys/modules/cxgbe/if_cxgbe/Makefile
index e4828f7..a66e45a 100644
--- a/sys/modules/cxgbe/if_cxgbe/Makefile
+++ b/sys/modules/cxgbe/if_cxgbe/Makefile
@@ -15,6 +15,7 @@ SRCS+= pci_if.h
 SRCS+= t4_hw.c
 SRCS+= t4_l2t.c
 SRCS+= t4_main.c
+SRCS+= t4_mp_ring.c
 SRCS+= t4_netmap.c
 SRCS+= t4_sge.c
 SRCS+= t4_tracer.c
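The pidx/cidx arithmetic throughout the new t4_sge.c code relies on the
IDXDIFF and IDXINCR ring-index helpers, whose definitions fall outside this
hunk. The standalone sketch below shows plausible semantics for them; the
macro bodies here are illustrative, not the commit's actual definitions.

#include <assert.h>

/* Distance from 'tail' forward to 'head' in a ring of 'wrap' slots. */
#define IDXDIFF(head, tail, wrap) \
	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))

/* Advance 'idx' by 'incr' slots, wrapping at 'wrap'. */
#define IDXINCR(idx, incr, wrap) do {	\
	(idx) = ((idx) + (incr)) % (wrap);	\
} while (0)

int
main(void)
{
	unsigned int pidx = 1020, cidx = 4, sidx = 1024;

	/* 1020, 1021, 1022, 1023, 0, 1, 2, 3: 8 descriptors in flight. */
	assert(IDXDIFF(cidx, pidx, sidx) == 8);

	IDXINCR(pidx, 6, sidx);	/* produce 6 more, wrapping past sidx */
	assert(pidx == 2);
	return (0);
}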