From b2f095aaa6409529c4b5377e0d521e55e60a1d81 Mon Sep 17 00:00:00 2001
From: np <np@FreeBSD.org>
Date: Wed, 31 Dec 2014 23:19:16 +0000
Subject: cxgbe(4): major tx rework.

a) Front load as much work as possible in if_transmit, before any driver
lock or software queue has to get involved.

b) Replace buf_ring with a brand new mp_ring (multiproducer ring).  This
is specifically for the tx multiqueue model where one of the if_transmit
producer threads becomes the consumer and other producers carry on as
usual.  mp_ring is implemented as standalone code and it should be
possible to use it in any driver with tx multiqueue.  It also has:
- the ability to enqueue/dequeue multiple items.  This might become
  significant if packet batching is ever implemented.
- an abdication mechanism to allow a thread to give up writing tx
  descriptors and have another if_transmit thread take over.  A thread
  that's writing tx descriptors can end up doing so for an unbounded
  time period if a) there are other if_transmit threads continuously
  feeding the software queue, and b) the chip keeps up with whatever the
  thread is throwing at it.
- accurate statistics about interesting events even when the stats come
  at the expense of additional branches/conditional code.

The NIC txq lock is uncontested on the fast path at this point.  I've
left it there for synchronization with the control events (interface
up/down, modload/unload).

c) Add support for "type 1" coalescing work request in the normal NIC tx
path.  This work request is optimized for frames with a single item in
the DMA gather list.  These are very common when forwarding packets.
Note that netmap tx in cxgbe already uses these "type 1" work requests.

d) Do not request automatic cidx updates every 32 descriptors.  Instead,
request updates via bits in individual work requests (still every 32
descriptors approximately).  Also, request an automatic final update
when the queue idles after activity.  This means NIC tx reclaim is still
performed lazily but it will catch up quickly as soon as the queue
idles.  This seems to be the best middle ground and I'll probably do
something similar for netmap tx as well.

e) Implement a faster tx path for WRQs (used by TOE tx and control
queues, _not_ by the normal NIC tx).  Allow work requests to be written
directly to the hardware descriptor ring if room is available.  I will
convert t4_tom and iw_cxgbe modules to this faster style gradually.

MFC after:	2 months
---
 sys/conf/files                      |    2 +
 sys/dev/cxgbe/adapter.h             |  116 +-
 sys/dev/cxgbe/t4_l2t.c              |    9 +-
 sys/dev/cxgbe/t4_main.c             |  264 ++---
 sys/dev/cxgbe/t4_mp_ring.c          |  364 +++++++
 sys/dev/cxgbe/t4_mp_ring.h          |   68 ++
 sys/dev/cxgbe/t4_sge.c              | 1996 +++++++++++++++++++----------------
 sys/modules/cxgbe/if_cxgbe/Makefile |    1 +
 8 files changed, 1687 insertions(+), 1133 deletions(-)
 create mode 100644 sys/dev/cxgbe/t4_mp_ring.c
 create mode 100644 sys/dev/cxgbe/t4_mp_ring.h

(limited to 'sys')

diff --git a/sys/conf/files b/sys/conf/files
index 3884c11..9e55f42 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1142,6 +1142,8 @@ dev/cxgb/sys/uipc_mvec.c	optional cxgb pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgb"
 dev/cxgb/cxgb_t3fw.c		optional cxgb cxgb_t3fw \
 	compile-with "${NORMAL_C} -I$S/dev/cxgb"
+dev/cxgbe/t4_mp_ring.c		optional cxgbe pci \
+	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_main.c		optional cxgbe pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_netmap.c		optional cxgbe pci \
diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index ec84bb4..62ff9af 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -152,7 +152,8 @@ enum {
 	CL_METADATA_SIZE = CACHE_LINE_SIZE,
 
 	SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */
-	TX_SGL_SEGS = 36,
+	TX_SGL_SEGS = 39,
+	TX_SGL_SEGS_TSO = 38,
 	TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
@@ -273,6 +274,7 @@ struct port_info {
 	struct timeval last_refreshed;
  	struct port_stats stats;
 	u_int tnl_cong_drops;
+	u_int tx_parse_error;
 
 	eventhandler_tag vlan_c;
 
@@ -308,23 +310,9 @@ struct tx_desc {
 	__be64 flit[8];
 };
 
-struct tx_map {
-	struct mbuf *m;
-	bus_dmamap_t map;
-};
-
-/* DMA maps used for tx */
-struct tx_maps {
-	struct tx_map *maps;
-	uint32_t map_total;	/* # of DMA maps */
-	uint32_t map_pidx;	/* next map to be used */
-	uint32_t map_cidx;	/* reclaimed up to this index */
-	uint32_t map_avail;	/* # of available maps */
-};
-
 struct tx_sdesc {
+	struct mbuf *m;		/* m_nextpkt linked chain of frames */
 	uint8_t desc_used;	/* # of hardware descriptors used by the WR */
-	uint8_t credits;	/* NIC txq: # of frames sent out in the WR */
 };
 
 
@@ -378,16 +366,12 @@ struct sge_iq {
 enum {
 	EQ_CTRL		= 1,
 	EQ_ETH		= 2,
-#ifdef TCP_OFFLOAD
 	EQ_OFLD		= 3,
-#endif
 
 	/* eq flags */
-	EQ_TYPEMASK	= 7,		/* 3 lsbits hold the type */
-	EQ_ALLOCATED	= (1 << 3),	/* firmware resources allocated */
-	EQ_DOOMED	= (1 << 4),	/* about to be destroyed */
-	EQ_CRFLUSHED	= (1 << 5),	/* expecting an update from SGE */
-	EQ_STALLED	= (1 << 6),	/* out of hw descriptors or dmamaps */
+	EQ_TYPEMASK	= 0x3,		/* 2 lsbits hold the type (see above) */
+	EQ_ALLOCATED	= (1 << 2),	/* firmware resources allocated */
+	EQ_ENABLED	= (1 << 3),	/* open for business */
 };
 
 /* Listed in order of preference.  Update t4_sysctls too if you change these */
@@ -402,32 +386,25 @@ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB};
 struct sge_eq {
 	unsigned int flags;	/* MUST be first */
 	unsigned int cntxt_id;	/* SGE context id for the eq */
-	bus_dma_tag_t desc_tag;
-	bus_dmamap_t desc_map;
-	char lockname[16];
 	struct mtx eq_lock;
 
 	struct tx_desc *desc;	/* KVA of descriptor ring */
-	bus_addr_t ba;		/* bus address of descriptor ring */
-	struct sge_qstat *spg;	/* status page, for convenience */
 	uint16_t doorbells;
 	volatile uint32_t *udb;	/* KVA of doorbell (lies within BAR2) */
 	u_int udb_qid;		/* relative qid within the doorbell page */
-	uint16_t cap;		/* max # of desc, for convenience */
-	uint16_t avail;		/* available descriptors, for convenience */
-	uint16_t qsize;		/* size (# of entries) of the queue */
+	uint16_t sidx;		/* index of the entry with the status page */
 	uint16_t cidx;		/* consumer idx (desc idx) */
 	uint16_t pidx;		/* producer idx (desc idx) */
-	uint16_t pending;	/* # of descriptors used since last doorbell */
+	uint16_t equeqidx;	/* EQUEQ last requested at this pidx */
+	uint16_t dbidx;		/* pidx of the most recent doorbell */
 	uint16_t iqid;		/* iq that gets egr_update for the eq */
 	uint8_t tx_chan;	/* tx channel used by the eq */
-	struct task tx_task;
-	struct callout tx_callout;
-
-	/* stats */
+	volatile u_int equiq;	/* EQUIQ outstanding */
 
-	uint32_t egr_update;	/* # of SGE_EGR_UPDATE notifications for eq */
-	uint32_t unstalled;	/* recovered from stall */
+	bus_dma_tag_t desc_tag;
+	bus_dmamap_t desc_map;
+	bus_addr_t ba;		/* bus address of descriptor ring */
+	char lockname[16];
 };
 
 struct sw_zone_info {
@@ -499,18 +476,19 @@ struct sge_fl {
 	struct cluster_layout cll_alt;	/* alternate refill zone, layout */
 };
 
+struct mp_ring;
+
 /* txq: SGE egress queue + what's needed for Ethernet NIC */
 struct sge_txq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct ifnet *ifp;	/* the interface this txq belongs to */
-	bus_dma_tag_t tx_tag;	/* tag for transmit buffers */
-	struct buf_ring *br;	/* tx buffer ring */
+	struct mp_ring *r;	/* tx software ring */
 	struct tx_sdesc *sdesc;	/* KVA of software descriptor ring */
-	struct mbuf *m;		/* held up due to temporary resource shortage */
-
-	struct tx_maps txmaps;
+	struct sglist *gl;
+	__be32 cpl_ctrl0;	/* for convenience */
 
+	struct task tx_reclaim_task;
 	/* stats for common events first */
 
 	uint64_t txcsum;	/* # of times hardware assisted with checksum */
@@ -519,13 +497,12 @@ struct sge_txq {
 	uint64_t imm_wrs;	/* # of work requests with immediate data */
 	uint64_t sgl_wrs;	/* # of work requests with direct SGL */
 	uint64_t txpkt_wrs;	/* # of txpkt work requests (not coalesced) */
-	uint64_t txpkts_wrs;	/* # of coalesced tx work requests */
-	uint64_t txpkts_pkts;	/* # of frames in coalesced tx work requests */
+	uint64_t txpkts0_wrs;	/* # of type0 coalesced tx work requests */
+	uint64_t txpkts1_wrs;	/* # of type1 coalesced tx work requests */
+	uint64_t txpkts0_pkts;	/* # of frames in type0 coalesced tx WRs */
+	uint64_t txpkts1_pkts;	/* # of frames in type1 coalesced tx WRs */
 
 	/* stats for not-that-common events */
-
-	uint32_t no_dmamap;	/* no DMA map to load the mbuf */
-	uint32_t no_desc;	/* out of hardware descriptors */
 } __aligned(CACHE_LINE_SIZE);
 
 /* rxq: SGE ingress queue + SGE free list + miscellaneous items */
@@ -574,7 +551,13 @@ struct wrqe {
 	STAILQ_ENTRY(wrqe) link;
 	struct sge_wrq *wrq;
 	int wr_len;
-	uint64_t wr[] __aligned(16);
+	char wr[] __aligned(16);
+};
+
+struct wrq_cookie {
+	TAILQ_ENTRY(wrq_cookie) link;
+	int ndesc;
+	int pidx;
 };
 
 /*
@@ -585,17 +568,32 @@ struct sge_wrq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct adapter *adapter;
+	struct task wrq_tx_task;
+
+	/* Tx desc reserved but WR not "committed" yet. */
+	TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs;
 
-	/* List of WRs held up due to lack of tx descriptors */
+	/* List of WRs ready to go out as soon as descriptors are available. */
 	STAILQ_HEAD(, wrqe) wr_list;
+	u_int nwr_pending;
+	u_int ndesc_needed;
 
 	/* stats for common events first */
 
-	uint64_t tx_wrs;	/* # of tx work requests */
+	uint64_t tx_wrs_direct;	/* # of WRs written directly to desc ring. */
+	uint64_t tx_wrs_ss;	/* # of WRs copied from scratch space. */
+	uint64_t tx_wrs_copied;	/* # of WRs queued and copied to desc ring. */
 
 	/* stats for not-that-common events */
 
-	uint32_t no_desc;	/* out of hardware descriptors */
+	/*
+	 * Scratch space for work requests that wrap around after reaching the
+	 * status page, and some information about the last WR that used it.
+	 */
+	uint16_t ss_pidx;
+	uint16_t ss_len;
+	uint8_t ss[SGE_MAX_WR_LEN];
+
 } __aligned(CACHE_LINE_SIZE);
 
 
@@ -744,7 +742,7 @@ struct adapter {
 	struct sge sge;
 	int lro_timeout;
 
-	struct taskqueue *tq[NCHAN];	/* taskqueues that flush data out */
+	struct taskqueue *tq[NCHAN];	/* General purpose taskqueues */
 	struct port_info *port[MAX_NPORTS];
 	uint8_t chan_map[NCHAN];
 
@@ -978,12 +976,11 @@ static inline int
 tx_resume_threshold(struct sge_eq *eq)
 {
 
-	return (eq->qsize / 4);
+	/* not quite the same as qsize / 4, but this will do. */
+	return (eq->sidx / 4);
 }
 
 /* t4_main.c */
-void t4_tx_task(void *, int);
-void t4_tx_callout(void *);
 int t4_os_find_pci_capability(struct adapter *, int);
 int t4_os_pci_save_state(struct adapter *);
 int t4_os_pci_restore_state(struct adapter *);
@@ -1024,16 +1021,15 @@ int t4_setup_adapter_queues(struct adapter *);
 int t4_teardown_adapter_queues(struct adapter *);
 int t4_setup_port_queues(struct port_info *);
 int t4_teardown_port_queues(struct port_info *);
-int t4_alloc_tx_maps(struct tx_maps *, bus_dma_tag_t, int, int);
-void t4_free_tx_maps(struct tx_maps *, bus_dma_tag_t);
 void t4_intr_all(void *);
 void t4_intr(void *);
 void t4_intr_err(void *);
 void t4_intr_evt(void *);
 void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
-int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *);
 void t4_update_fl_bufsize(struct ifnet *);
-int can_resume_tx(struct sge_eq *);
+int parse_pkt(struct mbuf **);
+void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *);
+void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *);
 
 /* t4_tracer.c */
 struct t4_tracer;
diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c
index 6f7378a..cca1bf3 100644
--- a/sys/dev/cxgbe/t4_l2t.c
+++ b/sys/dev/cxgbe/t4_l2t.c
@@ -113,16 +113,15 @@ found:
 int
 t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
 {
-	struct wrqe *wr;
+	struct wrq_cookie cookie;
 	struct cpl_l2t_write_req *req;
 	int idx = e->idx + sc->vres.l2t.start;
 
 	mtx_assert(&e->lock, MA_OWNED);
 
-	wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq);
-	if (wr == NULL)
+	req = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*req), 16), &cookie);
+	if (req == NULL)
 		return (ENOMEM);
-	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, idx |
@@ -132,7 +131,7 @@ t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
 	req->vlan = htons(e->vlan);
 	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
 
-	t4_wrq_tx(sc, wr);
+	commit_wrq_wr(&sc->sge.mgmtq, req, &cookie);
 
 	if (sync && e->state != L2T_STATE_SWITCHING)
 		e->state = L2T_STATE_SYNC_WRITE;
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index 2c384fd..39dc816 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
 #include "common/t4_regs_values.h"
 #include "t4_ioctl.h"
 #include "t4_l2t.h"
+#include "t4_mp_ring.h"
 
 /* T4 bus driver interface */
 static int t4_probe(device_t);
@@ -378,7 +379,8 @@ static void build_medialist(struct port_info *, struct ifmedia *);
 static int cxgbe_init_synchronized(struct port_info *);
 static int cxgbe_uninit_synchronized(struct port_info *);
 static int setup_intr_handlers(struct adapter *);
-static void quiesce_eq(struct adapter *, struct sge_eq *);
+static void quiesce_txq(struct adapter *, struct sge_txq *);
+static void quiesce_wrq(struct adapter *, struct sge_wrq *);
 static void quiesce_iq(struct adapter *, struct sge_iq *);
 static void quiesce_fl(struct adapter *, struct sge_fl *);
 static int t4_alloc_irq(struct adapter *, struct irq *, int rid,
@@ -434,7 +436,6 @@ static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS);
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 #endif
-static inline void txq_start(struct ifnet *, struct sge_txq *);
 static uint32_t fconf_to_mode(uint32_t);
 static uint32_t mode_to_fconf(uint32_t);
 static uint32_t fspec_to_fconf(struct t4_filter_specification *);
@@ -1429,67 +1430,36 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
-	struct sge_txq *txq = &sc->sge.txq[pi->first_txq];
-	struct buf_ring *br;
+	struct sge_txq *txq;
+	void *items[1];
 	int rc;
 
 	M_ASSERTPKTHDR(m);
+	MPASS(m->m_nextpkt == NULL);	/* not quite ready for this yet */
 
 	if (__predict_false(pi->link_cfg.link_ok == 0)) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
-	/* check if flowid is set */
-	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
-		txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq))
-		    + pi->rsrv_noflowq);
-	br = txq->br;
-
-	if (TXQ_TRYLOCK(txq) == 0) {
-		struct sge_eq *eq = &txq->eq;
-
-		/*
-		 * It is possible that t4_eth_tx finishes up and releases the
-		 * lock between the TRYLOCK above and the drbr_enqueue here.  We
-		 * need to make sure that this mbuf doesn't just sit there in
-		 * the drbr.
-		 */
-
-		rc = drbr_enqueue(ifp, br, m);
-		if (rc == 0 && callout_pending(&eq->tx_callout) == 0 &&
-		    !(eq->flags & EQ_DOOMED))
-			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
+	rc = parse_pkt(&m);
+	if (__predict_false(rc != 0)) {
+		MPASS(m == NULL);			/* was freed already */
+		atomic_add_int(&pi->tx_parse_error, 1);	/* rare, atomic is ok */
 		return (rc);
 	}
 
-	/*
-	 * txq->m is the mbuf that is held up due to a temporary shortage of
-	 * resources and it should be put on the wire first.  Then what's in
-	 * drbr and finally the mbuf that was just passed in to us.
-	 *
-	 * Return code should indicate the fate of the mbuf that was passed in
-	 * this time.
-	 */
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-	if (drbr_needs_enqueue(ifp, br) || txq->m) {
-
-		/* Queued for transmission. */
-
-		rc = drbr_enqueue(ifp, br, m);
-		m = txq->m ? txq->m : drbr_dequeue(ifp, br);
-		(void) t4_eth_tx(ifp, txq, m);
-		TXQ_UNLOCK(txq);
-		return (rc);
-	}
+	/* Select a txq. */
+	txq = &sc->sge.txq[pi->first_txq];
+	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+		txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) +
+		    pi->rsrv_noflowq);
 
-	/* Direct transmission. */
-	rc = t4_eth_tx(ifp, txq, m);
-	if (rc != 0 && txq->m)
-		rc = 0;	/* held, will be transmitted soon (hopefully) */
+	items[0] = m;
+	rc = mp_ring_enqueue(txq->r, items, 1, 4096);
+	if (__predict_false(rc != 0))
+		m_freem(m);
 
-	TXQ_UNLOCK(txq);
 	return (rc);
 }
 
@@ -1499,17 +1469,17 @@ cxgbe_qflush(struct ifnet *ifp)
 	struct port_info *pi = ifp->if_softc;
 	struct sge_txq *txq;
 	int i;
-	struct mbuf *m;
 
 	/* queues do not exist if !PORT_INIT_DONE. */
 	if (pi->flags & PORT_INIT_DONE) {
 		for_each_txq(pi, i, txq) {
 			TXQ_LOCK(txq);
-			m_freem(txq->m);
-			txq->m = NULL;
-			while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
-				m_freem(m);
+			txq->eq.flags &= ~EQ_ENABLED;
 			TXQ_UNLOCK(txq);
+			while (!mp_ring_is_idle(txq->r)) {
+				mp_ring_check_drainage(txq->r, 0);
+				pause("qflush", 1);
+			}
 		}
 	}
 	if_qflush(ifp);
@@ -1564,7 +1534,7 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c)
 			struct sge_txq *txq;
 
 			for_each_txq(pi, i, txq)
-				drops += txq->br->br_drops;
+				drops += counter_u64_fetch(txq->r->drops);
 		}
 
 		return (drops);
@@ -3236,7 +3206,8 @@ cxgbe_init_synchronized(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
-	int rc = 0;
+	int rc = 0, i;
+	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
@@ -3265,6 +3236,17 @@ cxgbe_init_synchronized(struct port_info *pi)
 	}
 
 	/*
+	 * Can't fail from this point onwards.  Review cxgbe_uninit_synchronized
+	 * if this changes.
+	 */
+
+	for_each_txq(pi, i, txq) {
+		TXQ_LOCK(txq);
+		txq->eq.flags |= EQ_ENABLED;
+		TXQ_UNLOCK(txq);
+	}
+
+	/*
 	 * The first iq of the first port to come up is used for tracing.
 	 */
 	if (sc->traceq < 0) {
@@ -3297,7 +3279,8 @@ cxgbe_uninit_synchronized(struct port_info *pi)
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
-	int rc;
+	int rc, i;
+	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
@@ -3314,6 +3297,12 @@ cxgbe_uninit_synchronized(struct port_info *pi)
 		return (rc);
 	}
 
+	for_each_txq(pi, i, txq) {
+		TXQ_LOCK(txq);
+		txq->eq.flags &= ~EQ_ENABLED;
+		TXQ_UNLOCK(txq);
+	}
+
 	clrbit(&sc->open_device_map, pi->port_id);
 	PORT_LOCK(pi);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
@@ -3543,15 +3532,17 @@ port_full_uninit(struct port_info *pi)
 
 	if (pi->flags & PORT_INIT_DONE) {
 
-		/* Need to quiesce queues.  XXX: ctrl queues? */
+		/* Need to quiesce queues.  */
+
+		quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
 
 		for_each_txq(pi, i, txq) {
-			quiesce_eq(sc, &txq->eq);
+			quiesce_txq(sc, txq);
 		}
 
 #ifdef TCP_OFFLOAD
 		for_each_ofld_txq(pi, i, ofld_txq) {
-			quiesce_eq(sc, &ofld_txq->eq);
+			quiesce_wrq(sc, ofld_txq);
 		}
 #endif
 
@@ -3576,23 +3567,39 @@ port_full_uninit(struct port_info *pi)
 }
 
 static void
-quiesce_eq(struct adapter *sc, struct sge_eq *eq)
+quiesce_txq(struct adapter *sc, struct sge_txq *txq)
 {
-	EQ_LOCK(eq);
-	eq->flags |= EQ_DOOMED;
+	struct sge_eq *eq = &txq->eq;
+	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 
-	/*
-	 * Wait for the response to a credit flush if one's
-	 * pending.
-	 */
-	while (eq->flags & EQ_CRFLUSHED)
-		mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0);
-	EQ_UNLOCK(eq);
+	(void) sc;	/* unused */
+
+#ifdef INVARIANTS
+	TXQ_LOCK(txq);
+	MPASS((eq->flags & EQ_ENABLED) == 0);
+	TXQ_UNLOCK(txq);
+#endif
+
+	/* Wait for the mp_ring to empty. */
+	while (!mp_ring_is_idle(txq->r)) {
+		mp_ring_check_drainage(txq->r, 0);
+		pause("rquiesce", 1);
+	}
 
-	callout_drain(&eq->tx_callout);	/* XXX: iffy */
-	pause("callout", 10);		/* Still iffy */
+	/* Then wait for the hardware to finish. */
+	while (spg->cidx != htobe16(eq->pidx))
+		pause("equiesce", 1);
 
-	taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task);
+	/* Finally, wait for the driver to reclaim all descriptors. */
+	while (eq->cidx != eq->pidx)
+		pause("dquiesce", 1);
+}
+
+static void
+quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq)
+{
+
+	/* XXXTX */
 }
 
 static void
@@ -4892,6 +4899,9 @@ cxgbe_sysctls(struct port_info *pi)
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD,
 	    NULL, "port statistics");
 	children = SYSCTL_CHILDREN(oid);
+	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
+	    &pi->tx_parse_error, 0,
+	    "# of tx packets with invalid length or # of segments");
 
 #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \
 	SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \
@@ -6947,74 +6957,6 @@ sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS)
 }
 #endif
 
-static inline void
-txq_start(struct ifnet *ifp, struct sge_txq *txq)
-{
-	struct buf_ring *br;
-	struct mbuf *m;
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-
-	br = txq->br;
-	m = txq->m ? txq->m : drbr_dequeue(ifp, br);
-	if (m)
-		t4_eth_tx(ifp, txq, m);
-}
-
-void
-t4_tx_callout(void *arg)
-{
-	struct sge_eq *eq = arg;
-	struct adapter *sc;
-
-	if (EQ_TRYLOCK(eq) == 0)
-		goto reschedule;
-
-	if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) {
-		EQ_UNLOCK(eq);
-reschedule:
-		if (__predict_true(!(eq->flags && EQ_DOOMED)))
-			callout_schedule(&eq->tx_callout, 1);
-		return;
-	}
-
-	EQ_LOCK_ASSERT_OWNED(eq);
-
-	if (__predict_true((eq->flags & EQ_DOOMED) == 0)) {
-
-		if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) {
-			struct sge_txq *txq = arg;
-			struct port_info *pi = txq->ifp->if_softc;
-
-			sc = pi->adapter;
-		} else {
-			struct sge_wrq *wrq = arg;
-
-			sc = wrq->adapter;
-		}
-
-		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
-	}
-
-	EQ_UNLOCK(eq);
-}
-
-void
-t4_tx_task(void *arg, int count)
-{
-	struct sge_eq *eq = arg;
-
-	EQ_LOCK(eq);
-	if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) {
-		struct sge_txq *txq = arg;
-		txq_start(txq->ifp, txq);
-	} else {
-		struct sge_wrq *wrq = arg;
-		t4_wrq_tx_locked(wrq->adapter, wrq, NULL);
-	}
-	EQ_UNLOCK(eq);
-}
-
 static uint32_t
 fconf_to_mode(uint32_t fconf)
 {
@@ -7452,9 +7394,9 @@ static int
 set_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
-	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
+	struct wrq_cookie cookie;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
@@ -7473,12 +7415,10 @@ set_filter_wr(struct adapter *sc, int fidx)
 
 	ftid = sc->tids.ftid_base + fidx;
 
-	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
-	if (wr == NULL)
+	fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie);
+	if (fwr == NULL)
 		return (ENOMEM);
-
-	fwr = wrtod(wr);
-	bzero(fwr, sizeof (*fwr));
+	bzero(fwr, sizeof(*fwr));
 
 	fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR));
 	fwr->len16_pkd = htobe32(FW_LEN16(*fwr));
@@ -7547,7 +7487,7 @@ set_filter_wr(struct adapter *sc, int fidx)
 	f->pending = 1;
 	sc->tids.ftids_in_use++;
 
-	t4_wrq_tx(sc, wr);
+	commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie);
 	return (0);
 }
 
@@ -7555,22 +7495,21 @@ static int
 del_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
-	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
+	struct wrq_cookie cookie;
 
 	ftid = sc->tids.ftid_base + fidx;
 
-	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
-	if (wr == NULL)
+	fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie);
+	if (fwr == NULL)
 		return (ENOMEM);
-	fwr = wrtod(wr);
 	bzero(fwr, sizeof (*fwr));
 
 	t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id);
 
 	f->pending = 1;
-	t4_wrq_tx(sc, wr);
+	commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie);
 	return (0);
 }
 
@@ -8170,6 +8109,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
 
 		/* MAC stats */
 		t4_clr_port_stats(sc, pi->tx_chan);
+		pi->tx_parse_error = 0;
 
 		if (pi->flags & PORT_INIT_DONE) {
 			struct sge_rxq *rxq;
@@ -8192,24 +8132,24 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
 				txq->imm_wrs = 0;
 				txq->sgl_wrs = 0;
 				txq->txpkt_wrs = 0;
-				txq->txpkts_wrs = 0;
-				txq->txpkts_pkts = 0;
-				txq->br->br_drops = 0;
-				txq->no_dmamap = 0;
-				txq->no_desc = 0;
+				txq->txpkts0_wrs = 0;
+				txq->txpkts1_wrs = 0;
+				txq->txpkts0_pkts = 0;
+				txq->txpkts1_pkts = 0;
+				mp_ring_reset_stats(txq->r);
 			}
 
 #ifdef TCP_OFFLOAD
 			/* nothing to clear for each ofld_rxq */
 
 			for_each_ofld_txq(pi, i, wrq) {
-				wrq->tx_wrs = 0;
-				wrq->no_desc = 0;
+				wrq->tx_wrs_direct = 0;
+				wrq->tx_wrs_copied = 0;
 			}
 #endif
 			wrq = &sc->sge.ctrlq[pi->port_id];
-			wrq->tx_wrs = 0;
-			wrq->no_desc = 0;
+			wrq->tx_wrs_direct = 0;
+			wrq->tx_wrs_copied = 0;
 		}
 		break;
 	}
diff --git a/sys/dev/cxgbe/t4_mp_ring.c b/sys/dev/cxgbe/t4_mp_ring.c
new file mode 100644
index 0000000..ef09f01
--- /dev/null
+++ b/sys/dev/cxgbe/t4_mp_ring.c
@@ -0,0 +1,364 @@
+/*-
+ * Copyright (c) 2014 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <machine/cpu.h>
+
+#include "t4_mp_ring.h"
+
+union ring_state {
+	struct {
+		uint16_t pidx_head;
+		uint16_t pidx_tail;
+		uint16_t cidx;
+		uint16_t flags;
+	};
+	uint64_t state;
+};
+
+enum {
+	IDLE = 0,	/* consumer ran to completion, nothing more to do. */
+	BUSY,		/* consumer is running already, or will be shortly. */
+	STALLED,	/* consumer stopped due to lack of resources. */
+	ABDICATED,	/* consumer stopped even though there was work to be
+			   done because it wants another thread to take over. */
+};
+
+static inline uint16_t
+space_available(struct mp_ring *r, union ring_state s)
+{
+	uint16_t x = r->size - 1;
+
+	if (s.cidx == s.pidx_head)
+		return (x);
+	else if (s.cidx > s.pidx_head)
+		return (s.cidx - s.pidx_head - 1);
+	else
+		return (x - s.pidx_head + s.cidx);
+}
+
+static inline uint16_t
+increment_idx(struct mp_ring *r, uint16_t idx, uint16_t n)
+{
+	int x = r->size - idx;
+
+	MPASS(x > 0);
+	return (x > n ? idx + n : n - x);
+}
+
+/* Consumer is about to update the ring's state to s */
+static inline uint16_t
+state_to_flags(union ring_state s, int abdicate)
+{
+
+	if (s.cidx == s.pidx_tail)
+		return (IDLE);
+	else if (abdicate && s.pidx_tail != s.pidx_head)
+		return (ABDICATED);
+
+	return (BUSY);
+}
+
+/*
+ * Caller passes in a state, with a guarantee that there is work to do and that
+ * all items up to the pidx_tail in the state are visible.
+ */
+static void
+drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget)
+{
+	union ring_state ns;
+	int n, pending, total;
+	uint16_t cidx = os.cidx;
+	uint16_t pidx = os.pidx_tail;
+
+	MPASS(os.flags == BUSY);
+	MPASS(cidx != pidx);
+
+	if (prev == IDLE)
+		counter_u64_add(r->starts, 1);
+	pending = 0;
+	total = 0;
+
+	while (cidx != pidx) {
+
+		/* Items from cidx to pidx are available for consumption. */
+		n = r->drain(r, cidx, pidx);
+		if (n == 0) {
+			critical_enter();
+			do {
+				os.state = ns.state = r->state;
+				ns.cidx = cidx;
+				ns.flags = STALLED;
+			} while (atomic_cmpset_64(&r->state, os.state,
+			    ns.state) == 0);
+			critical_exit();
+			if (prev != STALLED)
+				counter_u64_add(r->stalls, 1);
+			else if (total > 0) {
+				counter_u64_add(r->restarts, 1);
+				counter_u64_add(r->stalls, 1);
+			}
+			break;
+		}
+		cidx = increment_idx(r, cidx, n);
+		pending += n;
+		total += n;
+
+		/*
+		 * We update the cidx only if we've caught up with the pidx, the
+		 * real cidx is getting too far ahead of the one visible to
+		 * everyone else, or we have exceeded our budget.
+		 */
+		if (cidx != pidx && pending < 64 && total < budget)
+			continue;
+		critical_enter();
+		do {
+			os.state = ns.state = r->state;
+			ns.cidx = cidx;
+			ns.flags = state_to_flags(ns, total >= budget);
+		} while (atomic_cmpset_acq_64(&r->state, os.state, ns.state) == 0);
+		critical_exit();
+
+		if (ns.flags == ABDICATED)
+			counter_u64_add(r->abdications, 1);
+		if (ns.flags != BUSY) {
+			/* Wrong loop exit if we're going to stall. */
+			MPASS(ns.flags != STALLED);
+			if (prev == STALLED) {
+				MPASS(total > 0);
+				counter_u64_add(r->restarts, 1);
+			}
+			break;
+		}
+
+		/*
+		 * The acquire style atomic above guarantees visibility of items
+		 * associated with any pidx change that we notice here.
+		 */
+		pidx = ns.pidx_tail;
+		pending = 0;
+	}
+}
+
+int
+mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain,
+    ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
+{
+	struct mp_ring *r;
+
+	/* All idx are 16b so size can be 65536 at most */
+	if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
+	    can_drain == NULL)
+		return (EINVAL);
+	*pr = NULL;
+	flags &= M_NOWAIT | M_WAITOK;
+	MPASS(flags != 0);
+
+	r = malloc(__offsetof(struct mp_ring, items[size]), mt, flags | M_ZERO);
+	if (r == NULL)
+		return (ENOMEM);
+	r->size = size;
+	r->cookie = cookie;
+	r->mt = mt;
+	r->drain = drain;
+	r->can_drain = can_drain;
+	r->enqueues = counter_u64_alloc(flags);
+	r->drops = counter_u64_alloc(flags);
+	r->starts = counter_u64_alloc(flags);
+	r->stalls = counter_u64_alloc(flags);
+	r->restarts = counter_u64_alloc(flags);
+	r->abdications = counter_u64_alloc(flags);
+	if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
+	    r->stalls == NULL || r->restarts == NULL ||
+	    r->abdications == NULL) {
+		mp_ring_free(r);
+		return (ENOMEM);
+	}
+
+	*pr = r;
+	return (0);
+}
+
+void
+
+mp_ring_free(struct mp_ring *r)
+{
+
+	if (r == NULL)
+		return;
+
+	if (r->enqueues != NULL)
+		counter_u64_free(r->enqueues);
+	if (r->drops != NULL)
+		counter_u64_free(r->drops);
+	if (r->starts != NULL)
+		counter_u64_free(r->starts);
+	if (r->stalls != NULL)
+		counter_u64_free(r->stalls);
+	if (r->restarts != NULL)
+		counter_u64_free(r->restarts);
+	if (r->abdications != NULL)
+		counter_u64_free(r->abdications);
+
+	free(r, r->mt);
+}
+
+/*
+ * Enqueue n items and maybe drain the ring for some time.
+ *
+ * Returns an errno.
+ */
+int
+mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget)
+{
+	union ring_state os, ns;
+	uint16_t pidx_start, pidx_stop;
+	int i;
+
+	MPASS(items != NULL);
+	MPASS(n > 0);
+
+	/*
+	 * Reserve room for the new items.  Our reservation, if successful, is
+	 * from 'pidx_start' to 'pidx_stop'.
+	 */
+	for (;;) {
+		os.state = r->state;
+		if (n >= space_available(r, os)) {
+			counter_u64_add(r->drops, n);
+			MPASS(os.flags != IDLE);
+			if (os.flags == STALLED)
+				mp_ring_check_drainage(r, 0);
+			return (ENOBUFS);
+		}
+		ns.state = os.state;
+		ns.pidx_head = increment_idx(r, os.pidx_head, n);
+		critical_enter();
+		if (atomic_cmpset_64(&r->state, os.state, ns.state))
+			break;
+		critical_exit();
+		cpu_spinwait();
+	}
+	pidx_start = os.pidx_head;
+	pidx_stop = ns.pidx_head;
+
+	/*
+	 * Wait for other producers who got in ahead of us to enqueue their
+	 * items, one producer at a time.  It is our turn when the ring's
+	 * pidx_tail reaches the beginning of our reservation (pidx_start).
+	 */
+	while (ns.pidx_tail != pidx_start) {
+		cpu_spinwait();
+		ns.state = r->state;
+	}
+
+	/* Now it is our turn to fill up the area we reserved earlier. */
+	i = pidx_start;
+	do {
+		r->items[i] = *items++;
+		if (__predict_false(++i == r->size))
+			i = 0;
+	} while (i != pidx_stop);
+
+	/*
+	 * Update the ring's pidx_tail.  The release style atomic guarantees
+	 * that the items are visible to any thread that sees the updated pidx.
+	 */
+	do {
+		os.state = ns.state = r->state;
+		ns.pidx_tail = pidx_stop;
+		ns.flags = BUSY;
+	} while (atomic_cmpset_rel_64(&r->state, os.state, ns.state) == 0);
+	critical_exit();
+	counter_u64_add(r->enqueues, n);
+
+	/*
+	 * Turn into a consumer if some other thread isn't active as a consumer
+	 * already.
+	 */
+	if (os.flags != BUSY)
+		drain_ring(r, ns, os.flags, budget);
+
+	return (0);
+}
+
+void
+mp_ring_check_drainage(struct mp_ring *r, int budget)
+{
+	union ring_state os, ns;
+
+	os.state = r->state;
+	if (os.flags != STALLED || os.pidx_head != os.pidx_tail ||
+	    r->can_drain(r) == 0)
+		return;
+
+	MPASS(os.cidx != os.pidx_tail);	/* implied by STALLED */
+	ns.state = os.state;
+	ns.flags = BUSY;
+
+	/*
+	 * The acquire style atomic guarantees visibility of items associated
+	 * with the pidx that we read here.
+	 */
+	if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state))
+		return;
+
+	drain_ring(r, ns, os.flags, budget);
+}
+
+void
+mp_ring_reset_stats(struct mp_ring *r)
+{
+
+	counter_u64_zero(r->enqueues);
+	counter_u64_zero(r->drops);
+	counter_u64_zero(r->starts);
+	counter_u64_zero(r->stalls);
+	counter_u64_zero(r->restarts);
+	counter_u64_zero(r->abdications);
+}
+
+int
+mp_ring_is_idle(struct mp_ring *r)
+{
+	union ring_state s;
+
+	s.state = r->state;
+	if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx &&
+	    s.flags == IDLE)
+		return (1);
+
+	return (0);
+}
diff --git a/sys/dev/cxgbe/t4_mp_ring.h b/sys/dev/cxgbe/t4_mp_ring.h
new file mode 100644
index 0000000..c9ee346
--- /dev/null
+++ b/sys/dev/cxgbe/t4_mp_ring.h
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2014 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __CXGBE_MP_RING_H
+#define __CXGBE_MP_RING_H
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+struct mp_ring;
+typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int);
+typedef u_int (*ring_can_drain_t)(struct mp_ring *);
+
+struct mp_ring {
+	volatile uint64_t	state __aligned(CACHE_LINE_SIZE);
+
+	int			size __aligned(CACHE_LINE_SIZE);
+	void *			cookie;
+	struct malloc_type *	mt;
+	ring_drain_t		drain;
+	ring_can_drain_t	can_drain;	/* cheap, may be unreliable */
+	counter_u64_t		enqueues;
+	counter_u64_t		drops;
+	counter_u64_t		starts;
+	counter_u64_t		stalls;
+	counter_u64_t		restarts;	/* recovered after stalling */
+	counter_u64_t		abdications;
+
+	void * volatile		items[] __aligned(CACHE_LINE_SIZE);
+};
+
+int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t,
+    ring_can_drain_t, struct malloc_type *, int);
+void mp_ring_free(struct mp_ring *);
+int mp_ring_enqueue(struct mp_ring *, void **, int, int);
+void mp_ring_check_drainage(struct mp_ring *, int);
+void mp_ring_reset_stats(struct mp_ring *);
+int mp_ring_is_idle(struct mp_ring *);
+
+#endif
diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c
index 96e22cb..026b4ce 100644
--- a/sys/dev/cxgbe/t4_sge.c
+++ b/sys/dev/cxgbe/t4_sge.c
@@ -36,12 +36,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
-#include <sys/kdb.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
+#include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
@@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$");
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
+#include "t4_mp_ring.h"
 
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
@@ -147,19 +148,17 @@ TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
 static int safest_rx_cluster = PAGE_SIZE;
 TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
 
-/* Used to track coalesced tx work request */
 struct txpkts {
-	uint64_t *flitp;	/* ptr to flit where next pkt should start */
-	uint8_t npkt;		/* # of packets in this work request */
-	uint8_t nflits;		/* # of flits used by this work request */
-	uint16_t plen;		/* total payload (sum of all packets) */
+	u_int wr_type;		/* type 0 or type 1 */
+	u_int npkt;		/* # of packets in this work request */
+	u_int plen;		/* total payload (sum of all packets) */
+	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /* A packet's SGL.  This + m_pkthdr has all info needed for tx */
 struct sgl {
-	int nsegs;		/* # of segments in the SGL, 0 means imm. tx */
-	int nflits;		/* # of flits needed for the SGL */
-	bus_dma_segment_t seg[TX_SGL_SEGS];
+	struct sglist sg;
+	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
@@ -221,26 +220,31 @@ static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
 static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
-static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
-static int free_pkt_sgl(struct sge_txq *, struct sgl *);
-static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *,
-    struct sgl *);
-static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *,
-    struct mbuf *, struct sgl *);
-static void write_txpkts_wr(struct sge_txq *, struct txpkts *);
-static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *,
-    struct txpkts *, struct mbuf *, struct sgl *);
-static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *);
+static inline void get_pkt_gl(struct mbuf *, struct sglist *);
+static inline u_int txpkt_len16(u_int, u_int);
+static inline u_int txpkts0_len16(u_int);
+static inline u_int txpkts1_len16(void);
+static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
+    struct mbuf *, u_int);
+static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
+static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
+static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
+    struct mbuf *, const struct txpkts *, u_int);
+static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
-static inline void ring_eq_db(struct adapter *, struct sge_eq *);
-static inline int reclaimable(struct sge_eq *);
-static int reclaim_tx_descs(struct sge_txq *, int, int);
-static void write_eqflush_wr(struct sge_eq *);
-static __be64 get_flit(bus_dma_segment_t *, int, int);
+static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
+static inline uint16_t read_hw_cidx(struct sge_eq *);
+static inline u_int reclaimable_tx_desc(struct sge_eq *);
+static inline u_int total_available_tx_desc(struct sge_eq *);
+static u_int reclaim_tx_descs(struct sge_txq *, u_int);
+static void tx_reclaim(void *, int);
+static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
+static void wrq_tx_drain(void *, int);
+static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
@@ -1785,327 +1789,679 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 }
 
 /*
+ * Must drain the wrq or make sure that someone else will.
+ */
+static void
+wrq_tx_drain(void *arg, int n)
+{
+	struct sge_wrq *wrq = arg;
+	struct sge_eq *eq = &wrq->eq;
+
+	EQ_LOCK(eq);
+	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
+		drain_wrq_wr_list(wrq->adapter, wrq);
+	EQ_UNLOCK(eq);
+}
+
+static void
+drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
+{
+	struct sge_eq *eq = &wrq->eq;
+	u_int available, dbdiff;	/* # of hardware descriptors */
+	u_int n;			/* # of descriptors in current WR */
+	struct wrqe *wr;
+	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
+
+	EQ_LOCK_ASSERT_OWNED(eq);
+	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
+	wr = STAILQ_FIRST(&wrq->wr_list);
+	MPASS(wr != NULL);	/* Must be called with something useful to do */
+	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
+
+	do {
+		eq->cidx = read_hw_cidx(eq);
+		if (eq->pidx == eq->cidx)
+			available = eq->sidx - 1;
+		else
+			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
+
+		MPASS(wr->wrq == wrq);
+		n = howmany(wr->wr_len, EQ_ESIZE);
+		if (available < n)
+			break;	/* must still ring the doorbell below */
+
+		dst = (void *)&eq->desc[eq->pidx];
+		if (__predict_true(eq->sidx - eq->pidx > n)) {
+			/* Won't wrap, won't end exactly at the status page. */
+			bcopy(&wr->wr[0], dst, wr->wr_len);
+			eq->pidx += n;
+		} else {
+			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
+
+			bcopy(&wr->wr[0], dst, first_portion);
+			if (wr->wr_len > first_portion) {
+				bcopy(&wr->wr[first_portion], &eq->desc[0],
+				    wr->wr_len - first_portion);
+			}
+			eq->pidx = n - (eq->sidx - eq->pidx);
+		}
+
+		if (available < eq->sidx / 4 &&
+		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
+			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
+			    F_FW_WR_EQUEQ);
+			eq->equeqidx = eq->pidx;
+		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
+			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
+			eq->equeqidx = eq->pidx;
+		}
+
+		dbdiff += n;
+		if (dbdiff >= 16) {
+			ring_eq_db(sc, eq, dbdiff);
+			dbdiff = 0;
+		}
+
+		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
+		free_wrqe(wr);
+		MPASS(wrq->nwr_pending > 0);
+		wrq->nwr_pending--;
+		MPASS(wrq->ndesc_needed >= n);
+		wrq->ndesc_needed -= n;
+	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
+
+	if (dbdiff)
+		ring_eq_db(sc, eq, dbdiff);
+}
+
+/*
  * Doesn't fail.  Holds on to work requests it can't send right away.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
+#ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
-	int can_reclaim;
-	caddr_t dst;
+#endif
+
+	EQ_LOCK_ASSERT_OWNED(eq);
+	MPASS(wr != NULL);
+	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
+	MPASS((wr->wr_len & 0x7) == 0);
+
+	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
+	wrq->nwr_pending++;
+	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
+
+	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
+		return;	/* commit_wrq_wr will drain wr_list as well. */
+
+	drain_wrq_wr_list(sc, wrq);
+
+	/* Doorbell must have caught up to the pidx. */
+	MPASS(eq->pidx == eq->dbidx);
+}
 
-	TXQ_LOCK_ASSERT_OWNED(wrq);
+void
+t4_update_fl_bufsize(struct ifnet *ifp)
+{
+	struct port_info *pi = ifp->if_softc;
+	struct adapter *sc = pi->adapter;
+	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
-	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD ||
-	    (eq->flags & EQ_TYPEMASK) == EQ_CTRL,
-	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
-#else
-	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL,
-	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
+	struct sge_ofld_rxq *ofld_rxq;
 #endif
+	struct sge_fl *fl;
+	int i, maxp, mtu = ifp->if_mtu;
 
-	if (__predict_true(wr != NULL))
-		STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
+	maxp = mtu_to_max_payload(sc, mtu, 0);
+	for_each_rxq(pi, i, rxq) {
+		fl = &rxq->fl;
 
-	can_reclaim = reclaimable(eq);
-	if (__predict_false(eq->flags & EQ_STALLED)) {
-		if (eq->avail + can_reclaim < tx_resume_threshold(eq))
-			return;
-		eq->flags &= ~EQ_STALLED;
-		eq->unstalled++;
+		FL_LOCK(fl);
+		find_best_refill_source(sc, fl, maxp);
+		FL_UNLOCK(fl);
 	}
-	eq->cidx += can_reclaim;
-	eq->avail += can_reclaim;
-	if (__predict_false(eq->cidx >= eq->cap))
-		eq->cidx -= eq->cap;
+#ifdef TCP_OFFLOAD
+	maxp = mtu_to_max_payload(sc, mtu, 1);
+	for_each_ofld_rxq(pi, i, ofld_rxq) {
+		fl = &ofld_rxq->fl;
 
-	while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) {
-		int ndesc;
+		FL_LOCK(fl);
+		find_best_refill_source(sc, fl, maxp);
+		FL_UNLOCK(fl);
+	}
+#endif
+}
 
-		if (__predict_false(wr->wr_len < 0 ||
-		    wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) {
+static inline int
+mbuf_nsegs(struct mbuf *m)
+{
 
-#ifdef INVARIANTS
-			panic("%s: work request with length %d", __func__,
-			    wr->wr_len);
-#endif
-#ifdef KDB
-			kdb_backtrace();
-#endif
-			log(LOG_ERR, "%s: %s work request with length %d",
-			    device_get_nameunit(sc->dev), __func__, wr->wr_len);
-			STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
-			free_wrqe(wr);
-			continue;
-		}
+	M_ASSERTPKTHDR(m);
+	KASSERT(m->m_pkthdr.l5hlen > 0,
+	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
-		ndesc = howmany(wr->wr_len, EQ_ESIZE);
-		if (eq->avail < ndesc) {
-			wrq->no_desc++;
-			break;
-		}
+	return (m->m_pkthdr.l5hlen);
+}
 
-		dst = (void *)&eq->desc[eq->pidx];
-		copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len);
+static inline void
+set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.l5hlen = nsegs;
+}
 
-		eq->pidx += ndesc;
-		eq->avail -= ndesc;
-		if (__predict_false(eq->pidx >= eq->cap))
-			eq->pidx -= eq->cap;
+static inline int
+mbuf_len16(struct mbuf *m)
+{
+	int n;
 
-		eq->pending += ndesc;
-		if (eq->pending >= 8)
-			ring_eq_db(sc, eq);
+	M_ASSERTPKTHDR(m);
+	n = m->m_pkthdr.PH_loc.eight[0];
+	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
-		wrq->tx_wrs++;
-		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
-		free_wrqe(wr);
+	return (n);
+}
 
-		if (eq->avail < 8) {
-			can_reclaim = reclaimable(eq);
-			eq->cidx += can_reclaim;
-			eq->avail += can_reclaim;
-			if (__predict_false(eq->cidx >= eq->cap))
-				eq->cidx -= eq->cap;
-		}
-	}
+static inline void
+set_mbuf_len16(struct mbuf *m, uint8_t len16)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_loc.eight[0] = len16;
+}
+
+static inline int
+needs_tso(struct mbuf *m)
+{
 
-	if (eq->pending)
-		ring_eq_db(sc, eq);
+	M_ASSERTPKTHDR(m);
 
-	if (wr != NULL) {
-		eq->flags |= EQ_STALLED;
-		if (callout_pending(&eq->tx_callout) == 0)
-			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
+	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+		KASSERT(m->m_pkthdr.tso_segsz > 0,
+		    ("%s: TSO requested in mbuf %p but MSS not provided",
+		    __func__, m));
+		return (1);
 	}
+
+	return (0);
 }
 
-/* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
-#define TXPKTS_PKT_HDR ((\
-    sizeof(struct ulp_txpkt) + \
-    sizeof(struct ulptx_idata) + \
-    sizeof(struct cpl_tx_pkt_core) \
-    ) / 8)
-
-/* Header of a coalesced tx WR, before SGL of first packet (in flits) */
-#define TXPKTS_WR_HDR (\
-    sizeof(struct fw_eth_tx_pkts_wr) / 8 + \
-    TXPKTS_PKT_HDR)
-
-/* Header of a tx WR, before SGL of first packet (in flits) */
-#define TXPKT_WR_HDR ((\
-    sizeof(struct fw_eth_tx_pkt_wr) + \
-    sizeof(struct cpl_tx_pkt_core) \
-    ) / 8 )
-
-/* Header of a tx LSO WR, before SGL of first packet (in flits) */
-#define TXPKT_LSO_WR_HDR ((\
-    sizeof(struct fw_eth_tx_pkt_wr) + \
-    sizeof(struct cpl_tx_pkt_lso_core) + \
-    sizeof(struct cpl_tx_pkt_core) \
-    ) / 8 )
+static inline int
+needs_l3_csum(struct mbuf *m)
+{
 
-int
-t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m)
+	M_ASSERTPKTHDR(m);
+
+	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
+		return (1);
+	return (0);
+}
+
+static inline int
+needs_l4_csum(struct mbuf *m)
 {
-	struct port_info *pi = (void *)ifp->if_softc;
-	struct adapter *sc = pi->adapter;
-	struct sge_eq *eq = &txq->eq;
-	struct buf_ring *br = txq->br;
-	struct mbuf *next;
-	int rc, coalescing, can_reclaim;
-	struct txpkts txpkts;
-	struct sgl sgl;
 
-	TXQ_LOCK_ASSERT_OWNED(txq);
-	KASSERT(m, ("%s: called with nothing to do.", __func__));
-	KASSERT((eq->flags & EQ_TYPEMASK) == EQ_ETH,
-	    ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
+	M_ASSERTPKTHDR(m);
 
-	prefetch(&eq->desc[eq->pidx]);
-	prefetch(&txq->sdesc[eq->pidx]);
+	if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
+	    CSUM_TCP_IPV6 | CSUM_TSO))
+		return (1);
+	return (0);
+}
 
-	txpkts.npkt = 0;/* indicates there's nothing in txpkts */
-	coalescing = 0;
+static inline int
+needs_vlan_insertion(struct mbuf *m)
+{
 
-	can_reclaim = reclaimable(eq);
-	if (__predict_false(eq->flags & EQ_STALLED)) {
-		if (eq->avail + can_reclaim < tx_resume_threshold(eq)) {
-			txq->m = m;
-			return (0);
-		}
-		eq->flags &= ~EQ_STALLED;
-		eq->unstalled++;
-	}
+	M_ASSERTPKTHDR(m);
 
-	if (__predict_false(eq->flags & EQ_DOOMED)) {
-		m_freem(m);
-		while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
-			m_freem(m);
-		return (ENETDOWN);
+	if (m->m_flags & M_VLANTAG) {
+		KASSERT(m->m_pkthdr.ether_vtag != 0,
+		    ("%s: HWVLAN requested in mbuf %p but tag not provided",
+		    __func__, m));
+		return (1);
 	}
+	return (0);
+}
 
-	if (eq->avail < 8 && can_reclaim)
-		reclaim_tx_descs(txq, can_reclaim, 32);
+static void *
+m_advance(struct mbuf **pm, int *poffset, int len)
+{
+	struct mbuf *m = *pm;
+	int offset = *poffset;
+	uintptr_t p = 0;
 
-	for (; m; m = next ? next : drbr_dequeue(ifp, br)) {
+	MPASS(len > 0);
 
-		if (eq->avail < 8)
+	for (;;) {	/* len may reach 0 exactly at an mbuf boundary */
+		if (offset + len < m->m_len) {
+			offset += len;
+			p = mtod(m, uintptr_t) + offset;
 			break;
+		}
+		len -= m->m_len - offset;	/* use up the rest of this mbuf */
+		m = m->m_next;
+		offset = 0;
+		MPASS(m != NULL);
+	}
+	*poffset = offset;
+	*pm = m;
+	return ((void *)p);
+}
 
-		next = m->m_nextpkt;
-		m->m_nextpkt = NULL;
+static inline int
+same_paddr(char *a, char *b)
+{
 
-		if (next || buf_ring_peek(br))
-			coalescing = 1;
+	if (a == b)
+		return (1);
+	else if (a != NULL && b != NULL) {
+		vm_offset_t x = (vm_offset_t)a;
+		vm_offset_t y = (vm_offset_t)b;
 
-		rc = get_pkt_sgl(txq, &m, &sgl, coalescing);
-		if (rc != 0) {
-			if (rc == ENOMEM) {
+		if ((x & PAGE_MASK) == (y & PAGE_MASK) &&
+		    pmap_kextract(x) == pmap_kextract(y))
+			return (1);
+	}
 
-				/* Short of resources, suspend tx */
+	return (0);
+}
 
-				m->m_nextpkt = next;
-				break;
-			}
+/*
+ * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
+ * must have at least one mbuf that's not empty.
+ */
+static inline int
+count_mbuf_nsegs(struct mbuf *m)
+{
+	char *prev_end, *start;
+	int len, nsegs;
 
-			/*
-			 * Unrecoverable error for this packet, throw it away
-			 * and move on to the next.  get_pkt_sgl may already
-			 * have freed m (it will be NULL in that case and the
-			 * m_freem here is still safe).
-			 */
+	MPASS(m != NULL);
 
-			m_freem(m);
+	nsegs = 0;
+	prev_end = NULL;
+	for (; m; m = m->m_next) {
+
+		len = m->m_len;
+		if (__predict_false(len == 0))
 			continue;
-		}
+		start = mtod(m, char *);
+
+		nsegs += sglist_count(start, len);
+		if (same_paddr(prev_end, start))
+			nsegs--;
+		prev_end = start + len;
+	}
 
-		if (coalescing &&
-		    add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) {
+	MPASS(nsegs > 0);
+	return (nsegs);
+}
 
-			/* Successfully absorbed into txpkts */
+/*
+ * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
+ * a) caller can assume it's been freed if this function returns with an error.
+ * b) it may get defragged or pulled up to shorten the hardware gather list.
+ */
+int
+parse_pkt(struct mbuf **mp)
+{
+	struct mbuf *m0 = *mp, *m;
+	int rc, nsegs, defragged = 0, offset;
+	struct ether_header *eh;
+	void *l3hdr;
+#if defined(INET) || defined(INET6)
+	struct tcphdr *tcp;
+#endif
+	uint16_t eh_type;
 
-			write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl);
-			goto doorbell;
+	M_ASSERTPKTHDR(m0);
+	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
+		rc = EINVAL;
+fail:
+		m_freem(m0);
+		*mp = NULL;
+		return (rc);
+	}
+restart:
+	/*
+	 * First count the number of gather list segments in the payload.
+	 * Defrag the mbuf if nsegs exceeds the hardware limit.
+	 */
+	M_ASSERTPKTHDR(m0);
+	MPASS(m0->m_pkthdr.len > 0);
+	nsegs = count_mbuf_nsegs(m0);
+	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
+		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
+			rc = EFBIG;
+			goto fail;
 		}
+		*mp = m0 = m;	/* update caller's copy after defrag */
+		goto restart;
+	}
 
-		/*
-		 * We weren't coalescing to begin with, or current frame could
-		 * not be coalesced (add_to_txpkts flushes txpkts if a frame
-		 * given to it can't be coalesced).  Either way there should be
-		 * nothing in txpkts.
-		 */
-		KASSERT(txpkts.npkt == 0,
-		    ("%s: txpkts not empty: %d", __func__, txpkts.npkt));
+	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
+		m0 = m_pullup(m0, m0->m_pkthdr.len);
+		if (m0 == NULL) {
+			/* Should have left well enough alone. */
+			rc = EFBIG;
+			goto fail;
+		}
+		*mp = m0;	/* update caller's copy after pullup */
+		goto restart;
+	}
+	set_mbuf_nsegs(m0, nsegs);
+	set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
-		/* We're sending out individual packets now */
-		coalescing = 0;
+	if (!needs_tso(m0))
+		return (0);
 
-		if (eq->avail < 8)
-			reclaim_tx_descs(txq, 0, 8);
-		rc = write_txpkt_wr(pi, txq, m, &sgl);
-		if (rc != 0) {
+	m = m0;
+	eh = mtod(m, struct ether_header *);
+	eh_type = ntohs(eh->ether_type);
+	if (eh_type == ETHERTYPE_VLAN) {
+		struct ether_vlan_header *evh = (void *)eh;
 
-			/* Short of hardware descriptors, suspend tx */
+		eh_type = ntohs(evh->evl_proto);
+		m0->m_pkthdr.l2hlen = sizeof(*evh);
+	} else
+		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
-			/*
-			 * This is an unlikely but expensive failure.  We've
-			 * done all the hard work (DMA mappings etc.) and now we
-			 * can't send out the packet.  What's worse, we have to
-			 * spend even more time freeing up everything in sgl.
-			 */
-			txq->no_desc++;
-			free_pkt_sgl(txq, &sgl);
+	offset = 0;
+	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
-			m->m_nextpkt = next;
-			break;
-		}
+	switch (eh_type) {
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+	{
+		struct ip6_hdr *ip6 = l3hdr;
 
-		ETHER_BPF_MTAP(ifp, m);
-		if (sgl.nsegs == 0)
-			m_freem(m);
-doorbell:
-		if (eq->pending >= 8)
-			ring_eq_db(sc, eq);
+		MPASS(ip6->ip6_nxt == IPPROTO_TCP);
 
-		can_reclaim = reclaimable(eq);
-		if (can_reclaim >= 32)
-			reclaim_tx_descs(txq, can_reclaim, 64);
+		m0->m_pkthdr.l3hlen = sizeof(*ip6);
+		break;
 	}
+#endif
+#ifdef INET
+	case ETHERTYPE_IP:
+	{
+		struct ip *ip = l3hdr;
 
-	if (txpkts.npkt > 0)
-		write_txpkts_wr(txq, &txpkts);
+		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
+		break;
+	}
+#endif
+	default:
+		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
+		    " with the same INET/INET6 options as the kernel.",
+		    __func__, eh_type);
+	}
 
-	/*
-	 * m not NULL means there was an error but we haven't thrown it away.
-	 * This can happen when we're short of tx descriptors (no_desc) or maybe
-	 * even DMA maps (no_dmamap).  Either way, a credit flush and reclaim
-	 * will get things going again.
-	 */
-	if (m && !(eq->flags & EQ_CRFLUSHED)) {
-		struct tx_sdesc *txsd = &txq->sdesc[eq->pidx];
+#if defined(INET) || defined(INET6)
+	tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
+	m0->m_pkthdr.l4hlen = tcp->th_off * 4;
+#endif
+	MPASS(m0 == *mp);
+	return (0);
+}
 
-		/*
-		 * If EQ_CRFLUSHED is not set then we know we have at least one
-		 * available descriptor because any WR that reduces eq->avail to
-		 * 0 also sets EQ_CRFLUSHED.
-		 */
-		KASSERT(eq->avail > 0, ("%s: no space for eqflush.", __func__));
+void *
+start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
+{
+	struct sge_eq *eq = &wrq->eq;
+	struct adapter *sc = wrq->adapter;
+	int ndesc, available;
+	struct wrqe *wr;
+	void *w;
 
-		txsd->desc_used = 1;
-		txsd->credits = 0;
-		write_eqflush_wr(eq);
-	}
-	txq->m = m;
+	MPASS(len16 > 0);
+	ndesc = howmany(len16, EQ_ESIZE / 16);
+	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
-	if (eq->pending)
-		ring_eq_db(sc, eq);
+	EQ_LOCK(eq);
 
-	reclaim_tx_descs(txq, 0, 128);
+	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
+		drain_wrq_wr_list(sc, wrq);
 
-	if (eq->flags & EQ_STALLED && callout_pending(&eq->tx_callout) == 0)
-		callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
+	if (!STAILQ_EMPTY(&wrq->wr_list)) {
+slowpath:
+		EQ_UNLOCK(eq);
+		wr = alloc_wrqe(len16 * 16, wrq);
+		if (__predict_false(wr == NULL))
+			return (NULL);
+		cookie->pidx = -1;	/* tells commit_wrq_wr this is a wrqe */
+		cookie->ndesc = ndesc;
+		return (&wr->wr);
+	}
 
-	return (0);
+	eq->cidx = read_hw_cidx(eq);
+	if (eq->pidx == eq->cidx)
+		available = eq->sidx - 1;
+	else
+		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
+	if (available < ndesc)
+		goto slowpath;
+
+	cookie->pidx = eq->pidx;
+	cookie->ndesc = ndesc;
+	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
+
+	w = &eq->desc[eq->pidx];
+	IDXINCR(eq->pidx, ndesc, eq->sidx);
+	if (__predict_false(cookie->pidx + ndesc > eq->sidx)) {
+		w = &wrq->ss[0];	/* WR wraps: build it in scratch */
+		wrq->ss_pidx = cookie->pidx;
+		wrq->ss_len = len16 * 16;
+	}
+
+	EQ_UNLOCK(eq);
+
+	return (w);
 }
 
 void
-t4_update_fl_bufsize(struct ifnet *ifp)
+commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
-	struct port_info *pi = ifp->if_softc;
-	struct adapter *sc = pi->adapter;
-	struct sge_rxq *rxq;
-#ifdef TCP_OFFLOAD
-	struct sge_ofld_rxq *ofld_rxq;
-#endif
-	struct sge_fl *fl;
-	int i, maxp, mtu = ifp->if_mtu;
+	struct sge_eq *eq = &wrq->eq;
+	struct adapter *sc = wrq->adapter;
+	int ndesc, pidx;
+	struct wrq_cookie *prev, *next;
 
-	maxp = mtu_to_max_payload(sc, mtu, 0);
-	for_each_rxq(pi, i, rxq) {
-		fl = &rxq->fl;
+	if (cookie->pidx == -1) {
+		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
-		FL_LOCK(fl);
-		find_best_refill_source(sc, fl, maxp);
-		FL_UNLOCK(fl);
+		t4_wrq_tx(sc, wr);
+		return;
 	}
-#ifdef TCP_OFFLOAD
-	maxp = mtu_to_max_payload(sc, mtu, 1);
-	for_each_ofld_rxq(pi, i, ofld_rxq) {
-		fl = &ofld_rxq->fl;
 
-		FL_LOCK(fl);
-		find_best_refill_source(sc, fl, maxp);
-		FL_UNLOCK(fl);
+	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
+	pidx = cookie->pidx;
+	MPASS(pidx >= 0 && pidx < eq->sidx);
+	if (__predict_false(w == &wrq->ss[0])) {
+		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
+
+		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
+		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
+		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
+		wrq->tx_wrs_ss++;
+	} else
+		wrq->tx_wrs_direct++;
+
+	EQ_LOCK(eq);
+	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
+	next = TAILQ_NEXT(cookie, link);
+	if (prev == NULL) {
+		MPASS(pidx == eq->dbidx);
+		if (next == NULL || ndesc >= 16)
+			ring_eq_db(wrq->adapter, eq, ndesc);
+		else {
+			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
+			next->pidx = pidx;
+			next->ndesc += ndesc;
+		}
+	} else {
+		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
+		prev->ndesc += ndesc;
+	}
+	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
+
+	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
+		drain_wrq_wr_list(sc, wrq);
+
+#ifdef INVARIANTS
+	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
+		/* Doorbell must have caught up to the pidx. */
+		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
+	EQ_UNLOCK(eq);
 }
 
-int
-can_resume_tx(struct sge_eq *eq)
-{
+static u_int
+can_resume_eth_tx(struct mp_ring *r)
+{
+	struct sge_eq *eq = r->cookie;
+
+	return (total_available_tx_desc(eq) > eq->sidx / 8);
+}
+
+static inline int
+cannot_use_txpkts(struct mbuf *m)
+{
+	/* maybe put a GL limit too, to avoid silliness? */
+
+	return (needs_tso(m));
+}
+
+/*
+ * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
+ * be consumed.  Return the actual number consumed.  0 indicates a stall.
+ */
+static u_int
+eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
+{
+	struct sge_txq *txq = r->cookie;
+	struct sge_eq *eq = &txq->eq;
+	struct ifnet *ifp = txq->ifp;
+	struct port_info *pi = (void *)ifp->if_softc;
+	struct adapter *sc = pi->adapter;
+	u_int total, remaining;		/* # of packets */
+	u_int available, dbdiff;	/* # of hardware descriptors */
+	u_int n, next_cidx;
+	struct mbuf *m0, *tail;
+	struct txpkts txp;
+	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
+
+	remaining = IDXDIFF(pidx, cidx, r->size);
+	MPASS(remaining > 0);	/* Must not be called without work to do. */
+	total = 0;
+
+	TXQ_LOCK(txq);
+	if (__predict_false((eq->flags & EQ_ENABLED) == 0)) {
+		while (cidx != pidx) {
+			m0 = r->items[cidx];
+			m_freem(m0);
+			if (++cidx == r->size)
+				cidx = 0;
+		}
+		reclaim_tx_descs(txq, 2048);
+		total = remaining;
+		goto done;
+	}
+
+	/* How many hardware descriptors do we have readily available. */
+	if (eq->pidx == eq->cidx)
+		available = eq->sidx - 1;
+	else
+		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
+	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
+
+	while (remaining > 0) {
+
+		m0 = r->items[cidx];
+		M_ASSERTPKTHDR(m0);
+		MPASS(m0->m_nextpkt == NULL);
+
+		if (available < SGE_MAX_WR_NDESC) {
+			available += reclaim_tx_descs(txq, 64);
+			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
+				break;	/* out of descriptors */
+		}
+
+		next_cidx = cidx + 1;
+		if (__predict_false(next_cidx == r->size))
+			next_cidx = 0;
+
+		wr = (void *)&eq->desc[eq->pidx];
+		if (remaining > 1 &&
+		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
+
+			/* pkts at cidx, next_cidx should both be in txp. */
+			MPASS(txp.npkt == 2);
+			tail = r->items[next_cidx];
+			MPASS(tail->m_nextpkt == NULL);
+			ETHER_BPF_MTAP(ifp, m0);
+			ETHER_BPF_MTAP(ifp, tail);
+			m0->m_nextpkt = tail;
+
+			if (__predict_false(++next_cidx == r->size))
+				next_cidx = 0;
+
+			while (next_cidx != pidx) {
+				if (add_to_txpkts(r->items[next_cidx], &txp,
+				    available) != 0)
+					break;
+				tail->m_nextpkt = r->items[next_cidx];
+				tail = tail->m_nextpkt;
+				ETHER_BPF_MTAP(ifp, tail);
+				if (__predict_false(++next_cidx == r->size))
+					next_cidx = 0;
+			}
+
+			n = write_txpkts_wr(txq, wr, m0, &txp, available);
+			total += txp.npkt;
+			remaining -= txp.npkt;
+		} else {
+			total++;
+			remaining--;
+			n = write_txpkt_wr(txq, (void *)wr, m0, available);
+			ETHER_BPF_MTAP(ifp, m0);
+		}
+		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
+
+		available -= n;
+		dbdiff += n;
+		IDXINCR(eq->pidx, n, eq->sidx);
+
+		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
+		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
+			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
+			    F_FW_WR_EQUEQ);
+			eq->equeqidx = eq->pidx;
+		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
+			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
+			eq->equeqidx = eq->pidx;
+		}
+
+		if (dbdiff >= 16 && remaining >= 4) {
+			ring_eq_db(sc, eq, dbdiff);
+			available += reclaim_tx_descs(txq, 4 * dbdiff);
+			dbdiff = 0;
+		}
+
+		cidx = next_cidx;
+	}
+	if (dbdiff != 0) {
+		ring_eq_db(sc, eq, dbdiff);
+		reclaim_tx_descs(txq, 32);
+	}
+done:
+	TXQ_UNLOCK(txq);
 
-	return (eq->avail + reclaimable(eq) >= tx_resume_threshold(eq));
+	return (total);
 }
 
 static inline void
@@ -2155,11 +2511,8 @@ init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan,
 	eq->flags = eqtype & EQ_TYPEMASK;
 	eq->tx_chan = tx_chan;
 	eq->iqid = iqid;
-	eq->qsize = qsize;
+	eq->sidx = qsize - spg_len / EQ_ESIZE;
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
-
-	TASK_INIT(&eq->tx_task, 0, t4_tx_task, eq);
-	callout_init(&eq->tx_callout, CALLOUT_MPSAFE);
 }
 
 static int
@@ -2848,6 +3201,7 @@ ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
+	int qsize = eq->sidx + spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
@@ -2856,17 +3210,16 @@ ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
-	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); /* XXX */
+	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
-	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
+	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
-		V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
-		V_FW_EQ_CTRL_CMD_EQSIZE(eq->qsize));
+		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
@@ -2892,6 +3245,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
+	int qsize = eq->sidx + spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
@@ -2900,15 +3254,15 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
-	c.autoequiqe_to_viid = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid));
+	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
+	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(pi->viid));
 	c.fetchszm_to_iqid =
-	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
+	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
-		      V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
-		      V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
-		      V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize));
+	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
+	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
@@ -2935,6 +3289,7 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
+	int qsize = eq->sidx + spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
@@ -2944,14 +3299,13 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
-		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) |
+		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
-		V_FW_EQ_OFLD_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) |
-		V_FW_EQ_OFLD_CMD_EQSIZE(eq->qsize));
+		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
@@ -2976,21 +3330,20 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 static int
 alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 {
-	int rc;
+	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
-	len = eq->qsize * EQ_ESIZE;
+	qsize = eq->sidx + spg_len / EQ_ESIZE;
+	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
-	eq->cap = eq->qsize - spg_len / EQ_ESIZE;
-	eq->spg = (void *)&eq->desc[eq->cap];
-	eq->avail = eq->cap - 1;	/* one less to avoid cidx = pidx */
 	eq->pidx = eq->cidx = 0;
+	eq->equeqidx = eq->dbidx = 0;
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
@@ -3018,8 +3371,6 @@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
 		    eq->flags & EQ_TYPEMASK, rc);
 	}
 
-	eq->tx_callout.c_cpu = eq->cntxt_id % mp_ncpus;
-
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
@@ -3101,7 +3452,11 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq,
 		return (rc);
 
 	wrq->adapter = sc;
+	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
+	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
+	wrq->nwr_pending = 0;
+	wrq->ndesc_needed = 0;
 
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
@@ -3111,13 +3466,10 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq,
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
 	    "producer index");
-	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs", CTLFLAG_RD,
-	    &wrq->tx_wrs, "# of work requests");
-	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
-	    &wrq->no_desc, 0,
-	    "# of times queue ran out of hardware descriptors");
-	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD,
-	    &wrq->eq.unstalled, 0, "# of times queue recovered after stall");
+	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
+	    &wrq->tx_wrs_direct, "# of work requests (direct)");
+	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
+	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 
 	return (rc);
 }
@@ -3145,37 +3497,30 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx,
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
-	rc = alloc_eq(sc, pi, eq);
-	if (rc)
-		return (rc);
-
-	txq->ifp = pi->ifp;
-
-	txq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE,
-	    M_ZERO | M_WAITOK);
-	txq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock);
-
-	rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR,
-	    BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS,
-	    BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &txq->tx_tag);
+	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
+	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
-		device_printf(sc->dev,
-		    "failed to create tx DMA tag: %d\n", rc);
+		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
-	/*
-	 * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE
-	 * limit for any WR).  txq->no_dmamap events shouldn't occur if maps is
-	 * sized for the worst case.
-	 */
-	rc = t4_alloc_tx_maps(&txq->txmaps, txq->tx_tag, eq->qsize * 10 / 8,
-	    M_WAITOK);
+	rc = alloc_eq(sc, pi, eq);
 	if (rc != 0) {
-		device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc);
+		mp_ring_free(txq->r);
+		txq->r = NULL;
 		return (rc);
 	}
 
+	/* Can't fail after this point. */
+
+	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
+	txq->ifp = pi->ifp;
+	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
+	txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
+	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf));
+	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
+	    M_ZERO | M_WAITOK);
+
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "tx queue");
@@ -3203,23 +3548,39 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
-	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD,
-	    &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)");
-	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD,
-	    &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests");
-
-	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "br_drops", CTLFLAG_RD,
-	    &txq->br->br_drops, "# of drops in the buf_ring for this queue");
-	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD,
-	    &txq->no_dmamap, 0, "# of times txq ran out of DMA maps");
-	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD,
-	    &txq->no_desc, 0, "# of times txq ran out of hardware descriptors");
-	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD,
-	    &eq->egr_update, 0, "egress update notifications from the SGE");
-	SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD,
-	    &eq->unstalled, 0, "# of times txq recovered after stall");
+	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_wrs",
+	    CTLFLAG_RD, &txq->txpkts0_wrs,
+	    "# of txpkts (type 0) work requests");
+	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_wrs",
+	    CTLFLAG_RD, &txq->txpkts1_wrs,
+	    "# of txpkts (type 1) work requests");
+	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_pkts",
+	    CTLFLAG_RD, &txq->txpkts0_pkts,
+	    "# of frames tx'd using type0 txpkts work requests");
+	SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_pkts",
+	    CTLFLAG_RD, &txq->txpkts1_pkts,
+	    "# of frames tx'd using type1 txpkts work requests");
+
+	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_enqueues",
+	    CTLFLAG_RD, &txq->r->enqueues,
+	    "# of enqueues to the mp_ring for this queue");
+	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_drops",
+	    CTLFLAG_RD, &txq->r->drops,
+	    "# of drops in the mp_ring for this queue");
+	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_starts",
+	    CTLFLAG_RD, &txq->r->starts,
+	    "# of normal consumer starts in the mp_ring for this queue");
+	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_stalls",
+	    CTLFLAG_RD, &txq->r->stalls,
+	    "# of consumer stalls in the mp_ring for this queue");
+	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_restarts",
+	    CTLFLAG_RD, &txq->r->restarts,
+	    "# of consumer restarts in the mp_ring for this queue");
+	SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_abdications",
+	    CTLFLAG_RD, &txq->r->abdications,
+	    "# of consumer abdications in the mp_ring for this queue");
 
-	return (rc);
+	return (0);
 }
 
 static int
@@ -3233,15 +3594,9 @@ free_txq(struct port_info *pi, struct sge_txq *txq)
 	if (rc)
 		return (rc);
 
+	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
-
-	if (txq->txmaps.maps)
-		t4_free_tx_maps(&txq->txmaps, txq->tx_tag);
-
-	buf_ring_free(txq->br, M_CXGBE);
-
-	if (txq->tx_tag)
-		bus_dma_tag_destroy(txq->tx_tag);
+	mp_ring_free(txq->r);
 
 	bzero(txq, sizeof(*txq));
 	return (0);
@@ -3466,293 +3821,159 @@ free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 	fl->sdesc = NULL;
 }
 
-int
-t4_alloc_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag, int count,
-    int flags)
+static inline void
+get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
-	struct tx_map *txm;
-	int i, rc;
-
-	txmaps->map_total = txmaps->map_avail = count;
-	txmaps->map_cidx = txmaps->map_pidx = 0;
-
-	txmaps->maps = malloc(count * sizeof(struct tx_map), M_CXGBE,
-	    M_ZERO | flags);
+	int rc;
 
-	txm = txmaps->maps;
-	for (i = 0; i < count; i++, txm++) {
-		rc = bus_dmamap_create(tx_tag, 0, &txm->map);
-		if (rc != 0)
-			goto failed;
-	}
+	M_ASSERTPKTHDR(m);
 
-	return (0);
-failed:
-	while (--i >= 0) {
-		txm--;
-		bus_dmamap_destroy(tx_tag, txm->map);
+	sglist_reset(gl);
+	rc = sglist_append_mbuf(gl, m);
+	if (__predict_false(rc != 0)) {
+		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
+		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
-	KASSERT(txm == txmaps->maps, ("%s: EDOOFUS", __func__));
 
-	free(txmaps->maps, M_CXGBE);
-	txmaps->maps = NULL;
-
-	return (rc);
+	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
+	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
+	    mbuf_nsegs(m), gl->sg_nseg));
+	KASSERT(gl->sg_nseg > 0 &&
+	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
+	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
+		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
-void
-t4_free_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag)
+/*
+ * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
+ */
+static inline u_int
+txpkt_len16(u_int nsegs, u_int tso)
 {
-	struct tx_map *txm;
-	int i;
+	u_int n;
 
-	txm = txmaps->maps;
-	for (i = 0; i < txmaps->map_total; i++, txm++) {
-
-		if (txm->m) {
-			bus_dmamap_unload(tx_tag, txm->map);
-			m_freem(txm->m);
-			txm->m = NULL;
-		}
+	MPASS(nsegs > 0);
 
-		bus_dmamap_destroy(tx_tag, txm->map);
-	}
+	nsegs--; /* first segment is part of ulptx_sgl */
+	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
+	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
+	if (tso)
+		n += sizeof(struct cpl_tx_pkt_lso_core);
 
-	free(txmaps->maps, M_CXGBE);
-	txmaps->maps = NULL;
+	return (howmany(n, 16));
 }
 
 /*
- * We'll do immediate data tx for non-TSO, but only when not coalescing.  We're
- * willing to use upto 2 hardware descriptors which means a maximum of 96 bytes
- * of immediate data.
- */
-#define IMM_LEN ( \
-      2 * EQ_ESIZE \
-    - sizeof(struct fw_eth_tx_pkt_wr) \
-    - sizeof(struct cpl_tx_pkt_core))
-
-/*
- * Returns non-zero on failure, no need to cleanup anything in that case.
- *
- * Note 1: We always try to defrag the mbuf if required and return EFBIG only
- * if the resulting chain still won't fit in a tx descriptor.
- *
- * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf
- * does not have the TCP header in it.
+ * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
+ * request header.
  */
-static int
-get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl,
-    int sgl_only)
+static inline u_int
+txpkts0_len16(u_int nsegs)
 {
-	struct mbuf *m = *fp;
-	struct tx_maps *txmaps;
-	struct tx_map *txm;
-	int rc, defragged = 0, n;
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-
-	if (m->m_pkthdr.tso_segsz)
-		sgl_only = 1;	/* Do not allow immediate data with LSO */
-
-start:	sgl->nsegs = 0;
-
-	if (m->m_pkthdr.len <= IMM_LEN && !sgl_only)
-		return (0);	/* nsegs = 0 tells caller to use imm. tx */
-
-	txmaps = &txq->txmaps;
-	if (txmaps->map_avail == 0) {
-		txq->no_dmamap++;
-		return (ENOMEM);
-	}
-	txm = &txmaps->maps[txmaps->map_pidx];
-
-	if (m->m_pkthdr.tso_segsz && m->m_len < 50) {
-		*fp = m_pullup(m, 50);
-		m = *fp;
-		if (m == NULL)
-			return (ENOBUFS);
-	}
-
-	rc = bus_dmamap_load_mbuf_sg(txq->tx_tag, txm->map, m, sgl->seg,
-	    &sgl->nsegs, BUS_DMA_NOWAIT);
-	if (rc == EFBIG && defragged == 0) {
-		m = m_defrag(m, M_NOWAIT);
-		if (m == NULL)
-			return (EFBIG);
-
-		defragged = 1;
-		*fp = m;
-		goto start;
-	}
-	if (rc != 0)
-		return (rc);
-
-	txm->m = m;
-	txmaps->map_avail--;
-	if (++txmaps->map_pidx == txmaps->map_total)
-		txmaps->map_pidx = 0;
+	u_int n;
 
-	KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS,
-	    ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs));
+	MPASS(nsegs > 0);
 
-	/*
-	 * Store the # of flits required to hold this frame's SGL in nflits.  An
-	 * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by
-	 * multiple (len0 + len1, addr0, addr1) tuples.  If addr1 is not used
-	 * then len1 must be set to 0.
-	 */
-	n = sgl->nsegs - 1;
-	sgl->nflits = (3 * n) / 2 + (n & 1) + 2;
+	nsegs--; /* first segment is part of ulptx_sgl */
+	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
+	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
+	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
 
-	return (0);
+	return (howmany(n, 16));
 }
 
-
 /*
- * Releases all the txq resources used up in the specified sgl.
+ * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
+ * request header.
  */
-static int
-free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl)
+static inline u_int
+txpkts1_len16(void)
 {
-	struct tx_maps *txmaps;
-	struct tx_map *txm;
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-
-	if (sgl->nsegs == 0)
-		return (0);	/* didn't use any map */
+	u_int n;
 
-	txmaps = &txq->txmaps;
+	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 
-	/* 1 pkt uses exactly 1 map, back it out */
+	return (howmany(n, 16));
+}
 
-	txmaps->map_avail++;
-	if (txmaps->map_pidx > 0)
-		txmaps->map_pidx--;
-	else
-		txmaps->map_pidx = txmaps->map_total - 1;
+static inline u_int
+imm_payload(u_int ndesc)
+{
+	u_int n;
 
-	txm = &txmaps->maps[txmaps->map_pidx];
-	bus_dmamap_unload(txq->tx_tag, txm->map);
-	txm->m = NULL;
+	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
+	    sizeof(struct cpl_tx_pkt_core);
 
-	return (0);
+	return (n);
 }
 
-static int
-write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m,
-    struct sgl *sgl)
+/*
+ * Write a txpkt WR for this packet to the hardware descriptors, update the
+ * software descriptor, and advance the pidx.  It is guaranteed that enough
+ * descriptors are available.
+ *
+ * The return value is the # of hardware descriptors used.
+ */
+static u_int
+write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
+    struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
-	struct fw_eth_tx_pkt_wr *wr;
+	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
-	int nflits, ndesc, pktlen;
-	struct tx_sdesc *txsd;
+	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
+	M_ASSERTPKTHDR(m0);
+	MPASS(available > 0 && available < eq->sidx);
 
-	pktlen = m->m_pkthdr.len;
-
-	/*
-	 * Do we have enough flits to send this frame out?
-	 */
+	len16 = mbuf_len16(m0);
+	nsegs = mbuf_nsegs(m0);
+	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
-	if (m->m_pkthdr.tso_segsz) {
-		nflits = TXPKT_LSO_WR_HDR;
+	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
-	} else
-		nflits = TXPKT_WR_HDR;
-	if (sgl->nsegs > 0)
-		nflits += sgl->nflits;
-	else {
-		nflits += howmany(pktlen, 8);
+	else if (pktlen <= imm_payload(2) && available >= 2) {
+		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
+		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
+		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
+		nsegs = 0;
 	}
-	ndesc = howmany(nflits, 8);
-	if (ndesc > eq->avail)
-		return (ENOMEM);
+	ndesc = howmany(len16, EQ_ESIZE / 16);
+	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
-	wr = (void *)&eq->desc[eq->pidx];
+	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
-	ctrl = V_FW_WR_LEN16(howmany(nflits, 2));
-	if (eq->avail == ndesc) {
-		if (!(eq->flags & EQ_CRFLUSHED)) {
-			ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
-			eq->flags |= EQ_CRFLUSHED;
-		}
-		eq->flags |= EQ_STALLED;
-	}
 
+	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
-	if (m->m_pkthdr.tso_segsz) {
+	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
-		struct ether_header *eh;
-		void *l3hdr;
-#if defined(INET) || defined(INET6)
-		struct tcphdr *tcp;
-#endif
-		uint16_t eh_type;
-
-		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
-		    F_LSO_LAST_SLICE;
 
-		eh = mtod(m, struct ether_header *);
-		eh_type = ntohs(eh->ether_type);
-		if (eh_type == ETHERTYPE_VLAN) {
-			struct ether_vlan_header *evh = (void *)eh;
+		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
+		    m0->m_pkthdr.l4hlen > 0,
+		    ("%s: mbuf %p needs TSO but missing header lengths",
+			__func__, m0));
 
+		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
+		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
+		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
+		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
-			l3hdr = evh + 1;
-			eh_type = ntohs(evh->evl_proto);
-		} else
-			l3hdr = eh + 1;
-
-		switch (eh_type) {
-#ifdef INET6
-		case ETHERTYPE_IPV6:
-		{
-			struct ip6_hdr *ip6 = l3hdr;
-
-			/*
-			 * XXX-BZ For now we do not pretend to support
-			 * IPv6 extension headers.
-			 */
-			KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO "
-			    "with ip6_nxt != TCP: %u", __func__, ip6->ip6_nxt));
-			tcp = (struct tcphdr *)(ip6 + 1);
+		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
-			ctrl |= V_LSO_IPHDR_LEN(sizeof(*ip6) >> 2) |
-			    V_LSO_TCPHDR_LEN(tcp->th_off);
-			break;
-		}
-#endif
-#ifdef INET
-		case ETHERTYPE_IP:
-		{
-			struct ip *ip = l3hdr;
-
-			tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4);
-			ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) |
-			    V_LSO_TCPHDR_LEN(tcp->th_off);
-			break;
-		}
-#endif
-		default:
-			panic("%s: CSUM_TSO but no supported IP version "
-			    "(0x%04x)", __func__, eh_type);
-		}
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
-		lso->mss = htobe16(m->m_pkthdr.tso_segsz);
+		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
@@ -3764,48 +3985,36 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m,
 
 	/* Checksum offload */
 	ctrl1 = 0;
-	if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)))
+	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
-	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
-	    CSUM_TCP_IPV6 | CSUM_TSO)))
+	if (needs_l4_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
-	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
+	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
-	if (m->m_flags & M_VLANTAG) {
-		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
+	if (needs_vlan_insertion(m0)) {
+		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
-	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
-	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
+	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
-	/* Software descriptor */
-	txsd = &txq->sdesc[eq->pidx];
-	txsd->desc_used = ndesc;
-
-	eq->pending += ndesc;
-	eq->avail -= ndesc;
-	eq->pidx += ndesc;
-	if (eq->pidx >= eq->cap)
-		eq->pidx -= eq->cap;
-
 	/* SGL */
 	dst = (void *)(cpl + 1);
-	if (sgl->nsegs > 0) {
-		txsd->credits = 1;
+	if (nsegs > 0) {
+
+		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
-		write_sgl_to_txd(eq, sgl, &dst);
 	} else {
-		txsd->credits = 0;
-		txq->imm_wrs++;
-		for (; m; m = m->m_next) {
+		struct mbuf *m;
+
+		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
@@ -3814,245 +4023,225 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m,
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
-
+		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
-	return (0);
+
+	txsd = &txq->sdesc[eq->pidx];
+	txsd->m = m0;
+	txsd->desc_used = ndesc;
+
+	return (ndesc);
 }
 
-/*
- * Returns 0 to indicate that m has been accepted into a coalesced tx work
- * request.  It has either been folded into txpkts or txpkts was flushed and m
- * has started a new coalesced work request (as the first frame in a fresh
- * txpkts).
- *
- * Returns non-zero to indicate a failure - caller is responsible for
- * transmitting m, if there was anything in txpkts it has been flushed.
- */
 static int
-add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts,
-    struct mbuf *m, struct sgl *sgl)
+try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
-	struct sge_eq *eq = &txq->eq;
-	int can_coalesce;
-	struct tx_sdesc *txsd;
-	int flits;
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-
-	KASSERT(sgl->nsegs, ("%s: can't coalesce imm data", __func__));
+	u_int needed, nsegs1, nsegs2, l1, l2;
 
-	if (txpkts->npkt > 0) {
-		flits = TXPKTS_PKT_HDR + sgl->nflits;
-		can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
-		    txpkts->nflits + flits <= TX_WR_FLITS &&
-		    txpkts->nflits + flits <= eq->avail * 8 &&
-		    txpkts->plen + m->m_pkthdr.len < 65536;
+	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
+		return (1);
 
-		if (can_coalesce) {
-			txpkts->npkt++;
-			txpkts->nflits += flits;
-			txpkts->plen += m->m_pkthdr.len;
+	nsegs1 = mbuf_nsegs(m);
+	nsegs2 = mbuf_nsegs(n);
+	if (nsegs1 + nsegs2 == 2) {
+		txp->wr_type = 1;
+		l1 = l2 = txpkts1_len16();
+	} else {
+		txp->wr_type = 0;
+		l1 = txpkts0_len16(nsegs1);
+		l2 = txpkts0_len16(nsegs2);
+	}
+	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
+	needed = howmany(txp->len16, EQ_ESIZE / 16);
+	if (needed > SGE_MAX_WR_NDESC || needed > available)
+		return (1);
 
-			txsd = &txq->sdesc[eq->pidx];
-			txsd->credits++;
+	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
+	if (txp->plen > 65535)
+		return (1);
 
-			return (0);
-		}
+	txp->npkt = 2;
+	set_mbuf_len16(m, l1);
+	set_mbuf_len16(n, l2);
 
-		/*
-		 * Couldn't coalesce m into txpkts.  The first order of business
-		 * is to send txpkts on its way.  Then we'll revisit m.
-		 */
-		write_txpkts_wr(txq, txpkts);
-	}
+	return (0);
+}
 
-	/*
-	 * Check if we can start a new coalesced tx work request with m as
-	 * the first packet in it.
-	 */
+static int
+add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
+{
+	u_int plen, len16, needed, nsegs;
 
-	KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__));
+	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
-	flits = TXPKTS_WR_HDR + sgl->nflits;
-	can_coalesce = m->m_pkthdr.tso_segsz == 0 &&
-	    flits <= eq->avail * 8 && flits <= TX_WR_FLITS;
+	nsegs = mbuf_nsegs(m);
+	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
+		return (1);
 
-	if (can_coalesce == 0)
-		return (EINVAL);
+	plen = txp->plen + m->m_pkthdr.len;
+	if (plen > 65535)
+		return (1);
 
-	/*
-	 * Start a fresh coalesced tx WR with m as the first frame in it.
-	 */
-	txpkts->npkt = 1;
-	txpkts->nflits = flits;
-	txpkts->flitp = &eq->desc[eq->pidx].flit[2];
-	txpkts->plen = m->m_pkthdr.len;
+	if (txp->wr_type == 0)
+		len16 = txpkts0_len16(nsegs);
+	else
+		len16 = txpkts1_len16();
+	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
+	if (needed > SGE_MAX_WR_NDESC || needed > available)
+		return (1);
 
-	txsd = &txq->sdesc[eq->pidx];
-	txsd->credits = 1;
+	txp->npkt++;
+	txp->plen = plen;
+	txp->len16 += len16;
+	set_mbuf_len16(m, len16);
 
 	return (0);
 }
 
 /*
- * Note that write_txpkts_wr can never run out of hardware descriptors (but
- * write_txpkt_wr can).  add_to_txpkts ensures that a frame is accepted for
- * coalescing only if sufficient hardware descriptors are available.
+ * Write a txpkts WR for the packets in txp to the hardware descriptors, update
+ * the software descriptor, and advance the pidx.  It is guaranteed that enough
+ * descriptors are available.
+ *
+ * The return value is the # of hardware descriptors used.
  */
-static void
-write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts)
+static u_int
+write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
+    struct mbuf *m0, const struct txpkts *txp, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
-	struct fw_eth_tx_pkts_wr *wr;
 	struct tx_sdesc *txsd;
+	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
-	int ndesc;
+	uint64_t ctrl1;
+	int ndesc, checkwrap;
+	struct mbuf *m;
+	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
+	MPASS(txp->npkt > 0);
+	MPASS(txp->plen < 65536);
+	MPASS(m0 != NULL);
+	MPASS(m0->m_nextpkt != NULL);
+	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
+	MPASS(available > 0 && available < eq->sidx);
 
-	ndesc = howmany(txpkts->nflits, 8);
+	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
+	MPASS(ndesc <= available);
 
-	wr = (void *)&eq->desc[eq->pidx];
+	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
-	ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2));
-	if (eq->avail == ndesc) {
-		if (!(eq->flags & EQ_CRFLUSHED)) {
-			ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ;
-			eq->flags |= EQ_CRFLUSHED;
-		}
-		eq->flags |= EQ_STALLED;
-	}
+	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
-	wr->plen = htobe16(txpkts->plen);
-	wr->npkt = txpkts->npkt;
-	wr->r3 = wr->type = 0;
-
-	/* Everything else already written */
-
-	txsd = &txq->sdesc[eq->pidx];
-	txsd->desc_used = ndesc;
-
-	KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__));
-
-	eq->pending += ndesc;
-	eq->avail -= ndesc;
-	eq->pidx += ndesc;
-	if (eq->pidx >= eq->cap)
-		eq->pidx -= eq->cap;
+	wr->plen = htobe16(txp->plen);
+	wr->npkt = txp->npkt;
+	wr->r3 = 0;
+	wr->type = txp->wr_type;
+	flitp = wr + 1;
 
-	txq->txpkts_pkts += txpkts->npkt;
-	txq->txpkts_wrs++;
-	txpkts->npkt = 0;	/* emptied */
-}
+	/*
+	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
+	 * set then we know the WR is going to wrap around somewhere.  We'll
+	 * check for that at appropriate points.
+	 */
+	checkwrap = eq->sidx - ndesc < eq->pidx;
+	for (m = m0; m != NULL; m = m->m_nextpkt) {
+		if (txp->wr_type == 0) {
+			struct ulp_txpkt *ulpmc;
+			struct ulptx_idata *ulpsc;
+
+			/* ULP master command */
+			ulpmc = flitp;
+			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
+			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
+			ulpmc->len = htobe32(mbuf_len16(m));
+
+			/* ULP subcommand */
+			ulpsc = (void *)(ulpmc + 1);
+			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
+			    F_ULP_TX_SC_MORE);
+			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
+
+			cpl = (void *)(ulpsc + 1);
+			if (checkwrap &&
+			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
+				cpl = (void *)&eq->desc[0];
+			txq->txpkts0_pkts += (m == m0) ? txp->npkt : 0;
+			txq->txpkts0_wrs += (m == m0);	/* once per WR */
+		} else {
+			cpl = flitp;
+			txq->txpkts1_pkts += (m == m0) ? txp->npkt : 0;
+			txq->txpkts1_wrs += (m == m0);	/* once per WR */
+		}
 
-static inline void
-write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq,
-    struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl)
-{
-	struct ulp_txpkt *ulpmc;
-	struct ulptx_idata *ulpsc;
-	struct cpl_tx_pkt_core *cpl;
-	struct sge_eq *eq = &txq->eq;
-	uintptr_t flitp, start, end;
-	uint64_t ctrl;
-	caddr_t dst;
+		/* Checksum offload */
+		ctrl1 = 0;
+		if (needs_l3_csum(m) == 0)
+			ctrl1 |= F_TXPKT_IPCSUM_DIS;
+		if (needs_l4_csum(m) == 0)
+			ctrl1 |= F_TXPKT_L4CSUM_DIS;
+		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
+		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
+			txq->txcsum++;	/* some hardware assistance provided */
+
+		/* VLAN tag insertion */
+		if (needs_vlan_insertion(m)) {
+			ctrl1 |= F_TXPKT_VLAN_VLD |
+			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
+			txq->vlan_insertion++;
+		}
 
-	KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__));
+		/* CPL header */
+		cpl->ctrl0 = txq->cpl_ctrl0;
+		cpl->pack = 0;
+		cpl->len = htobe16(m->m_pkthdr.len);
+		cpl->ctrl1 = htobe64(ctrl1);
 
-	start = (uintptr_t)eq->desc;
-	end = (uintptr_t)eq->spg;
+		flitp = cpl + 1;
+		if (checkwrap &&
+		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
+			flitp = (void *)&eq->desc[0];
 
-	/* Checksum offload */
-	ctrl = 0;
-	if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)))
-		ctrl |= F_TXPKT_IPCSUM_DIS;
-	if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
-	    CSUM_TCP_IPV6 | CSUM_TSO)))
-		ctrl |= F_TXPKT_L4CSUM_DIS;
-	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
-	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
-		txq->txcsum++;	/* some hardware assistance provided */
+		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
-	/* VLAN tag insertion */
-	if (m->m_flags & M_VLANTAG) {
-		ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
-		txq->vlan_insertion++;
 	}
 
-	/*
-	 * The previous packet's SGL must have ended at a 16 byte boundary (this
-	 * is required by the firmware/hardware).  It follows that flitp cannot
-	 * wrap around between the ULPTX master command and ULPTX subcommand (8
-	 * bytes each), and that it can not wrap around in the middle of the
-	 * cpl_tx_pkt_core either.
-	 */
-	flitp = (uintptr_t)txpkts->flitp;
-	KASSERT((flitp & 0xf) == 0,
-	    ("%s: last SGL did not end at 16 byte boundary: %p",
-	    __func__, txpkts->flitp));
-
-	/* ULP master command */
-	ulpmc = (void *)flitp;
-	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) |
-	    V_ULP_TXPKT_FID(eq->iqid));
-	ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) +
-	    sizeof(*cpl) + 8 * sgl->nflits, 16));
-
-	/* ULP subcommand */
-	ulpsc = (void *)(ulpmc + 1);
-	ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) |
-	    F_ULP_TX_SC_MORE);
-	ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
-
-	flitp += sizeof(*ulpmc) + sizeof(*ulpsc);
-	if (flitp == end)
-		flitp = start;
-
-	/* CPL_TX_PKT */
-	cpl = (void *)flitp;
-	cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
-	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf));
-	cpl->pack = 0;
-	cpl->len = htobe16(m->m_pkthdr.len);
-	cpl->ctrl1 = htobe64(ctrl);
-
-	flitp += sizeof(*cpl);
-	if (flitp == end)
-		flitp = start;
-
-	/* SGL for this frame */
-	dst = (caddr_t)flitp;
-	txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst);
-	txpkts->flitp = (void *)dst;
+	txsd = &txq->sdesc[eq->pidx];
+	txsd->m = m0;
+	txsd->desc_used = ndesc;
 
-	KASSERT(((uintptr_t)dst & 0xf) == 0,
-	    ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst));
+	return (ndesc);
 }
 
 /*
  * If the SGL ends on an address that is not 16 byte aligned, this function will
- * add a 0 filled flit at the end.  It returns 1 in that case.
+ * add a 0 filled flit at the end.
  */
-static int
-write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to)
+static void
+write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
-	__be64 *flitp, *end;
+	struct sge_eq *eq = &txq->eq;
+	struct sglist *gl = txq->gl;
+	struct sglist_seg *seg;
+	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
-	bus_dma_segment_t *seg;
-	int i, padded;
-
-	KASSERT(sgl->nsegs > 0 && sgl->nflits > 0,
-	    ("%s: bad SGL - nsegs=%d, nflits=%d",
-	    __func__, sgl->nsegs, sgl->nflits));
+	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
+	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
+	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
+	get_pkt_gl(m, gl);
+	nsegs = gl->sg_nseg;
+	MPASS(nsegs > 0);
+
+	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
-	end = flitp + sgl->nflits;
-	seg = &sgl->seg[0];
+	wrap = (__be64 *)(&eq->desc[eq->sidx]);
+	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
@@ -4062,58 +4251,60 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to)
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
-	    V_ULPTX_NSGE(sgl->nsegs));
-	usgl->len0 = htobe32(seg->ds_len);
-	usgl->addr0 = htobe64(seg->ds_addr);
+	    V_ULPTX_NSGE(nsegs));
+	usgl->len0 = htobe32(seg->ss_len);
+	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
-	if ((uintptr_t)end <= (uintptr_t)eq->spg) {
+	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
-		for (i = 0; i < sgl->nsegs - 1; i++, seg++) {
-			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len);
-			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr);
+		for (i = 0; i < nsegs - 1; i++, seg++) {
+			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
+			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
+		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
-		for (i = 0; i < sgl->nflits - 2; i++) {
-			if ((uintptr_t)flitp == (uintptr_t)eq->spg)
+		for (i = 0; i < nflits - 2; i++) {
+			if (flitp == wrap)
 				flitp = (void *)eq->desc;
-			*flitp++ = get_flit(seg, sgl->nsegs - 1, i);
+			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
-		end = flitp;
 	}
 
-	if ((uintptr_t)end & 0xf) {
-		*(uint64_t *)end = 0;
-		end++;
-		padded = 1;
-	} else
-		padded = 0;
+	if (nflits & 1) {
+		MPASS(((uintptr_t)flitp) & 0xf);
+		*flitp++ = 0;
+	}
 
-	if ((uintptr_t)end == (uintptr_t)eq->spg)
+	MPASS((((uintptr_t)flitp) & 0xf) == 0);
+	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
-		*to = (void *)end;
-
-	return (padded);
+		*to = (void *)flitp;
 }
 
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
-	if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) {
+
+	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
+	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
+
+	if (__predict_true((uintptr_t)(*to) + len <=
+	    (uintptr_t)&eq->desc[eq->sidx])) {
 		bcopy(from, *to, len);
 		(*to) += len;
 	} else {
-		int portion = (uintptr_t)eq->spg - (uintptr_t)(*to);
+		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 
 		bcopy(from, *to, portion);
 		from += portion;
@@ -4124,21 +4315,21 @@ copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 }
 
 static inline void
-ring_eq_db(struct adapter *sc, struct sge_eq *eq)
+ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
-	u_int db, pending;
+	u_int db;
+
+	MPASS(n > 0);
 
 	db = eq->doorbells;
-	pending = eq->pending;
-	if (pending > 1)
+	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
-	eq->pending = 0;
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
-		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending));
-		return;
+		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
+		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
@@ -4149,69 +4340,84 @@ ring_eq_db(struct adapter *sc, struct sge_eq *eq)
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
-		KASSERT(eq->udb_qid == 0 && pending == 1,
+		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
-		    __func__, eq->doorbells, pending, eq->pidx, eq));
+		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
-		i = eq->pidx ? eq->pidx - 1 : eq->cap - 1;
+		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
-		return;
+		break;
 	}
 
 	case DOORBELL_UDBWC:
-		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending));
+		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
-		return;
+		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL),
-		    V_QID(eq->cntxt_id) | V_PIDX(pending));
-		return;
+		    V_QID(eq->cntxt_id) | V_PIDX(n));
+		break;
 	}
+
+	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
-static inline int
-reclaimable(struct sge_eq *eq)
+static inline u_int
+reclaimable_tx_desc(struct sge_eq *eq)
 {
-	unsigned int cidx;
+	uint16_t hw_cidx;
 
-	cidx = eq->spg->cidx;	/* stable snapshot */
-	cidx = be16toh(cidx);
+	hw_cidx = read_hw_cidx(eq);	/* cidx read from sge_qstat */
+	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
+}
+
+static inline u_int
+total_available_tx_desc(struct sge_eq *eq)
+{
+	uint16_t hw_cidx, pidx;
+
+	hw_cidx = read_hw_cidx(eq);
+	pidx = eq->pidx;
 
-	if (cidx >= eq->cidx)
-		return (cidx - eq->cidx);
+	if (pidx == hw_cidx)
+		return (eq->sidx - 1);	/* both paths hold back one desc */
 	else
-		return (cidx + eq->cap - eq->cidx);
+		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
+}
+
+static inline uint16_t
+read_hw_cidx(struct sge_eq *eq)
+{
+	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
+	uint16_t cidx = spg->cidx;	/* stable snapshot */
+
+	return (be16toh(cidx));	/* qstat cidx is big-endian */
 }
 
 /*
- * There are "can_reclaim" tx descriptors ready to be reclaimed.  Reclaim as
- * many as possible but stop when there are around "n" mbufs to free.
- *
- * The actual number reclaimed is provided as the return value.
+ * Reclaim approximately 'n' descriptors.  Returns the count reclaimed.
  */
-static int
-reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n)
+static u_int
+reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct tx_sdesc *txsd;
-	struct tx_maps *txmaps;
-	struct tx_map *txm;
-	unsigned int reclaimed, maps;
 	struct sge_eq *eq = &txq->eq;
+	u_int can_reclaim, reclaimed;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
+	MPASS(n > 0);
 
-	if (can_reclaim == 0)
-		can_reclaim = reclaimable(eq);
-
-	maps = reclaimed = 0;
-	while (can_reclaim && maps < n) {
+	reclaimed = 0;
+	can_reclaim = reclaimable_tx_desc(eq);
+	while (can_reclaim && reclaimed < n) {
 		int ndesc;
+		struct mbuf *m, *nextpkt;
 
 		txsd = &txq->sdesc[eq->cidx];
 		ndesc = txsd->desc_used;
@@ -4221,73 +4427,37 @@ reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n)
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, can_reclaim, ndesc));
 
-		maps += txsd->credits;
-
+		for (m = txsd->m; m != NULL; m = nextpkt) {
+			nextpkt = m->m_nextpkt;
+			m->m_nextpkt = NULL;
+			m_freem(m);
+		}
 		reclaimed += ndesc;
 		can_reclaim -= ndesc;
-
-		eq->cidx += ndesc;
-		if (__predict_false(eq->cidx >= eq->cap))
-			eq->cidx -= eq->cap;
-	}
-
-	txmaps = &txq->txmaps;
-	txm = &txmaps->maps[txmaps->map_cidx];
-	if (maps)
-		prefetch(txm->m);
-
-	eq->avail += reclaimed;
-	KASSERT(eq->avail < eq->cap,	/* avail tops out at (cap - 1) */
-	    ("%s: too many descriptors available", __func__));
-
-	txmaps->map_avail += maps;
-	KASSERT(txmaps->map_avail <= txmaps->map_total,
-	    ("%s: too many maps available", __func__));
-
-	while (maps--) {
-		struct tx_map *next;
-
-		next = txm + 1;
-		if (__predict_false(txmaps->map_cidx + 1 == txmaps->map_total))
-			next = txmaps->maps;
-		prefetch(next->m);
-
-		bus_dmamap_unload(txq->tx_tag, txm->map);
-		m_freem(txm->m);
-		txm->m = NULL;
-
-		txm = next;
-		if (__predict_false(++txmaps->map_cidx == txmaps->map_total))
-			txmaps->map_cidx = 0;
+		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (reclaimed);
 }
 
 static void
-write_eqflush_wr(struct sge_eq *eq)
+tx_reclaim(void *arg, int n)
 {
-	struct fw_eq_flush_wr *wr;
+	struct sge_txq *txq = arg;
+	struct sge_eq *eq = &txq->eq;
 
-	EQ_LOCK_ASSERT_OWNED(eq);
-	KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__));
-	KASSERT(!(eq->flags & EQ_CRFLUSHED), ("%s: flushed already", __func__));
-
-	wr = (void *)&eq->desc[eq->pidx];
-	bzero(wr, sizeof(*wr));
-	wr->opcode = FW_EQ_FLUSH_WR;
-	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) |
-	    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
-
-	eq->flags |= (EQ_CRFLUSHED | EQ_STALLED);
-	eq->pending++;
-	eq->avail--;
-	if (++eq->pidx == eq->cap)
-		eq->pidx = 0;
+	do {
+		if (TXQ_TRYLOCK(txq) == 0)
+			break;	/* lock not acquired; give up */
+		n = reclaim_tx_descs(txq, 32);	/* incoming 'n' is unused */
+		if (eq->cidx == eq->pidx)
+			eq->equeqidx = eq->pidx;
+		TXQ_UNLOCK(txq);
+	} while (n > 0);	/* stop once a pass reclaims nothing */
 }
 
 static __be64
-get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
+get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;
 
@@ -4295,16 +4465,16 @@ get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
 	case 0: {
 		__be64 rc;
 
-		rc = htobe32(sgl[i].ds_len);
+		rc = htobe32(segs[i].ss_len);
 		if (i + 1 < nsegs)
-			rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32;
+			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
 
 		return (rc);
 	}
 	case 1:
-		return htobe64(sgl[i].ds_addr);
+		return (htobe64(segs[i].ss_paddr));
 	case 2:
-		return htobe64(sgl[i + 1].ds_addr);
+		return (htobe64(segs[i + 1].ss_paddr));
 	}
 
 	return (0);
@@ -4499,6 +4669,27 @@ add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 	mtx_unlock(&sc->sfl_lock);
 }
 
+static void
+handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
+{
+	struct sge_wrq *wrq = (void *)eq;	/* assumes eq == &wrq->eq */
+
+	atomic_readandclear_int(&eq->equiq);	/* old value is discarded */
+	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
+}
+
+static void
+handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
+{
+	struct sge_txq *txq = (void *)eq;	/* assumes eq == &txq->eq */
+
+	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
+
+	atomic_readandclear_int(&eq->equiq);	/* old value is discarded */
+	mp_ring_check_drainage(txq->r, 0);
+	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
+}
+
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
@@ -4508,22 +4699,15 @@ handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
+	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
+		&handle_wrq_egr_update, &handle_eth_egr_update,
+		&handle_wrq_egr_update};
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	eq = s->eqmap[qid - s->eq_start];
-	EQ_LOCK(eq);
-	KASSERT(eq->flags & EQ_CRFLUSHED,
-	    ("%s: unsolicited egress update", __func__));
-	eq->flags &= ~EQ_CRFLUSHED;
-	eq->egr_update++;
-
-	if (__predict_false(eq->flags & EQ_DOOMED))
-		wakeup_one(eq);
-	else if (eq->flags & EQ_STALLED && can_resume_tx(eq))
-		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
-	EQ_UNLOCK(eq);
+	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
 
 	return (0);
 }
diff --git a/sys/modules/cxgbe/if_cxgbe/Makefile b/sys/modules/cxgbe/if_cxgbe/Makefile
index e4828f7..a66e45a 100644
--- a/sys/modules/cxgbe/if_cxgbe/Makefile
+++ b/sys/modules/cxgbe/if_cxgbe/Makefile
@@ -15,6 +15,7 @@ SRCS+=	pci_if.h
 SRCS+=	t4_hw.c
 SRCS+=	t4_l2t.c
 SRCS+=	t4_main.c
+SRCS+=	t4_mp_ring.c
 SRCS+=	t4_netmap.c
 SRCS+=	t4_sge.c
 SRCS+=	t4_tracer.c
-- 
cgit v1.1