-rw-r--r--  sys/conf/files                       |    2
-rw-r--r--  sys/dev/cxgbe/adapter.h              |  116
-rw-r--r--  sys/dev/cxgbe/t4_l2t.c               |    9
-rw-r--r--  sys/dev/cxgbe/t4_main.c              |  264
-rw-r--r--  sys/dev/cxgbe/t4_mp_ring.c           |  364
-rw-r--r--  sys/dev/cxgbe/t4_mp_ring.h           |   68
-rw-r--r--  sys/dev/cxgbe/t4_sge.c               | 1994
-rw-r--r--  sys/modules/cxgbe/if_cxgbe/Makefile  |    1
8 files changed, 1686 insertions(+), 1132 deletions(-)
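The heart of this change is t4_mp_ring.[ch]: a lock-free, multi-producer ring with a single opportunistic consumer, replacing the buf_ring + queue-lock scheme in the tx path. Before the diff proper, here is a minimal sketch of how a driver sits on top of that API. The mp_ring_* signatures and the cidx/pidx contract of the drain callback come from t4_mp_ring.h and the eth_tx() comment in the t4_sge.c hunks below; example_softc, example_hw_tx(), example_hw_full() and the use of M_DEVBUF are illustrative assumptions, not part of the patch.

/*
 * Consumer callback: r->items[cidx] through r->items[pidx - 1], wrapping
 * at r->size, are ready.  Return the number actually consumed; returning
 * early (or 0) moves the ring to the STALLED state.
 */
static u_int
example_drain(struct mp_ring *r, u_int cidx, u_int pidx)
{
	struct example_softc *sc = r->cookie;	/* hypothetical softc */
	u_int n = 0;

	while (cidx != pidx) {
		if (example_hw_full(sc))
			break;			/* out of hw resources */
		example_hw_tx(sc, r->items[cidx]);
		if (__predict_false(++cidx == r->size))
			cidx = 0;
		n++;
	}
	return (n);
}

/* Cheap, possibly unreliable check used to restart a STALLED ring. */
static u_int
example_can_drain(struct mp_ring *r)
{

	return (!example_hw_full(r->cookie));
}

/* Allocation, and a producer-side enqueue as in cxgbe_transmit() below: */
	rc = mp_ring_alloc(&sc->ring, 1024, sc, example_drain,
	    example_can_drain, M_DEVBUF, M_WAITOK);
	...
	items[0] = m;
	rc = mp_ring_enqueue(sc->ring, items, 1, 4096);	/* 4096 = budget */

A stalled ring is restarted via mp_ring_check_drainage(), and teardown paths spin on mp_ring_is_idle(), as cxgbe_qflush() and quiesce_txq() do in the t4_main.c hunks below.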
diff --git a/sys/conf/files b/sys/conf/files index 3884c11..9e55f42 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1142,6 +1142,8 @@ dev/cxgb/sys/uipc_mvec.c optional cxgb pci \ compile-with "${NORMAL_C} -I$S/dev/cxgb" dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw \ compile-with "${NORMAL_C} -I$S/dev/cxgb" +dev/cxgbe/t4_mp_ring.c optional cxgbe pci \ + compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_main.c optional cxgbe pci \ compile-with "${NORMAL_C} -I$S/dev/cxgbe" dev/cxgbe/t4_netmap.c optional cxgbe pci \ diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index ec84bb4..62ff9af 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -152,7 +152,8 @@ enum { CL_METADATA_SIZE = CACHE_LINE_SIZE, SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ - TX_SGL_SEGS = 36, + TX_SGL_SEGS = 39, + TX_SGL_SEGS_TSO = 38, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; @@ -273,6 +274,7 @@ struct port_info { struct timeval last_refreshed; struct port_stats stats; u_int tnl_cong_drops; + u_int tx_parse_error; eventhandler_tag vlan_c; @@ -308,23 +310,9 @@ struct tx_desc { __be64 flit[8]; }; -struct tx_map { - struct mbuf *m; - bus_dmamap_t map; -}; - -/* DMA maps used for tx */ -struct tx_maps { - struct tx_map *maps; - uint32_t map_total; /* # of DMA maps */ - uint32_t map_pidx; /* next map to be used */ - uint32_t map_cidx; /* reclaimed up to this index */ - uint32_t map_avail; /* # of available maps */ -}; - struct tx_sdesc { + struct mbuf *m; /* m_nextpkt linked chain of frames */ uint8_t desc_used; /* # of hardware descriptors used by the WR */ - uint8_t credits; /* NIC txq: # of frames sent out in the WR */ }; @@ -378,16 +366,12 @@ struct sge_iq { enum { EQ_CTRL = 1, EQ_ETH = 2, -#ifdef TCP_OFFLOAD EQ_OFLD = 3, -#endif /* eq flags */ - EQ_TYPEMASK = 7, /* 3 lsbits hold the type */ - EQ_ALLOCATED = (1 << 3), /* firmware resources allocated */ - EQ_DOOMED = (1 << 4), /* about to be destroyed */ - EQ_CRFLUSHED = (1 << 5), /* expecting an update from SGE */ - EQ_STALLED = (1 << 6), /* out of hw descriptors or dmamaps */ + EQ_TYPEMASK = 0x3, /* 2 lsbits hold the type (see above) */ + EQ_ALLOCATED = (1 << 2), /* firmware resources allocated */ + EQ_ENABLED = (1 << 3), /* open for business */ }; /* Listed in order of preference. 
Update t4_sysctls too if you change these */ @@ -402,32 +386,25 @@ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB}; struct sge_eq { unsigned int flags; /* MUST be first */ unsigned int cntxt_id; /* SGE context id for the eq */ - bus_dma_tag_t desc_tag; - bus_dmamap_t desc_map; - char lockname[16]; struct mtx eq_lock; struct tx_desc *desc; /* KVA of descriptor ring */ - bus_addr_t ba; /* bus address of descriptor ring */ - struct sge_qstat *spg; /* status page, for convenience */ uint16_t doorbells; volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ - uint16_t cap; /* max # of desc, for convenience */ - uint16_t avail; /* available descriptors, for convenience */ - uint16_t qsize; /* size (# of entries) of the queue */ + uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer idx (desc idx) */ uint16_t pidx; /* producer idx (desc idx) */ - uint16_t pending; /* # of descriptors used since last doorbell */ + uint16_t equeqidx; /* EQUEQ last requested at this pidx */ + uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t iqid; /* iq that gets egr_update for the eq */ uint8_t tx_chan; /* tx channel used by the eq */ - struct task tx_task; - struct callout tx_callout; - - /* stats */ + volatile u_int equiq; /* EQUIQ outstanding */ - uint32_t egr_update; /* # of SGE_EGR_UPDATE notifications for eq */ - uint32_t unstalled; /* recovered from stall */ + bus_dma_tag_t desc_tag; + bus_dmamap_t desc_map; + bus_addr_t ba; /* bus address of descriptor ring */ + char lockname[16]; }; struct sw_zone_info { @@ -499,18 +476,19 @@ struct sge_fl { struct cluster_layout cll_alt; /* alternate refill zone, layout */ }; +struct mp_ring; + /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ struct ifnet *ifp; /* the interface this txq belongs to */ - bus_dma_tag_t tx_tag; /* tag for transmit buffers */ - struct buf_ring *br; /* tx buffer ring */ + struct mp_ring *r; /* tx software ring */ struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ - struct mbuf *m; /* held up due to temporary resource shortage */ - - struct tx_maps txmaps; + struct sglist *gl; + __be32 cpl_ctrl0; /* for convenience */ + struct task tx_reclaim_task; /* stats for common events first */ uint64_t txcsum; /* # of times hardware assisted with checksum */ @@ -519,13 +497,12 @@ struct sge_txq { uint64_t imm_wrs; /* # of work requests with immediate data */ uint64_t sgl_wrs; /* # of work requests with direct SGL */ uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ - uint64_t txpkts_wrs; /* # of coalesced tx work requests */ - uint64_t txpkts_pkts; /* # of frames in coalesced tx work requests */ + uint64_t txpkts0_wrs; /* # of type0 coalesced tx work requests */ + uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */ + uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ + uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ /* stats for not-that-common events */ - - uint32_t no_dmamap; /* no DMA map to load the mbuf */ - uint32_t no_desc; /* out of hardware descriptors */ } __aligned(CACHE_LINE_SIZE); /* rxq: SGE ingress queue + SGE free list + miscellaneous items */ @@ -574,7 +551,13 @@ struct wrqe { STAILQ_ENTRY(wrqe) link; struct sge_wrq *wrq; int wr_len; - uint64_t wr[] __aligned(16); + char wr[] __aligned(16); +}; + +struct wrq_cookie { + TAILQ_ENTRY(wrq_cookie) link; + int ndesc; + 
int pidx; }; /* @@ -585,17 +568,32 @@ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; + struct task wrq_tx_task; + + /* Tx desc reserved but WR not "committed" yet. */ + TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs; - /* List of WRs held up due to lack of tx descriptors */ + /* List of WRs ready to go out as soon as descriptors are available. */ STAILQ_HEAD(, wrqe) wr_list; + u_int nwr_pending; + u_int ndesc_needed; /* stats for common events first */ - uint64_t tx_wrs; /* # of tx work requests */ + uint64_t tx_wrs_direct; /* # of WRs written directly to desc ring. */ + uint64_t tx_wrs_ss; /* # of WRs copied from scratch space. */ + uint64_t tx_wrs_copied; /* # of WRs queued and copied to desc ring. */ /* stats for not-that-common events */ - uint32_t no_desc; /* out of hardware descriptors */ + /* + * Scratch space for work requests that wrap around after reaching the + * status page, and some information about the last WR that used it. + */ + uint16_t ss_pidx; + uint16_t ss_len; + uint8_t ss[SGE_MAX_WR_LEN]; + } __aligned(CACHE_LINE_SIZE); @@ -744,7 +742,7 @@ struct adapter { struct sge sge; int lro_timeout; - struct taskqueue *tq[NCHAN]; /* taskqueues that flush data out */ + struct taskqueue *tq[NCHAN]; /* General purpose taskqueues */ struct port_info *port[MAX_NPORTS]; uint8_t chan_map[NCHAN]; @@ -978,12 +976,11 @@ static inline int tx_resume_threshold(struct sge_eq *eq) { - return (eq->qsize / 4); + /* not quite the same as qsize / 4, but this will do. */ + return (eq->sidx / 4); } /* t4_main.c */ -void t4_tx_task(void *, int); -void t4_tx_callout(void *); int t4_os_find_pci_capability(struct adapter *, int); int t4_os_pci_save_state(struct adapter *); int t4_os_pci_restore_state(struct adapter *); @@ -1024,16 +1021,15 @@ int t4_setup_adapter_queues(struct adapter *); int t4_teardown_adapter_queues(struct adapter *); int t4_setup_port_queues(struct port_info *); int t4_teardown_port_queues(struct port_info *); -int t4_alloc_tx_maps(struct tx_maps *, bus_dma_tag_t, int, int); -void t4_free_tx_maps(struct tx_maps *, bus_dma_tag_t); void t4_intr_all(void *); void t4_intr(void *); void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); -int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *); void t4_update_fl_bufsize(struct ifnet *); -int can_resume_tx(struct sge_eq *); +int parse_pkt(struct mbuf **); +void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); +void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *); /* t4_tracer.c */ struct t4_tracer; diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 6f7378a..cca1bf3 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -113,16 +113,15 @@ found: int t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) { - struct wrqe *wr; + struct wrq_cookie cookie; struct cpl_l2t_write_req *req; int idx = e->idx + sc->vres.l2t.start; mtx_assert(&e->lock, MA_OWNED); - wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq); - if (wr == NULL) + req = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*req), 16), &cookie); + if (req == NULL) return (ENOMEM); - req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, idx | @@ -132,7 +131,7 @@ t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) req->vlan = htons(e->vlan); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); - t4_wrq_tx(sc, wr); + commit_wrq_wr(&sc->sge.mgmtq, req, &cookie); if
(sync && e->state != L2T_STATE_SWITCHING) e->state = L2T_STATE_SYNC_WRITE; diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 2c384fd..39dc816 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs_values.h" #include "t4_ioctl.h" #include "t4_l2t.h" +#include "t4_mp_ring.h" /* T4 bus driver interface */ static int t4_probe(device_t); @@ -378,7 +379,8 @@ static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct port_info *); static int cxgbe_uninit_synchronized(struct port_info *); static int setup_intr_handlers(struct adapter *); -static void quiesce_eq(struct adapter *, struct sge_eq *); +static void quiesce_txq(struct adapter *, struct sge_txq *); +static void quiesce_wrq(struct adapter *, struct sge_wrq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, @@ -434,7 +436,6 @@ static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); #endif -static inline void txq_start(struct ifnet *, struct sge_txq *); static uint32_t fconf_to_mode(uint32_t); static uint32_t mode_to_fconf(uint32_t); static uint32_t fspec_to_fconf(struct t4_filter_specification *); @@ -1429,67 +1430,36 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct port_info *pi = ifp->if_softc; struct adapter *sc = pi->adapter; - struct sge_txq *txq = &sc->sge.txq[pi->first_txq]; - struct buf_ring *br; + struct sge_txq *txq; + void *items[1]; int rc; M_ASSERTPKTHDR(m); + MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ if (__predict_false(pi->link_cfg.link_ok == 0)) { m_freem(m); return (ENETDOWN); } - /* check if flowid is set */ - if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) - txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) - + pi->rsrv_noflowq); - br = txq->br; - - if (TXQ_TRYLOCK(txq) == 0) { - struct sge_eq *eq = &txq->eq; - - /* - * It is possible that t4_eth_tx finishes up and releases the - * lock between the TRYLOCK above and the drbr_enqueue here. We - * need to make sure that this mbuf doesn't just sit there in - * the drbr. - */ - - rc = drbr_enqueue(ifp, br, m); - if (rc == 0 && callout_pending(&eq->tx_callout) == 0 && - !(eq->flags & EQ_DOOMED)) - callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); + rc = parse_pkt(&m); + if (__predict_false(rc != 0)) { + MPASS(m == NULL); /* was freed already */ + atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } - /* - * txq->m is the mbuf that is held up due to a temporary shortage of - * resources and it should be put on the wire first. Then what's in - * drbr and finally the mbuf that was just passed in to us. - * - * Return code should indicate the fate of the mbuf that was passed in - * this time. - */ - - TXQ_LOCK_ASSERT_OWNED(txq); - if (drbr_needs_enqueue(ifp, br) || txq->m) { - - /* Queued for transmission. */ - - rc = drbr_enqueue(ifp, br, m); - m = txq->m ? txq->m : drbr_dequeue(ifp, br); - (void) t4_eth_tx(ifp, txq, m); - TXQ_UNLOCK(txq); - return (rc); - } + /* Select a txq. */ + txq = &sc->sge.txq[pi->first_txq]; + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) + + pi->rsrv_noflowq); - /* Direct transmission. 
*/ - rc = t4_eth_tx(ifp, txq, m); - if (rc != 0 && txq->m) - rc = 0; /* held, will be transmitted soon (hopefully) */ + items[0] = m; + rc = mp_ring_enqueue(txq->r, items, 1, 4096); + if (__predict_false(rc != 0)) + m_freem(m); - TXQ_UNLOCK(txq); return (rc); } @@ -1499,17 +1469,17 @@ cxgbe_qflush(struct ifnet *ifp) struct port_info *pi = ifp->if_softc; struct sge_txq *txq; int i; - struct mbuf *m; /* queues do not exist if !PORT_INIT_DONE. */ if (pi->flags & PORT_INIT_DONE) { for_each_txq(pi, i, txq) { TXQ_LOCK(txq); - m_freem(txq->m); - txq->m = NULL; - while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) - m_freem(m); + txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); + while (!mp_ring_is_idle(txq->r)) { + mp_ring_check_drainage(txq->r, 0); + pause("qflush", 1); + } } } if_qflush(ifp); @@ -1564,7 +1534,7 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c) struct sge_txq *txq; for_each_txq(pi, i, txq) - drops += txq->br->br_drops; + drops += counter_u64_fetch(txq->r->drops); } return (drops); @@ -3236,7 +3206,8 @@ cxgbe_init_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; - int rc = 0; + int rc = 0, i; + struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); @@ -3265,6 +3236,17 @@ cxgbe_init_synchronized(struct port_info *pi) } /* + * Can't fail from this point onwards. Review cxgbe_uninit_synchronized + * if this changes. + */ + + for_each_txq(pi, i, txq) { + TXQ_LOCK(txq); + txq->eq.flags |= EQ_ENABLED; + TXQ_UNLOCK(txq); + } + + /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0) { @@ -3297,7 +3279,8 @@ cxgbe_uninit_synchronized(struct port_info *pi) { struct adapter *sc = pi->adapter; struct ifnet *ifp = pi->ifp; - int rc; + int rc, i; + struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); @@ -3314,6 +3297,12 @@ cxgbe_uninit_synchronized(struct port_info *pi) return (rc); } + for_each_txq(pi, i, txq) { + TXQ_LOCK(txq); + txq->eq.flags &= ~EQ_ENABLED; + TXQ_UNLOCK(txq); + } + clrbit(&sc->open_device_map, pi->port_id); PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; @@ -3543,15 +3532,17 @@ port_full_uninit(struct port_info *pi) if (pi->flags & PORT_INIT_DONE) { - /* Need to quiesce queues. XXX: ctrl queues? */ + /* Need to quiesce queues. */ + + quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(pi, i, txq) { - quiesce_eq(sc, &txq->eq); + quiesce_txq(sc, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { - quiesce_eq(sc, &ofld_txq->eq); + quiesce_wrq(sc, ofld_txq); } #endif @@ -3576,23 +3567,39 @@ port_full_uninit(struct port_info *pi) } static void -quiesce_eq(struct adapter *sc, struct sge_eq *eq) +quiesce_txq(struct adapter *sc, struct sge_txq *txq) { - EQ_LOCK(eq); - eq->flags |= EQ_DOOMED; + struct sge_eq *eq = &txq->eq; + struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; - /* - * Wait for the response to a credit flush if one's - * pending. - */ - while (eq->flags & EQ_CRFLUSHED) - mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0); - EQ_UNLOCK(eq); + (void) sc; /* unused */ + +#ifdef INVARIANTS + TXQ_LOCK(txq); + MPASS((eq->flags & EQ_ENABLED) == 0); + TXQ_UNLOCK(txq); +#endif + + /* Wait for the mp_ring to empty. */ + while (!mp_ring_is_idle(txq->r)) { + mp_ring_check_drainage(txq->r, 0); + pause("rquiesce", 1); + } - callout_drain(&eq->tx_callout); /* XXX: iffy */ - pause("callout", 10); /* Still iffy */ + /* Then wait for the hardware to finish. 
*/ + while (spg->cidx != htobe16(eq->pidx)) + pause("equiesce", 1); - taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task); + /* Finally, wait for the driver to reclaim all descriptors. */ + while (eq->cidx != eq->pidx) + pause("dquiesce", 1); +} + +static void +quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq) +{ + + /* XXXTX */ } static void @@ -4892,6 +4899,9 @@ cxgbe_sysctls(struct port_info *pi) oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD, + &pi->tx_parse_error, 0, + "# of tx packets with invalid length or # of segments"); #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \ @@ -6947,74 +6957,6 @@ sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) } #endif -static inline void -txq_start(struct ifnet *ifp, struct sge_txq *txq) -{ - struct buf_ring *br; - struct mbuf *m; - - TXQ_LOCK_ASSERT_OWNED(txq); - - br = txq->br; - m = txq->m ? txq->m : drbr_dequeue(ifp, br); - if (m) - t4_eth_tx(ifp, txq, m); -} - -void -t4_tx_callout(void *arg) -{ - struct sge_eq *eq = arg; - struct adapter *sc; - - if (EQ_TRYLOCK(eq) == 0) - goto reschedule; - - if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) { - EQ_UNLOCK(eq); -reschedule: - if (__predict_true(!(eq->flags && EQ_DOOMED))) - callout_schedule(&eq->tx_callout, 1); - return; - } - - EQ_LOCK_ASSERT_OWNED(eq); - - if (__predict_true((eq->flags & EQ_DOOMED) == 0)) { - - if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { - struct sge_txq *txq = arg; - struct port_info *pi = txq->ifp->if_softc; - - sc = pi->adapter; - } else { - struct sge_wrq *wrq = arg; - - sc = wrq->adapter; - } - - taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task); - } - - EQ_UNLOCK(eq); -} - -void -t4_tx_task(void *arg, int count) -{ - struct sge_eq *eq = arg; - - EQ_LOCK(eq); - if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) { - struct sge_txq *txq = arg; - txq_start(txq->ifp, txq); - } else { - struct sge_wrq *wrq = arg; - t4_wrq_tx_locked(wrq->adapter, wrq, NULL); - } - EQ_UNLOCK(eq); -} - static uint32_t fconf_to_mode(uint32_t fconf) { @@ -7452,9 +7394,9 @@ static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; + struct wrq_cookie cookie; ASSERT_SYNCHRONIZED_OP(sc); @@ -7473,12 +7415,10 @@ set_filter_wr(struct adapter *sc, int fidx) ftid = sc->tids.ftid_base + fidx; - wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); - if (wr == NULL) + fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); + if (fwr == NULL) return (ENOMEM); - - fwr = wrtod(wr); - bzero(fwr, sizeof (*fwr)); + bzero(fwr, sizeof(*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); @@ -7547,7 +7487,7 @@ set_filter_wr(struct adapter *sc, int fidx) f->pending = 1; sc->tids.ftids_in_use++; - t4_wrq_tx(sc, wr); + commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } @@ -7555,22 +7495,21 @@ static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; + struct wrq_cookie cookie; ftid = sc->tids.ftid_base + fidx; - wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); - if (wr == NULL) + fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); + if (fwr == NULL) return (ENOMEM); - fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, 
fwr, sc->sge.fwq.abs_id); f->pending = 1; - t4_wrq_tx(sc, wr); + commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } @@ -8170,6 +8109,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); + pi->tx_parse_error = 0; if (pi->flags & PORT_INIT_DONE) { struct sge_rxq *rxq; @@ -8192,24 +8132,24 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; - txq->txpkts_wrs = 0; - txq->txpkts_pkts = 0; - txq->br->br_drops = 0; - txq->no_dmamap = 0; - txq->no_desc = 0; + txq->txpkts0_wrs = 0; + txq->txpkts1_wrs = 0; + txq->txpkts0_pkts = 0; + txq->txpkts1_pkts = 0; + mp_ring_reset_stats(txq->r); } #ifdef TCP_OFFLOAD /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(pi, i, wrq) { - wrq->tx_wrs = 0; - wrq->no_desc = 0; + wrq->tx_wrs_direct = 0; + wrq->tx_wrs_copied = 0; } #endif wrq = &sc->sge.ctrlq[pi->port_id]; - wrq->tx_wrs = 0; - wrq->no_desc = 0; + wrq->tx_wrs_direct = 0; + wrq->tx_wrs_copied = 0; } break; } diff --git a/sys/dev/cxgbe/t4_mp_ring.c b/sys/dev/cxgbe/t4_mp_ring.c new file mode 100644 index 0000000..ef09f01 --- /dev/null +++ b/sys/dev/cxgbe/t4_mp_ring.c @@ -0,0 +1,364 @@ +/*- + * Copyright (c) 2014 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/counter.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <machine/cpu.h> + +#include "t4_mp_ring.h" + +union ring_state { + struct { + uint16_t pidx_head; + uint16_t pidx_tail; + uint16_t cidx; + uint16_t flags; + }; + uint64_t state; +}; + +enum { + IDLE = 0, /* consumer ran to completion, nothing more to do. */ + BUSY, /* consumer is running already, or will be shortly. */ + STALLED, /* consumer stopped due to lack of resources. */ + ABDICATED, /* consumer stopped even though there was work to be + done because it wants another thread to take over. 
*/ +}; + +static inline uint16_t +space_available(struct mp_ring *r, union ring_state s) +{ + uint16_t x = r->size - 1; + + if (s.cidx == s.pidx_head) + return (x); + else if (s.cidx > s.pidx_head) + return (s.cidx - s.pidx_head - 1); + else + return (x - s.pidx_head + s.cidx); +} + +static inline uint16_t +increment_idx(struct mp_ring *r, uint16_t idx, uint16_t n) +{ + int x = r->size - idx; + + MPASS(x > 0); + return (x > n ? idx + n : n - x); +} + +/* Consumer is about to update the ring's state to s */ +static inline uint16_t +state_to_flags(union ring_state s, int abdicate) +{ + + if (s.cidx == s.pidx_tail) + return (IDLE); + else if (abdicate && s.pidx_tail != s.pidx_head) + return (ABDICATED); + + return (BUSY); +} + +/* + * Caller passes in a state, with a guarantee that there is work to do and that + * all items up to the pidx_tail in the state are visible. + */ +static void +drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget) +{ + union ring_state ns; + int n, pending, total; + uint16_t cidx = os.cidx; + uint16_t pidx = os.pidx_tail; + + MPASS(os.flags == BUSY); + MPASS(cidx != pidx); + + if (prev == IDLE) + counter_u64_add(r->starts, 1); + pending = 0; + total = 0; + + while (cidx != pidx) { + + /* Items from cidx to pidx are available for consumption. */ + n = r->drain(r, cidx, pidx); + if (n == 0) { + critical_enter(); + do { + os.state = ns.state = r->state; + ns.cidx = cidx; + ns.flags = STALLED; + } while (atomic_cmpset_64(&r->state, os.state, + ns.state) == 0); + critical_exit(); + if (prev != STALLED) + counter_u64_add(r->stalls, 1); + else if (total > 0) { + counter_u64_add(r->restarts, 1); + counter_u64_add(r->stalls, 1); + } + break; + } + cidx = increment_idx(r, cidx, n); + pending += n; + total += n; + + /* + * We update the cidx only if we've caught up with the pidx, the + * real cidx is getting too far ahead of the one visible to + * everyone else, or we have exceeded our budget. + */ + if (cidx != pidx && pending < 64 && total < budget) + continue; + critical_enter(); + do { + os.state = ns.state = r->state; + ns.cidx = cidx; + ns.flags = state_to_flags(ns, total >= budget); + } while (atomic_cmpset_acq_64(&r->state, os.state, ns.state) == 0); + critical_exit(); + + if (ns.flags == ABDICATED) + counter_u64_add(r->abdications, 1); + if (ns.flags != BUSY) { + /* Wrong loop exit if we're going to stall. */ + MPASS(ns.flags != STALLED); + if (prev == STALLED) { + MPASS(total > 0); + counter_u64_add(r->restarts, 1); + } + break; + } + + /* + * The acquire style atomic above guarantees visibility of items + * associated with any pidx change that we notice here. 
+ */ + pidx = ns.pidx_tail; + pending = 0; + } +} + +int +mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain, + ring_can_drain_t can_drain, struct malloc_type *mt, int flags) +{ + struct mp_ring *r; + + /* All idx are 16b so size can be 65536 at most */ + if (pr == NULL || size < 2 || size > 65536 || drain == NULL || + can_drain == NULL) + return (EINVAL); + *pr = NULL; + flags &= M_NOWAIT | M_WAITOK; + MPASS(flags != 0); + + r = malloc(__offsetof(struct mp_ring, items[size]), mt, flags | M_ZERO); + if (r == NULL) + return (ENOMEM); + r->size = size; + r->cookie = cookie; + r->mt = mt; + r->drain = drain; + r->can_drain = can_drain; + r->enqueues = counter_u64_alloc(flags); + r->drops = counter_u64_alloc(flags); + r->starts = counter_u64_alloc(flags); + r->stalls = counter_u64_alloc(flags); + r->restarts = counter_u64_alloc(flags); + r->abdications = counter_u64_alloc(flags); + if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL || + r->stalls == NULL || r->restarts == NULL || + r->abdications == NULL) { + mp_ring_free(r); + return (ENOMEM); + } + + *pr = r; + return (0); +} + +void +mp_ring_free(struct mp_ring *r) +{ + + if (r == NULL) + return; + + if (r->enqueues != NULL) + counter_u64_free(r->enqueues); + if (r->drops != NULL) + counter_u64_free(r->drops); + if (r->starts != NULL) + counter_u64_free(r->starts); + if (r->stalls != NULL) + counter_u64_free(r->stalls); + if (r->restarts != NULL) + counter_u64_free(r->restarts); + if (r->abdications != NULL) + counter_u64_free(r->abdications); + + free(r, r->mt); +} + +/* + * Enqueue n items and maybe drain the ring for some time. + * + * Returns an errno. + */ +int +mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget) +{ + union ring_state os, ns; + uint16_t pidx_start, pidx_stop; + int i; + + MPASS(items != NULL); + MPASS(n > 0); + + /* + * Reserve room for the new items. Our reservation, if successful, is + * from 'pidx_start' to 'pidx_stop'. + */ + for (;;) { + os.state = r->state; + if (n >= space_available(r, os)) { + counter_u64_add(r->drops, n); + MPASS(os.flags != IDLE); + if (os.flags == STALLED) + mp_ring_check_drainage(r, 0); + return (ENOBUFS); + } + ns.state = os.state; + ns.pidx_head = increment_idx(r, os.pidx_head, n); + critical_enter(); + if (atomic_cmpset_64(&r->state, os.state, ns.state)) + break; + critical_exit(); + cpu_spinwait(); + } + pidx_start = os.pidx_head; + pidx_stop = ns.pidx_head; + + /* + * Wait for other producers who got in ahead of us to enqueue their + * items, one producer at a time. It is our turn when the ring's + * pidx_tail reaches the beginning of our reservation (pidx_start). + */ + while (ns.pidx_tail != pidx_start) { + cpu_spinwait(); + ns.state = r->state; + } + + /* Now it is our turn to fill up the area we reserved earlier. */ + i = pidx_start; + do { + r->items[i] = *items++; + if (__predict_false(++i == r->size)) + i = 0; + } while (i != pidx_stop); + + /* + * Update the ring's pidx_tail. The release style atomic guarantees + * that the items are visible to any thread that sees the updated pidx. + */ + do { + os.state = ns.state = r->state; + ns.pidx_tail = pidx_stop; + ns.flags = BUSY; + } while (atomic_cmpset_rel_64(&r->state, os.state, ns.state) == 0); + critical_exit(); + counter_u64_add(r->enqueues, n); + + /* + * Turn into a consumer if some other thread isn't active as a consumer + * already.
+ */ + if (os.flags != BUSY) + drain_ring(r, ns, os.flags, budget); + + return (0); +} + +void +mp_ring_check_drainage(struct mp_ring *r, int budget) +{ + union ring_state os, ns; + + os.state = r->state; + if (os.flags != STALLED || os.pidx_head != os.pidx_tail || + r->can_drain(r) == 0) + return; + + MPASS(os.cidx != os.pidx_tail); /* implied by STALLED */ + ns.state = os.state; + ns.flags = BUSY; + + /* + * The acquire style atomic guarantees visibility of items associated + * with the pidx that we read here. + */ + if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state)) + return; + + drain_ring(r, ns, os.flags, budget); +} + +void +mp_ring_reset_stats(struct mp_ring *r) +{ + + counter_u64_zero(r->enqueues); + counter_u64_zero(r->drops); + counter_u64_zero(r->starts); + counter_u64_zero(r->stalls); + counter_u64_zero(r->restarts); + counter_u64_zero(r->abdications); +} + +int +mp_ring_is_idle(struct mp_ring *r) +{ + union ring_state s; + + s.state = r->state; + if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx && + s.flags == IDLE) + return (1); + + return (0); +} diff --git a/sys/dev/cxgbe/t4_mp_ring.h b/sys/dev/cxgbe/t4_mp_ring.h new file mode 100644 index 0000000..c9ee346 --- /dev/null +++ b/sys/dev/cxgbe/t4_mp_ring.h @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2014 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + * + */ + +#ifndef __CXGBE_MP_RING_H +#define __CXGBE_MP_RING_H + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct mp_ring; +typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int); +typedef u_int (*ring_can_drain_t)(struct mp_ring *); + +struct mp_ring { + volatile uint64_t state __aligned(CACHE_LINE_SIZE); + + int size __aligned(CACHE_LINE_SIZE); + void * cookie; + struct malloc_type * mt; + ring_drain_t drain; + ring_can_drain_t can_drain; /* cheap, may be unreliable */ + counter_u64_t enqueues; + counter_u64_t drops; + counter_u64_t starts; + counter_u64_t stalls; + counter_u64_t restarts; /* recovered after stalling */ + counter_u64_t abdications; + + void * volatile items[] __aligned(CACHE_LINE_SIZE); +}; + +int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t, + ring_can_drain_t, struct malloc_type *, int); +void mp_ring_free(struct mp_ring *); +int mp_ring_enqueue(struct mp_ring *, void **, int, int); +void mp_ring_check_drainage(struct mp_ring *, int); +void mp_ring_reset_stats(struct mp_ring *); +int mp_ring_is_idle(struct mp_ring *); + +#endif diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 96e22cb..026b4ce 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -36,12 +36,12 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/kernel.h> -#include <sys/kdb.h> #include <sys/malloc.h> #include <sys/queue.h> #include <sys/sbuf.h> #include <sys/taskqueue.h> #include <sys/time.h> +#include <sys/sglist.h> #include <sys/sysctl.h> #include <sys/smp.h> #include <sys/counter.h> @@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" +#include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) @@ -147,19 +148,17 @@ TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); static int safest_rx_cluster = PAGE_SIZE; TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); -/* Used to track coalesced tx work request */ struct txpkts { - uint64_t *flitp; /* ptr to flit where next pkt should start */ - uint8_t npkt; /* # of packets in this work request */ - uint8_t nflits; /* # of flits used by this work request */ - uint16_t plen; /* total payload (sum of all packets) */ + u_int wr_type; /* type 0 or type 1 */ + u_int npkt; /* # of packets in this work request */ + u_int plen; /* total payload (sum of all packets) */ + u_int len16; /* # of 16B pieces used by this work request */ }; /* A packet's SGL. This + m_pkthdr has all info needed for tx */ struct sgl { - int nsegs; /* # of segments in the SGL, 0 means imm. 
tx */ - int nflits; /* # of flits needed for the SGL */ - bus_dma_segment_t seg[TX_SGL_SEGS]; + struct sglist sg; + struct sglist_seg seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); @@ -221,26 +220,31 @@ static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); -static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int); -static int free_pkt_sgl(struct sge_txq *, struct sgl *); -static int write_txpkt_wr(struct port_info *, struct sge_txq *, struct mbuf *, - struct sgl *); -static int add_to_txpkts(struct port_info *, struct sge_txq *, struct txpkts *, - struct mbuf *, struct sgl *); -static void write_txpkts_wr(struct sge_txq *, struct txpkts *); -static inline void write_ulp_cpl_sgl(struct port_info *, struct sge_txq *, - struct txpkts *, struct mbuf *, struct sgl *); -static int write_sgl_to_txd(struct sge_eq *, struct sgl *, caddr_t *); +static inline void get_pkt_gl(struct mbuf *, struct sglist *); +static inline u_int txpkt_len16(u_int, u_int); +static inline u_int txpkts0_len16(u_int); +static inline u_int txpkts1_len16(void); +static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *, + struct mbuf *, u_int); +static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); +static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); +static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *, + struct mbuf *, const struct txpkts *, u_int); +static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); -static inline void ring_eq_db(struct adapter *, struct sge_eq *); -static inline int reclaimable(struct sge_eq *); -static int reclaim_tx_descs(struct sge_txq *, int, int); -static void write_eqflush_wr(struct sge_eq *); -static __be64 get_flit(bus_dma_segment_t *, int, int); +static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); +static inline uint16_t read_hw_cidx(struct sge_eq *); +static inline u_int reclaimable_tx_desc(struct sge_eq *); +static inline u_int total_available_tx_desc(struct sge_eq *); +static u_int reclaim_tx_descs(struct sge_txq *, u_int); +static void tx_reclaim(void *, int); +static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); +static void wrq_tx_drain(void *, int); +static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); @@ -1785,327 +1789,679 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) } /* + * Must drain the wrq or make sure that someone else will. 
+ */ +static void +wrq_tx_drain(void *arg, int n) +{ + struct sge_wrq *wrq = arg; + struct sge_eq *eq = &wrq->eq; + + EQ_LOCK(eq); + if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) + drain_wrq_wr_list(wrq->adapter, wrq); + EQ_UNLOCK(eq); +} + +static void +drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) +{ + struct sge_eq *eq = &wrq->eq; + u_int available, dbdiff; /* # of hardware descriptors */ + u_int n; + struct wrqe *wr; + struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ + + EQ_LOCK_ASSERT_OWNED(eq); + MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); + wr = STAILQ_FIRST(&wrq->wr_list); + MPASS(wr != NULL); /* Must be called with something useful to do */ + dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); + + do { + eq->cidx = read_hw_cidx(eq); + if (eq->pidx == eq->cidx) + available = eq->sidx - 1; + else + available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; + + MPASS(wr->wrq == wrq); + n = howmany(wr->wr_len, EQ_ESIZE); + if (available < n) + return; + + dst = (void *)&eq->desc[eq->pidx]; + if (__predict_true(eq->sidx - eq->pidx > n)) { + /* Won't wrap, won't end exactly at the status page. */ + bcopy(&wr->wr[0], dst, wr->wr_len); + eq->pidx += n; + } else { + int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; + + bcopy(&wr->wr[0], dst, first_portion); + if (wr->wr_len > first_portion) { + bcopy(&wr->wr[first_portion], &eq->desc[0], + wr->wr_len - first_portion); + } + eq->pidx = n - (eq->sidx - eq->pidx); + } + + if (available < eq->sidx / 4 && + atomic_cmpset_int(&eq->equiq, 0, 1)) { + dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | + F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { + dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } + + dbdiff += n; + if (dbdiff >= 16) { + ring_eq_db(sc, eq, dbdiff); + dbdiff = 0; + } + + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); + MPASS(wrq->nwr_pending > 0); + wrq->nwr_pending--; + MPASS(wrq->ndesc_needed >= n); + wrq->ndesc_needed -= n; + } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); + + if (dbdiff) + ring_eq_db(sc, eq, dbdiff); +} + +/* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { +#ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; - int can_reclaim; - caddr_t dst; +#endif + + EQ_LOCK_ASSERT_OWNED(eq); + MPASS(wr != NULL); + MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); + MPASS((wr->wr_len & 0x7) == 0); - TXQ_LOCK_ASSERT_OWNED(wrq); + STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); + wrq->nwr_pending++; + wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); + + if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) + return; /* commit_wrq_wr will drain wr_list as well. */ + + drain_wrq_wr_list(sc, wrq); + + /* Doorbell must have caught up to the pidx. 
*/ + MPASS(eq->pidx == eq->dbidx); +} + +void +t4_update_fl_bufsize(struct ifnet *ifp) +{ + struct port_info *pi = ifp->if_softc; + struct adapter *sc = pi->adapter; + struct sge_rxq *rxq; #ifdef TCP_OFFLOAD - KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD || - (eq->flags & EQ_TYPEMASK) == EQ_CTRL, - ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); -#else - KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL, - ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); + struct sge_ofld_rxq *ofld_rxq; #endif + struct sge_fl *fl; + int i, maxp, mtu = ifp->if_mtu; - if (__predict_true(wr != NULL)) - STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); + maxp = mtu_to_max_payload(sc, mtu, 0); + for_each_rxq(pi, i, rxq) { + fl = &rxq->fl; - can_reclaim = reclaimable(eq); - if (__predict_false(eq->flags & EQ_STALLED)) { - if (eq->avail + can_reclaim < tx_resume_threshold(eq)) - return; - eq->flags &= ~EQ_STALLED; - eq->unstalled++; + FL_LOCK(fl); + find_best_refill_source(sc, fl, maxp); + FL_UNLOCK(fl); } - eq->cidx += can_reclaim; - eq->avail += can_reclaim; - if (__predict_false(eq->cidx >= eq->cap)) - eq->cidx -= eq->cap; +#ifdef TCP_OFFLOAD + maxp = mtu_to_max_payload(sc, mtu, 1); + for_each_ofld_rxq(pi, i, ofld_rxq) { + fl = &ofld_rxq->fl; - while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { - int ndesc; + FL_LOCK(fl); + find_best_refill_source(sc, fl, maxp); + FL_UNLOCK(fl); + } +#endif +} - if (__predict_false(wr->wr_len < 0 || - wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) { +static inline int +mbuf_nsegs(struct mbuf *m) +{ -#ifdef INVARIANTS - panic("%s: work request with length %d", __func__, - wr->wr_len); -#endif -#ifdef KDB - kdb_backtrace(); -#endif - log(LOG_ERR, "%s: %s work request with length %d", - device_get_nameunit(sc->dev), __func__, wr->wr_len); - STAILQ_REMOVE_HEAD(&wrq->wr_list, link); - free_wrqe(wr); - continue; - } + M_ASSERTPKTHDR(m); + KASSERT(m->m_pkthdr.l5hlen > 0, + ("%s: mbuf %p missing information on # of segments.", __func__, m)); - ndesc = howmany(wr->wr_len, EQ_ESIZE); - if (eq->avail < ndesc) { - wrq->no_desc++; - break; - } + return (m->m_pkthdr.l5hlen); +} - dst = (void *)&eq->desc[eq->pidx]; - copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len); +static inline void +set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) +{ - eq->pidx += ndesc; - eq->avail -= ndesc; - if (__predict_false(eq->pidx >= eq->cap)) - eq->pidx -= eq->cap; + M_ASSERTPKTHDR(m); + m->m_pkthdr.l5hlen = nsegs; +} - eq->pending += ndesc; - if (eq->pending >= 8) - ring_eq_db(sc, eq); +static inline int +mbuf_len16(struct mbuf *m) +{ + int n; - wrq->tx_wrs++; - STAILQ_REMOVE_HEAD(&wrq->wr_list, link); - free_wrqe(wr); + M_ASSERTPKTHDR(m); + n = m->m_pkthdr.PH_loc.eight[0]; + MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); - if (eq->avail < 8) { - can_reclaim = reclaimable(eq); - eq->cidx += can_reclaim; - eq->avail += can_reclaim; - if (__predict_false(eq->cidx >= eq->cap)) - eq->cidx -= eq->cap; - } - } + return (n); +} + +static inline void +set_mbuf_len16(struct mbuf *m, uint8_t len16) +{ - if (eq->pending) - ring_eq_db(sc, eq); + M_ASSERTPKTHDR(m); + m->m_pkthdr.PH_loc.eight[0] = len16; +} + +static inline int +needs_tso(struct mbuf *m) +{ - if (wr != NULL) { - eq->flags |= EQ_STALLED; - if (callout_pending(&eq->tx_callout) == 0) - callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); + M_ASSERTPKTHDR(m); + + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + KASSERT(m->m_pkthdr.tso_segsz > 0, + ("%s: TSO requested in mbuf %p but MSS not provided", + __func__, m)); + return (1); } + + return (0); } -/* Per-packet 
header in a coalesced tx WR, before the SGL starts (in flits) */ -#define TXPKTS_PKT_HDR ((\ - sizeof(struct ulp_txpkt) + \ - sizeof(struct ulptx_idata) + \ - sizeof(struct cpl_tx_pkt_core) \ - ) / 8) - -/* Header of a coalesced tx WR, before SGL of first packet (in flits) */ -#define TXPKTS_WR_HDR (\ - sizeof(struct fw_eth_tx_pkts_wr) / 8 + \ - TXPKTS_PKT_HDR) - -/* Header of a tx WR, before SGL of first packet (in flits) */ -#define TXPKT_WR_HDR ((\ - sizeof(struct fw_eth_tx_pkt_wr) + \ - sizeof(struct cpl_tx_pkt_core) \ - ) / 8 ) - -/* Header of a tx LSO WR, before SGL of first packet (in flits) */ -#define TXPKT_LSO_WR_HDR ((\ - sizeof(struct fw_eth_tx_pkt_wr) + \ - sizeof(struct cpl_tx_pkt_lso_core) + \ - sizeof(struct cpl_tx_pkt_core) \ - ) / 8 ) +static inline int +needs_l3_csum(struct mbuf *m) +{ + + M_ASSERTPKTHDR(m); -int -t4_eth_tx(struct ifnet *ifp, struct sge_txq *txq, struct mbuf *m) + if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) + return (1); + return (0); +} + +static inline int +needs_l4_csum(struct mbuf *m) { - struct port_info *pi = (void *)ifp->if_softc; - struct adapter *sc = pi->adapter; - struct sge_eq *eq = &txq->eq; - struct buf_ring *br = txq->br; - struct mbuf *next; - int rc, coalescing, can_reclaim; - struct txpkts txpkts; - struct sgl sgl; - TXQ_LOCK_ASSERT_OWNED(txq); - KASSERT(m, ("%s: called with nothing to do.", __func__)); - KASSERT((eq->flags & EQ_TYPEMASK) == EQ_ETH, - ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); + M_ASSERTPKTHDR(m); - prefetch(&eq->desc[eq->pidx]); - prefetch(&txq->sdesc[eq->pidx]); + if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | + CSUM_TCP_IPV6 | CSUM_TSO)) + return (1); + return (0); +} - txpkts.npkt = 0;/* indicates there's nothing in txpkts */ - coalescing = 0; +static inline int +needs_vlan_insertion(struct mbuf *m) +{ - can_reclaim = reclaimable(eq); - if (__predict_false(eq->flags & EQ_STALLED)) { - if (eq->avail + can_reclaim < tx_resume_threshold(eq)) { - txq->m = m; - return (0); - } - eq->flags &= ~EQ_STALLED; - eq->unstalled++; - } + M_ASSERTPKTHDR(m); - if (__predict_false(eq->flags & EQ_DOOMED)) { - m_freem(m); - while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) - m_freem(m); - return (ENETDOWN); + if (m->m_flags & M_VLANTAG) { + KASSERT(m->m_pkthdr.ether_vtag != 0, + ("%s: HWVLAN requested in mbuf %p but tag not provided", + __func__, m)); + return (1); } + return (0); +} - if (eq->avail < 8 && can_reclaim) - reclaim_tx_descs(txq, can_reclaim, 32); +static void * +m_advance(struct mbuf **pm, int *poffset, int len) +{ + struct mbuf *m = *pm; + int offset = *poffset; + uintptr_t p = 0; - for (; m; m = next ? 
next : drbr_dequeue(ifp, br)) { + MPASS(len > 0); - if (eq->avail < 8) + while (len) { + if (offset + len < m->m_len) { + offset += len; + p = mtod(m, uintptr_t) + offset; break; + } + len -= m->m_len - offset; + m = m->m_next; + offset = 0; + MPASS(m != NULL); + } + *poffset = offset; + *pm = m; + return ((void *)p); +} - next = m->m_nextpkt; - m->m_nextpkt = NULL; +static inline int +same_paddr(char *a, char *b) +{ - if (next || buf_ring_peek(br)) - coalescing = 1; + if (a == b) + return (1); + else if (a != NULL && b != NULL) { + vm_offset_t x = (vm_offset_t)a; + vm_offset_t y = (vm_offset_t)b; - rc = get_pkt_sgl(txq, &m, &sgl, coalescing); - if (rc != 0) { - if (rc == ENOMEM) { + if ((x & PAGE_MASK) == (y & PAGE_MASK) && + pmap_kextract(x) == pmap_kextract(y)) + return (1); + } - /* Short of resources, suspend tx */ + return (0); +} - m->m_nextpkt = next; - break; - } +/* + * Can deal with empty mbufs in the chain that have m_len = 0, but the chain + * must have at least one mbuf that's not empty. + */ +static inline int +count_mbuf_nsegs(struct mbuf *m) +{ + char *prev_end, *start; + int len, nsegs; - /* - * Unrecoverable error for this packet, throw it away - * and move on to the next. get_pkt_sgl may already - * have freed m (it will be NULL in that case and the - * m_freem here is still safe). - */ + MPASS(m != NULL); - m_freem(m); + nsegs = 0; + prev_end = NULL; + for (; m; m = m->m_next) { + + len = m->m_len; + if (__predict_false(len == 0)) continue; - } + start = mtod(m, char *); - if (coalescing && - add_to_txpkts(pi, txq, &txpkts, m, &sgl) == 0) { + nsegs += sglist_count(start, len); + if (same_paddr(prev_end, start)) + nsegs--; + prev_end = start + len; + } - /* Successfully absorbed into txpkts */ + MPASS(nsegs > 0); + return (nsegs); +} - write_ulp_cpl_sgl(pi, txq, &txpkts, m, &sgl); - goto doorbell; +/* + * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: + * a) caller can assume it's been freed if this function returns with an error. + * b) it may get defragged up if the gather list is too long for the hardware. + */ +int +parse_pkt(struct mbuf **mp) +{ + struct mbuf *m0 = *mp, *m; + int rc, nsegs, defragged = 0, offset; + struct ether_header *eh; + void *l3hdr; +#if defined(INET) || defined(INET6) + struct tcphdr *tcp; +#endif + uint16_t eh_type; + + M_ASSERTPKTHDR(m0); + if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { + rc = EINVAL; +fail: + m_freem(m0); + *mp = NULL; + return (rc); + } +restart: + /* + * First count the number of gather list segments in the payload. + * Defrag the mbuf if nsegs exceeds the hardware limit. + */ + M_ASSERTPKTHDR(m0); + MPASS(m0->m_pkthdr.len > 0); + nsegs = count_mbuf_nsegs(m0); + if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { + if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { + rc = EFBIG; + goto fail; } + *mp = m0 = m; /* update caller's copy after defrag */ + goto restart; + } - /* - * We weren't coalescing to begin with, or current frame could - * not be coalesced (add_to_txpkts flushes txpkts if a frame - * given to it can't be coalesced). Either way there should be - * nothing in txpkts. - */ - KASSERT(txpkts.npkt == 0, - ("%s: txpkts not empty: %d", __func__, txpkts.npkt)); + if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { + m0 = m_pullup(m0, m0->m_pkthdr.len); + if (m0 == NULL) { + /* Should have left well enough alone. 
*/ + rc = EFBIG; + goto fail; + } + *mp = m0; /* update caller's copy after pullup */ + goto restart; + } + set_mbuf_nsegs(m0, nsegs); + set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); - /* We're sending out individual packets now */ - coalescing = 0; + if (!needs_tso(m0)) + return (0); - if (eq->avail < 8) - reclaim_tx_descs(txq, 0, 8); - rc = write_txpkt_wr(pi, txq, m, &sgl); - if (rc != 0) { + m = m0; + eh = mtod(m, struct ether_header *); + eh_type = ntohs(eh->ether_type); + if (eh_type == ETHERTYPE_VLAN) { + struct ether_vlan_header *evh = (void *)eh; - /* Short of hardware descriptors, suspend tx */ + eh_type = ntohs(evh->evl_proto); + m0->m_pkthdr.l2hlen = sizeof(*evh); + } else + m0->m_pkthdr.l2hlen = sizeof(*eh); - /* - * This is an unlikely but expensive failure. We've - * done all the hard work (DMA mappings etc.) and now we - * can't send out the packet. What's worse, we have to - * spend even more time freeing up everything in sgl. - */ - txq->no_desc++; - free_pkt_sgl(txq, &sgl); + offset = 0; + l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); - m->m_nextpkt = next; - break; - } + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6 = l3hdr; - ETHER_BPF_MTAP(ifp, m); - if (sgl.nsegs == 0) - m_freem(m); -doorbell: - if (eq->pending >= 8) - ring_eq_db(sc, eq); + MPASS(ip6->ip6_nxt == IPPROTO_TCP); - can_reclaim = reclaimable(eq); - if (can_reclaim >= 32) - reclaim_tx_descs(txq, can_reclaim, 64); + m0->m_pkthdr.l3hlen = sizeof(*ip6); + break; } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip = l3hdr; - if (txpkts.npkt > 0) - write_txpkts_wr(txq, &txpkts); + m0->m_pkthdr.l3hlen = ip->ip_hl * 4; + break; + } +#endif + default: + panic("%s: ethertype 0x%04x unknown. if_cxgbe must be compiled" + " with the same INET/INET6 options as the kernel.", + __func__, eh_type); + } - /* - * m not NULL means there was an error but we haven't thrown it away. - * This can happen when we're short of tx descriptors (no_desc) or maybe - * even DMA maps (no_dmamap). Either way, a credit flush and reclaim - * will get things going again. - */ - if (m && !(eq->flags & EQ_CRFLUSHED)) { - struct tx_sdesc *txsd = &txq->sdesc[eq->pidx]; +#if defined(INET) || defined(INET6) + tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); + m0->m_pkthdr.l4hlen = tcp->th_off * 4; +#endif + MPASS(m0 == *mp); + return (0); +} - /* - * If EQ_CRFLUSHED is not set then we know we have at least one - * available descriptor because any WR that reduces eq->avail to - * 0 also sets EQ_CRFLUSHED. 
- */ - KASSERT(eq->avail > 0, ("%s: no space for eqflush.", __func__)); +void * +start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) +{ + struct sge_eq *eq = &wrq->eq; + struct adapter *sc = wrq->adapter; + int ndesc, available; + struct wrqe *wr; + void *w; - txsd->desc_used = 1; - txsd->credits = 0; - write_eqflush_wr(eq); - } - txq->m = m; + MPASS(len16 > 0); + ndesc = howmany(len16, EQ_ESIZE / 16); + MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); - if (eq->pending) - ring_eq_db(sc, eq); + EQ_LOCK(eq); - reclaim_tx_descs(txq, 0, 128); + if (!STAILQ_EMPTY(&wrq->wr_list)) + drain_wrq_wr_list(sc, wrq); - if (eq->flags & EQ_STALLED && callout_pending(&eq->tx_callout) == 0) - callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); + if (!STAILQ_EMPTY(&wrq->wr_list)) { +slowpath: + EQ_UNLOCK(eq); + wr = alloc_wrqe(len16 * 16, wrq); + if (__predict_false(wr == NULL)) + return (NULL); + cookie->pidx = -1; + cookie->ndesc = ndesc; + return (&wr->wr); + } - return (0); + eq->cidx = read_hw_cidx(eq); + if (eq->pidx == eq->cidx) + available = eq->sidx - 1; + else + available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; + if (available < ndesc) + goto slowpath; + + cookie->pidx = eq->pidx; + cookie->ndesc = ndesc; + TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); + + w = &eq->desc[eq->pidx]; + IDXINCR(eq->pidx, ndesc, eq->sidx); + if (__predict_false(eq->pidx < ndesc - 1)) { + w = &wrq->ss[0]; + wrq->ss_pidx = cookie->pidx; + wrq->ss_len = len16 * 16; + } + + EQ_UNLOCK(eq); + + return (w); } void -t4_update_fl_bufsize(struct ifnet *ifp) +commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { - struct port_info *pi = ifp->if_softc; - struct adapter *sc = pi->adapter; - struct sge_rxq *rxq; -#ifdef TCP_OFFLOAD - struct sge_ofld_rxq *ofld_rxq; -#endif - struct sge_fl *fl; - int i, maxp, mtu = ifp->if_mtu; + struct sge_eq *eq = &wrq->eq; + struct adapter *sc = wrq->adapter; + int ndesc, pidx; + struct wrq_cookie *prev, *next; - maxp = mtu_to_max_payload(sc, mtu, 0); - for_each_rxq(pi, i, rxq) { - fl = &rxq->fl; + if (cookie->pidx == -1) { + struct wrqe *wr = __containerof(w, struct wrqe, wr); - FL_LOCK(fl); - find_best_refill_source(sc, fl, maxp); - FL_UNLOCK(fl); + t4_wrq_tx(sc, wr); + return; } -#ifdef TCP_OFFLOAD - maxp = mtu_to_max_payload(sc, mtu, 1); - for_each_ofld_rxq(pi, i, ofld_rxq) { - fl = &ofld_rxq->fl; - FL_LOCK(fl); - find_best_refill_source(sc, fl, maxp); - FL_UNLOCK(fl); + ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ + pidx = cookie->pidx; + MPASS(pidx >= 0 && pidx < eq->sidx); + if (__predict_false(w == &wrq->ss[0])) { + int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; + + MPASS(wrq->ss_len > n); /* WR had better wrap around. 
*/ + bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); + bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); + wrq->tx_wrs_ss++; + } else + wrq->tx_wrs_direct++; + + EQ_LOCK(eq); + prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); + next = TAILQ_NEXT(cookie, link); + if (prev == NULL) { + MPASS(pidx == eq->dbidx); + if (next == NULL || ndesc >= 16) + ring_eq_db(wrq->adapter, eq, ndesc); + else { + MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); + next->pidx = pidx; + next->ndesc += ndesc; + } + } else { + MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); + prev->ndesc += ndesc; + } + TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); + + if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) + drain_wrq_wr_list(sc, wrq); + +#ifdef INVARIANTS + if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { + /* Doorbell must have caught up to the pidx. */ + MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif + EQ_UNLOCK(eq); } -int -can_resume_tx(struct sge_eq *eq) +static u_int +can_resume_eth_tx(struct mp_ring *r) +{ + struct sge_eq *eq = r->cookie; + + return (total_available_tx_desc(eq) > eq->sidx / 8); +} + +static inline int +cannot_use_txpkts(struct mbuf *m) +{ + /* maybe put a GL limit too, to avoid silliness? */ + + return (needs_tso(m)); +} + +/* + * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to + * be consumed. Return the actual number consumed. 0 indicates a stall. + */ +static u_int +eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) { + struct sge_txq *txq = r->cookie; + struct sge_eq *eq = &txq->eq; + struct ifnet *ifp = txq->ifp; + struct port_info *pi = (void *)ifp->if_softc; + struct adapter *sc = pi->adapter; + u_int total, remaining; /* # of packets */ + u_int available, dbdiff; /* # of hardware descriptors */ + u_int n, next_cidx; + struct mbuf *m0, *tail; + struct txpkts txp; + struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ + + remaining = IDXDIFF(pidx, cidx, r->size); + MPASS(remaining > 0); /* Must not be called without work to do. */ + total = 0; + + TXQ_LOCK(txq); + if (__predict_false((eq->flags & EQ_ENABLED) == 0)) { + while (cidx != pidx) { + m0 = r->items[cidx]; + m_freem(m0); + if (++cidx == r->size) + cidx = 0; + } + reclaim_tx_descs(txq, 2048); + total = remaining; + goto done; + } + + /* How many hardware descriptors do we have readily available. */ + if (eq->pidx == eq->cidx) + available = eq->sidx - 1; + else + available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; + dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); + + while (remaining > 0) { + + m0 = r->items[cidx]; + M_ASSERTPKTHDR(m0); + MPASS(m0->m_nextpkt == NULL); + + if (available < SGE_MAX_WR_NDESC) { + available += reclaim_tx_descs(txq, 64); + if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) + break; /* out of descriptors */ + } - return (eq->avail + reclaimable(eq) >= tx_resume_threshold(eq)); + next_cidx = cidx + 1; + if (__predict_false(next_cidx == r->size)) + next_cidx = 0; + + wr = (void *)&eq->desc[eq->pidx]; + if (remaining > 1 && + try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { + + /* pkts at cidx, next_cidx should both be in txp. 
*/ + MPASS(txp.npkt == 2); + tail = r->items[next_cidx]; + MPASS(tail->m_nextpkt == NULL); + ETHER_BPF_MTAP(ifp, m0); + ETHER_BPF_MTAP(ifp, tail); + m0->m_nextpkt = tail; + + if (__predict_false(++next_cidx == r->size)) + next_cidx = 0; + + while (next_cidx != pidx) { + if (add_to_txpkts(r->items[next_cidx], &txp, + available) != 0) + break; + tail->m_nextpkt = r->items[next_cidx]; + tail = tail->m_nextpkt; + ETHER_BPF_MTAP(ifp, tail); + if (__predict_false(++next_cidx == r->size)) + next_cidx = 0; + } + + n = write_txpkts_wr(txq, wr, m0, &txp, available); + total += txp.npkt; + remaining -= txp.npkt; + } else { + total++; + remaining--; + n = write_txpkt_wr(txq, (void *)wr, m0, available); + ETHER_BPF_MTAP(ifp, m0); + } + MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC); + + available -= n; + dbdiff += n; + IDXINCR(eq->pidx, n, eq->sidx); + + if (total_available_tx_desc(eq) < eq->sidx / 4 && + atomic_cmpset_int(&eq->equiq, 0, 1)) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | + F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { + wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); + eq->equeqidx = eq->pidx; + } + + if (dbdiff >= 16 && remaining >= 4) { + ring_eq_db(sc, eq, dbdiff); + available += reclaim_tx_descs(txq, 4 * dbdiff); + dbdiff = 0; + } + + cidx = next_cidx; + } + if (dbdiff != 0) { + ring_eq_db(sc, eq, dbdiff); + reclaim_tx_descs(txq, 32); + } +done: + TXQ_UNLOCK(txq); + + return (total); } static inline void @@ -2155,11 +2511,8 @@ init_eq(struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; - eq->qsize = qsize; + eq->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); - - TASK_INIT(&eq->tx_task, 0, t4_tx_task, eq); - callout_init(&eq->tx_callout, CALLOUT_MPSAFE); } static int @@ -2848,6 +3201,7 @@ ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; + int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); @@ -2856,17 +3210,16 @@ ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); - c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); /* XXX */ + c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = - htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | + htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | - V_FW_EQ_CTRL_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_CTRL_CMD_EQSIZE(eq->qsize)); + V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); @@ -2892,6 +3245,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; + int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); @@ -2900,15 +3254,15 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); - c.autoequiqe_to_viid = htobe32(V_FW_EQ_ETH_CMD_VIID(pi->viid)); + c.autoequiqe_to_viid = 
htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | + F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(pi->viid)); c.fetchszm_to_iqid = - htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | + htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | - V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | - V_FW_EQ_ETH_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_ETH_CMD_EQSIZE(eq->qsize)); + V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | + V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); @@ -2935,6 +3289,7 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; + int qsize = eq->sidx + spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); @@ -2944,14 +3299,13 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = - htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | + htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | - V_FW_EQ_OFLD_CMD_CIDXFTHRESH(X_CIDXFLUSHTHRESH_32) | - V_FW_EQ_OFLD_CMD_EQSIZE(eq->qsize)); + V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); @@ -2976,21 +3330,20 @@ ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) static int alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { - int rc; + int rc, qsize; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); - len = eq->qsize * EQ_ESIZE; + qsize = eq->sidx + spg_len / EQ_ESIZE; + len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); - eq->cap = eq->qsize - spg_len / EQ_ESIZE; - eq->spg = (void *)&eq->desc[eq->cap]; - eq->avail = eq->cap - 1; /* one less to avoid cidx = pidx */ eq->pidx = eq->cidx = 0; + eq->equeqidx = eq->dbidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { @@ -3018,8 +3371,6 @@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) eq->flags & EQ_TYPEMASK, rc); } - eq->tx_callout.c_cpu = eq->cntxt_id % mp_ncpus; - if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { @@ -3101,7 +3452,11 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, return (rc); wrq->adapter = sc; + TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); + TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); + wrq->nwr_pending = 0; + wrq->ndesc_needed = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); @@ -3111,13 +3466,10 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); - SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs", CTLFLAG_RD, - &wrq->tx_wrs, "# of work requests"); - SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD, - 
&wrq->no_desc, 0, - "# of times queue ran out of hardware descriptors"); - SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD, - &wrq->eq.unstalled, 0, "# of times queue recovered after stall"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, + &wrq->tx_wrs_direct, "# of work requests (direct)"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, + &wrq->tx_wrs_copied, "# of work requests (copied)"); return (rc); } @@ -3145,37 +3497,30 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx, char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); - rc = alloc_eq(sc, pi, eq); - if (rc) - return (rc); - - txq->ifp = pi->ifp; - - txq->sdesc = malloc(eq->cap * sizeof(struct tx_sdesc), M_CXGBE, - M_ZERO | M_WAITOK); - txq->br = buf_ring_alloc(eq->qsize, M_CXGBE, M_WAITOK, &eq->eq_lock); - - rc = bus_dma_tag_create(sc->dmat, 1, 0, BUS_SPACE_MAXADDR, - BUS_SPACE_MAXADDR, NULL, NULL, 64 * 1024, TX_SGL_SEGS, - BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &txq->tx_tag); + rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, + M_CXGBE, M_WAITOK); if (rc != 0) { - device_printf(sc->dev, - "failed to create tx DMA tag: %d\n", rc); + device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); } - /* - * We can stuff ~10 frames in an 8-descriptor txpkts WR (8 is the SGE - * limit for any WR). txq->no_dmamap events shouldn't occur if maps is - * sized for the worst case. - */ - rc = t4_alloc_tx_maps(&txq->txmaps, txq->tx_tag, eq->qsize * 10 / 8, - M_WAITOK); + rc = alloc_eq(sc, pi, eq); if (rc != 0) { - device_printf(sc->dev, "failed to setup tx DMA maps: %d\n", rc); + mp_ring_free(txq->r); + txq->r = NULL; return (rc); } + /* Can't fail after this point. 
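+	 * Everything below is either an M_WAITOK allocation or plain
+	 * initialization, so no unwind path is needed once the mp_ring and
+	 * the hardware eq have been set up.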
*/ + + TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); + txq->ifp = pi->ifp; + txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); + txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | + V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf)); + txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, + M_ZERO | M_WAITOK); + snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); @@ -3203,23 +3548,39 @@ alloc_txq(struct port_info *pi, struct sge_txq *txq, int idx, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); - SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_wrs", CTLFLAG_RD, - &txq->txpkts_wrs, "# of txpkts work requests (multiple pkts/WR)"); - SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts_pkts", CTLFLAG_RD, - &txq->txpkts_pkts, "# of frames tx'd using txpkts work requests"); - - SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "br_drops", CTLFLAG_RD, - &txq->br->br_drops, "# of drops in the buf_ring for this queue"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_dmamap", CTLFLAG_RD, - &txq->no_dmamap, 0, "# of times txq ran out of DMA maps"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "no_desc", CTLFLAG_RD, - &txq->no_desc, 0, "# of times txq ran out of hardware descriptors"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "egr_update", CTLFLAG_RD, - &eq->egr_update, 0, "egress update notifications from the SGE"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "unstalled", CTLFLAG_RD, - &eq->unstalled, 0, "# of times txq recovered after stall"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_wrs", + CTLFLAG_RD, &txq->txpkts0_wrs, + "# of txpkts (type 0) work requests"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_wrs", + CTLFLAG_RD, &txq->txpkts1_wrs, + "# of txpkts (type 1) work requests"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts0_pkts", + CTLFLAG_RD, &txq->txpkts0_pkts, + "# of frames tx'd using type0 txpkts work requests"); + SYSCTL_ADD_UQUAD(&pi->ctx, children, OID_AUTO, "txpkts1_pkts", + CTLFLAG_RD, &txq->txpkts1_pkts, + "# of frames tx'd using type1 txpkts work requests"); + + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_enqueues", + CTLFLAG_RD, &txq->r->enqueues, + "# of enqueues to the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_drops", + CTLFLAG_RD, &txq->r->drops, + "# of drops in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_starts", + CTLFLAG_RD, &txq->r->starts, + "# of normal consumer starts in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_stalls", + CTLFLAG_RD, &txq->r->stalls, + "# of consumer stalls in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_restarts", + CTLFLAG_RD, &txq->r->restarts, + "# of consumer restarts in the mp_ring for this queue"); + SYSCTL_ADD_COUNTER_U64(&pi->ctx, children, OID_AUTO, "r_abdications", + CTLFLAG_RD, &txq->r->abdications, + "# of consumer abdications in the mp_ring for this queue"); - return (rc); + return (0); } static int @@ -3233,15 +3594,9 @@ free_txq(struct port_info *pi, struct sge_txq *txq) if (rc) return (rc); + sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); - - if (txq->txmaps.maps) - t4_free_tx_maps(&txq->txmaps, txq->tx_tag); - - buf_ring_free(txq->br, M_CXGBE); - - if 
(txq->tx_tag) - bus_dma_tag_destroy(txq->tx_tag); + mp_ring_free(txq->r); bzero(txq, sizeof(*txq)); return (0); @@ -3466,293 +3821,159 @@ free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) fl->sdesc = NULL; } -int -t4_alloc_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag, int count, - int flags) +static inline void +get_pkt_gl(struct mbuf *m, struct sglist *gl) { - struct tx_map *txm; - int i, rc; - - txmaps->map_total = txmaps->map_avail = count; - txmaps->map_cidx = txmaps->map_pidx = 0; - - txmaps->maps = malloc(count * sizeof(struct tx_map), M_CXGBE, - M_ZERO | flags); + int rc; - txm = txmaps->maps; - for (i = 0; i < count; i++, txm++) { - rc = bus_dmamap_create(tx_tag, 0, &txm->map); - if (rc != 0) - goto failed; - } + M_ASSERTPKTHDR(m); - return (0); -failed: - while (--i >= 0) { - txm--; - bus_dmamap_destroy(tx_tag, txm->map); + sglist_reset(gl); + rc = sglist_append_mbuf(gl, m); + if (__predict_false(rc != 0)) { + panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " + "with %d.", __func__, m, mbuf_nsegs(m), rc); } - KASSERT(txm == txmaps->maps, ("%s: EDOOFUS", __func__)); - free(txmaps->maps, M_CXGBE); - txmaps->maps = NULL; - - return (rc); + KASSERT(gl->sg_nseg == mbuf_nsegs(m), + ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, + mbuf_nsegs(m), gl->sg_nseg)); + KASSERT(gl->sg_nseg > 0 && + gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), + ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, + gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); } -void -t4_free_tx_maps(struct tx_maps *txmaps, bus_dma_tag_t tx_tag) +/* + * len16 for a txpkt WR with a GL. Includes the firmware work request header. + */ +static inline u_int +txpkt_len16(u_int nsegs, u_int tso) { - struct tx_map *txm; - int i; + u_int n; - txm = txmaps->maps; - for (i = 0; i < txmaps->map_total; i++, txm++) { - - if (txm->m) { - bus_dmamap_unload(tx_tag, txm->map); - m_freem(txm->m); - txm->m = NULL; - } + MPASS(nsegs > 0); - bus_dmamap_destroy(tx_tag, txm->map); - } + nsegs--; /* first segment is part of ulptx_sgl */ + n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); + if (tso) + n += sizeof(struct cpl_tx_pkt_lso_core); - free(txmaps->maps, M_CXGBE); - txmaps->maps = NULL; + return (howmany(n, 16)); } /* - * We'll do immediate data tx for non-TSO, but only when not coalescing. We're - * willing to use upto 2 hardware descriptors which means a maximum of 96 bytes - * of immediate data. - */ -#define IMM_LEN ( \ - 2 * EQ_ESIZE \ - - sizeof(struct fw_eth_tx_pkt_wr) \ - - sizeof(struct cpl_tx_pkt_core)) - -/* - * Returns non-zero on failure, no need to cleanup anything in that case. - * - * Note 1: We always try to defrag the mbuf if required and return EFBIG only - * if the resulting chain still won't fit in a tx descriptor. - * - * Note 2: We'll pullup the mbuf chain if TSO is requested and the first mbuf - * does not have the TCP header in it. + * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work + * request header. 
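+ * For example, assuming the usual t4 header sizes (8-byte ulp_txpkt and
+ * ulptx_idata, 16-byte cpl_tx_pkt_core and ulptx_sgl), a 2-segment packet
+ * works out to 8 + 8 + 16 + 16 + 8 * ((3 * 1) / 2 + 1) = 64 bytes,
+ * i.e. txpkts0_len16(2) == 4.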
*/ -static int -get_pkt_sgl(struct sge_txq *txq, struct mbuf **fp, struct sgl *sgl, - int sgl_only) +static inline u_int +txpkts0_len16(u_int nsegs) { - struct mbuf *m = *fp; - struct tx_maps *txmaps; - struct tx_map *txm; - int rc, defragged = 0, n; - - TXQ_LOCK_ASSERT_OWNED(txq); - - if (m->m_pkthdr.tso_segsz) - sgl_only = 1; /* Do not allow immediate data with LSO */ + u_int n; -start: sgl->nsegs = 0; + MPASS(nsegs > 0); - if (m->m_pkthdr.len <= IMM_LEN && !sgl_only) - return (0); /* nsegs = 0 tells caller to use imm. tx */ - - txmaps = &txq->txmaps; - if (txmaps->map_avail == 0) { - txq->no_dmamap++; - return (ENOMEM); - } - txm = &txmaps->maps[txmaps->map_pidx]; + nsegs--; /* first segment is part of ulptx_sgl */ + n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); - if (m->m_pkthdr.tso_segsz && m->m_len < 50) { - *fp = m_pullup(m, 50); - m = *fp; - if (m == NULL) - return (ENOBUFS); - } - - rc = bus_dmamap_load_mbuf_sg(txq->tx_tag, txm->map, m, sgl->seg, - &sgl->nsegs, BUS_DMA_NOWAIT); - if (rc == EFBIG && defragged == 0) { - m = m_defrag(m, M_NOWAIT); - if (m == NULL) - return (EFBIG); - - defragged = 1; - *fp = m; - goto start; - } - if (rc != 0) - return (rc); - - txm->m = m; - txmaps->map_avail--; - if (++txmaps->map_pidx == txmaps->map_total) - txmaps->map_pidx = 0; - - KASSERT(sgl->nsegs > 0 && sgl->nsegs <= TX_SGL_SEGS, - ("%s: bad DMA mapping (%d segments)", __func__, sgl->nsegs)); - - /* - * Store the # of flits required to hold this frame's SGL in nflits. An - * SGL has a (ULPTX header + len0, addr0) tuple optionally followed by - * multiple (len0 + len1, addr0, addr1) tuples. If addr1 is not used - * then len1 must be set to 0. - */ - n = sgl->nsegs - 1; - sgl->nflits = (3 * n) / 2 + (n & 1) + 2; - - return (0); + return (howmany(n, 16)); } - /* - * Releases all the txq resources used up in the specified sgl. + * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work + * request header. */ -static int -free_pkt_sgl(struct sge_txq *txq, struct sgl *sgl) +static inline u_int +txpkts1_len16(void) { - struct tx_maps *txmaps; - struct tx_map *txm; + u_int n; - TXQ_LOCK_ASSERT_OWNED(txq); - - if (sgl->nsegs == 0) - return (0); /* didn't use any map */ + n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); - txmaps = &txq->txmaps; - - /* 1 pkt uses exactly 1 map, back it out */ + return (howmany(n, 16)); +} - txmaps->map_avail++; - if (txmaps->map_pidx > 0) - txmaps->map_pidx--; - else - txmaps->map_pidx = txmaps->map_total - 1; +static inline u_int +imm_payload(u_int ndesc) +{ + u_int n; - txm = &txmaps->maps[txmaps->map_pidx]; - bus_dmamap_unload(txq->tx_tag, txm->map); - txm->m = NULL; + n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - + sizeof(struct cpl_tx_pkt_core); - return (0); + return (n); } -static int -write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, - struct sgl *sgl) +/* + * Write a txpkt WR for this packet to the hardware descriptors, update the + * software descriptor, and advance the pidx. It is guaranteed that enough + * descriptors are available. + * + * The return value is the # of hardware descriptors used. 
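+ *
+ * Small frames go out as immediate data: with the 16-byte
+ * fw_eth_tx_pkt_wr and 16-byte cpl_tx_pkt_core, imm_payload(2) is
+ * 2 * EQ_ESIZE - 32 = 96 bytes, the same limit as the old IMM_LEN.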
+ */ +static u_int +write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr, + struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; - struct fw_eth_tx_pkt_wr *wr; + struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; - int nflits, ndesc, pktlen; - struct tx_sdesc *txsd; + int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); + M_ASSERTPKTHDR(m0); + MPASS(available > 0 && available < eq->sidx); - pktlen = m->m_pkthdr.len; - - /* - * Do we have enough flits to send this frame out? - */ + len16 = mbuf_len16(m0); + nsegs = mbuf_nsegs(m0); + pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); - if (m->m_pkthdr.tso_segsz) { - nflits = TXPKT_LSO_WR_HDR; + if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); - } else - nflits = TXPKT_WR_HDR; - if (sgl->nsegs > 0) - nflits += sgl->nflits; - else { - nflits += howmany(pktlen, 8); + else if (pktlen <= imm_payload(2) && available >= 2) { + /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; + len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); + nsegs = 0; } - ndesc = howmany(nflits, 8); - if (ndesc > eq->avail) - return (ENOMEM); + ndesc = howmany(len16, EQ_ESIZE / 16); + MPASS(ndesc <= available); /* Firmware work request header */ - wr = (void *)&eq->desc[eq->pidx]; + MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); - ctrl = V_FW_WR_LEN16(howmany(nflits, 2)); - if (eq->avail == ndesc) { - if (!(eq->flags & EQ_CRFLUSHED)) { - ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; - eq->flags |= EQ_CRFLUSHED; - } - eq->flags |= EQ_STALLED; - } + ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; - if (m->m_pkthdr.tso_segsz) { + if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); - struct ether_header *eh; - void *l3hdr; -#if defined(INET) || defined(INET6) - struct tcphdr *tcp; -#endif - uint16_t eh_type; - - ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | - F_LSO_LAST_SLICE; - eh = mtod(m, struct ether_header *); - eh_type = ntohs(eh->ether_type); - if (eh_type == ETHERTYPE_VLAN) { - struct ether_vlan_header *evh = (void *)eh; + KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && + m0->m_pkthdr.l4hlen > 0, + ("%s: mbuf %p needs TSO but missing header lengths", + __func__, m0)); + ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | + F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) + | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); + if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); - l3hdr = evh + 1; - eh_type = ntohs(evh->evl_proto); - } else - l3hdr = eh + 1; - - switch (eh_type) { -#ifdef INET6 - case ETHERTYPE_IPV6: - { - struct ip6_hdr *ip6 = l3hdr; - - /* - * XXX-BZ For now we do not pretend to support - * IPv6 extension headers. 
- */ - KASSERT(ip6->ip6_nxt == IPPROTO_TCP, ("%s: CSUM_TSO " - "with ip6_nxt != TCP: %u", __func__, ip6->ip6_nxt)); - tcp = (struct tcphdr *)(ip6 + 1); + if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; - ctrl |= V_LSO_IPHDR_LEN(sizeof(*ip6) >> 2) | - V_LSO_TCPHDR_LEN(tcp->th_off); - break; - } -#endif -#ifdef INET - case ETHERTYPE_IP: - { - struct ip *ip = l3hdr; - - tcp = (void *)((uintptr_t)ip + ip->ip_hl * 4); - ctrl |= V_LSO_IPHDR_LEN(ip->ip_hl) | - V_LSO_TCPHDR_LEN(tcp->th_off); - break; - } -#endif - default: - panic("%s: CSUM_TSO but no supported IP version " - "(0x%04x)", __func__, eh_type); - } lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); - lso->mss = htobe16(m->m_pkthdr.tso_segsz); + lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); @@ -3764,48 +3985,36 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, /* Checksum offload */ ctrl1 = 0; - if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))) + if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; - if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | - CSUM_TCP_IPV6 | CSUM_TSO))) + if (needs_l4_csum(m0) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; - if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | + if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ - if (m->m_flags & M_VLANTAG) { - ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); + if (needs_vlan_insertion(m0)) { + ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ - cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | - V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); + cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); - /* Software descriptor */ - txsd = &txq->sdesc[eq->pidx]; - txsd->desc_used = ndesc; - - eq->pending += ndesc; - eq->avail -= ndesc; - eq->pidx += ndesc; - if (eq->pidx >= eq->cap) - eq->pidx -= eq->cap; - /* SGL */ dst = (void *)(cpl + 1); - if (sgl->nsegs > 0) { - txsd->credits = 1; + if (nsegs > 0) { + + write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; - write_sgl_to_txd(eq, sgl, &dst); } else { - txsd->credits = 0; - txq->imm_wrs++; - for (; m; m = m->m_next) { + struct mbuf *m; + + for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; @@ -3814,245 +4023,225 @@ write_txpkt_wr(struct port_info *pi, struct sge_txq *txq, struct mbuf *m, #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif - + txq->imm_wrs++; } txq->txpkt_wrs++; - return (0); + + txsd = &txq->sdesc[eq->pidx]; + txsd->m = m0; + txsd->desc_used = ndesc; + + return (ndesc); } -/* - * Returns 0 to indicate that m has been accepted into a coalesced tx work - * request. It has either been folded into txpkts or txpkts was flushed and m - * has started a new coalesced work request (as the first frame in a fresh - * txpkts). - * - * Returns non-zero to indicate a failure - caller is responsible for - * transmitting m, if there was anything in txpkts it has been flushed. 
- */ static int -add_to_txpkts(struct port_info *pi, struct sge_txq *txq, struct txpkts *txpkts, - struct mbuf *m, struct sgl *sgl) +try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) { - struct sge_eq *eq = &txq->eq; - int can_coalesce; - struct tx_sdesc *txsd; - int flits; - - TXQ_LOCK_ASSERT_OWNED(txq); + u_int needed, nsegs1, nsegs2, l1, l2; - KASSERT(sgl->nsegs, ("%s: can't coalesce imm data", __func__)); + if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) + return (1); - if (txpkts->npkt > 0) { - flits = TXPKTS_PKT_HDR + sgl->nflits; - can_coalesce = m->m_pkthdr.tso_segsz == 0 && - txpkts->nflits + flits <= TX_WR_FLITS && - txpkts->nflits + flits <= eq->avail * 8 && - txpkts->plen + m->m_pkthdr.len < 65536; + nsegs1 = mbuf_nsegs(m); + nsegs2 = mbuf_nsegs(n); + if (nsegs1 + nsegs2 == 2) { + txp->wr_type = 1; + l1 = l2 = txpkts1_len16(); + } else { + txp->wr_type = 0; + l1 = txpkts0_len16(nsegs1); + l2 = txpkts0_len16(nsegs2); + } + txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; + needed = howmany(txp->len16, EQ_ESIZE / 16); + if (needed > SGE_MAX_WR_NDESC || needed > available) + return (1); - if (can_coalesce) { - txpkts->npkt++; - txpkts->nflits += flits; - txpkts->plen += m->m_pkthdr.len; + txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; + if (txp->plen > 65535) + return (1); - txsd = &txq->sdesc[eq->pidx]; - txsd->credits++; + txp->npkt = 2; + set_mbuf_len16(m, l1); + set_mbuf_len16(n, l2); - return (0); - } - - /* - * Couldn't coalesce m into txpkts. The first order of business - * is to send txpkts on its way. Then we'll revisit m. - */ - write_txpkts_wr(txq, txpkts); - } + return (0); +} - /* - * Check if we can start a new coalesced tx work request with m as - * the first packet in it. - */ +static int +add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) +{ + u_int plen, len16, needed, nsegs; - KASSERT(txpkts->npkt == 0, ("%s: txpkts not empty", __func__)); + MPASS(txp->wr_type == 0 || txp->wr_type == 1); - flits = TXPKTS_WR_HDR + sgl->nflits; - can_coalesce = m->m_pkthdr.tso_segsz == 0 && - flits <= eq->avail * 8 && flits <= TX_WR_FLITS; + nsegs = mbuf_nsegs(m); + if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1)) + return (1); - if (can_coalesce == 0) - return (EINVAL); + plen = txp->plen + m->m_pkthdr.len; + if (plen > 65535) + return (1); - /* - * Start a fresh coalesced tx WR with m as the first frame in it. - */ - txpkts->npkt = 1; - txpkts->nflits = flits; - txpkts->flitp = &eq->desc[eq->pidx].flit[2]; - txpkts->plen = m->m_pkthdr.len; + if (txp->wr_type == 0) + len16 = txpkts0_len16(nsegs); + else + len16 = txpkts1_len16(); + needed = howmany(txp->len16 + len16, EQ_ESIZE / 16); + if (needed > SGE_MAX_WR_NDESC || needed > available) + return (1); - txsd = &txq->sdesc[eq->pidx]; - txsd->credits = 1; + txp->npkt++; + txp->plen = plen; + txp->len16 += len16; + set_mbuf_len16(m, len16); return (0); } /* - * Note that write_txpkts_wr can never run out of hardware descriptors (but - * write_txpkt_wr can). add_to_txpkts ensures that a frame is accepted for - * coalescing only if sufficient hardware descriptors are available. + * Write a txpkts WR for the packets in txp to the hardware descriptors, update + * the software descriptor, and advance the pidx. It is guaranteed that enough + * descriptors are available. + * + * The return value is the # of hardware descriptors used. 
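+ *
+ * A type 0 WR carries a ulp_txpkt/ulptx_idata pair ahead of each frame's
+ * CPL and may mix multi-segment frames; a type 1 WR omits that overhead
+ * but requires exactly one DMA segment per frame, which is what
+ * try_txpkts() and add_to_txpkts() enforce.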
*/ -static void -write_txpkts_wr(struct sge_txq *txq, struct txpkts *txpkts) +static u_int +write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr, + struct mbuf *m0, const struct txpkts *txp, u_int available) { struct sge_eq *eq = &txq->eq; - struct fw_eth_tx_pkts_wr *wr; struct tx_sdesc *txsd; + struct cpl_tx_pkt_core *cpl; uint32_t ctrl; - int ndesc; + uint64_t ctrl1; + int ndesc, checkwrap; + struct mbuf *m; + void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); + MPASS(txp->npkt > 0); + MPASS(txp->plen < 65536); + MPASS(m0 != NULL); + MPASS(m0->m_nextpkt != NULL); + MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); + MPASS(available > 0 && available < eq->sidx); - ndesc = howmany(txpkts->nflits, 8); + ndesc = howmany(txp->len16, EQ_ESIZE / 16); + MPASS(ndesc <= available); - wr = (void *)&eq->desc[eq->pidx]; + MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); - ctrl = V_FW_WR_LEN16(howmany(txpkts->nflits, 2)); - if (eq->avail == ndesc) { - if (!(eq->flags & EQ_CRFLUSHED)) { - ctrl |= F_FW_WR_EQUEQ | F_FW_WR_EQUIQ; - eq->flags |= EQ_CRFLUSHED; - } - eq->flags |= EQ_STALLED; - } + ctrl = V_FW_WR_LEN16(txp->len16); wr->equiq_to_len16 = htobe32(ctrl); - wr->plen = htobe16(txpkts->plen); - wr->npkt = txpkts->npkt; - wr->r3 = wr->type = 0; - - /* Everything else already written */ - - txsd = &txq->sdesc[eq->pidx]; - txsd->desc_used = ndesc; - - KASSERT(eq->avail >= ndesc, ("%s: out of descriptors", __func__)); - - eq->pending += ndesc; - eq->avail -= ndesc; - eq->pidx += ndesc; - if (eq->pidx >= eq->cap) - eq->pidx -= eq->cap; + wr->plen = htobe16(txp->plen); + wr->npkt = txp->npkt; + wr->r3 = 0; + wr->type = txp->wr_type; + flitp = wr + 1; - txq->txpkts_pkts += txpkts->npkt; - txq->txpkts_wrs++; - txpkts->npkt = 0; /* emptied */ -} + /* + * At this point we are 16B into a hardware descriptor. If checkwrap is + * set then we know the WR is going to wrap around somewhere. We'll + * check for that at appropriate points. 
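+	 * (The same wrap test appears in write_txpkt_wr(), which passes
+	 * eq->sidx - ndesc < eq->pidx straight to write_gl_to_txd().)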
+ */ + checkwrap = eq->sidx - ndesc < eq->pidx; + for (m = m0; m != NULL; m = m->m_nextpkt) { + if (txp->wr_type == 0) { + struct ulp_txpkt *ulpmc; + struct ulptx_idata *ulpsc; + + /* ULP master command */ + ulpmc = flitp; + ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | + V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); + ulpmc->len = htobe32(mbuf_len16(m)); + + /* ULP subcommand */ + ulpsc = (void *)(ulpmc + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | + F_ULP_TX_SC_MORE); + ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); + + cpl = (void *)(ulpsc + 1); + if (checkwrap && + (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) + cpl = (void *)&eq->desc[0]; + txq->txpkts0_pkts += txp->npkt; + txq->txpkts0_wrs++; + } else { + cpl = flitp; + txq->txpkts1_pkts += txp->npkt; + txq->txpkts1_wrs++; + } -static inline void -write_ulp_cpl_sgl(struct port_info *pi, struct sge_txq *txq, - struct txpkts *txpkts, struct mbuf *m, struct sgl *sgl) -{ - struct ulp_txpkt *ulpmc; - struct ulptx_idata *ulpsc; - struct cpl_tx_pkt_core *cpl; - struct sge_eq *eq = &txq->eq; - uintptr_t flitp, start, end; - uint64_t ctrl; - caddr_t dst; + /* Checksum offload */ + ctrl1 = 0; + if (needs_l3_csum(m) == 0) + ctrl1 |= F_TXPKT_IPCSUM_DIS; + if (needs_l4_csum(m) == 0) + ctrl1 |= F_TXPKT_L4CSUM_DIS; + if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | + CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) + txq->txcsum++; /* some hardware assistance provided */ + + /* VLAN tag insertion */ + if (needs_vlan_insertion(m)) { + ctrl1 |= F_TXPKT_VLAN_VLD | + V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); + txq->vlan_insertion++; + } - KASSERT(txpkts->npkt > 0, ("%s: txpkts is empty", __func__)); + /* CPL header */ + cpl->ctrl0 = txq->cpl_ctrl0; + cpl->pack = 0; + cpl->len = htobe16(m->m_pkthdr.len); + cpl->ctrl1 = htobe64(ctrl1); - start = (uintptr_t)eq->desc; - end = (uintptr_t)eq->spg; + flitp = cpl + 1; + if (checkwrap && + (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) + flitp = (void *)&eq->desc[0]; - /* Checksum offload */ - ctrl = 0; - if (!(m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))) - ctrl |= F_TXPKT_IPCSUM_DIS; - if (!(m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | - CSUM_TCP_IPV6 | CSUM_TSO))) - ctrl |= F_TXPKT_L4CSUM_DIS; - if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | - CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) - txq->txcsum++; /* some hardware assistance provided */ + write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); - /* VLAN tag insertion */ - if (m->m_flags & M_VLANTAG) { - ctrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); - txq->vlan_insertion++; } - /* - * The previous packet's SGL must have ended at a 16 byte boundary (this - * is required by the firmware/hardware). It follows that flitp cannot - * wrap around between the ULPTX master command and ULPTX subcommand (8 - * bytes each), and that it can not wrap around in the middle of the - * cpl_tx_pkt_core either. 
- */ - flitp = (uintptr_t)txpkts->flitp; - KASSERT((flitp & 0xf) == 0, - ("%s: last SGL did not end at 16 byte boundary: %p", - __func__, txpkts->flitp)); - - /* ULP master command */ - ulpmc = (void *)flitp; - ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | - V_ULP_TXPKT_FID(eq->iqid)); - ulpmc->len = htonl(howmany(sizeof(*ulpmc) + sizeof(*ulpsc) + - sizeof(*cpl) + 8 * sgl->nflits, 16)); - - /* ULP subcommand */ - ulpsc = (void *)(ulpmc + 1); - ulpsc->cmd_more = htobe32(V_ULPTX_CMD((u32)ULP_TX_SC_IMM) | - F_ULP_TX_SC_MORE); - ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); - - flitp += sizeof(*ulpmc) + sizeof(*ulpsc); - if (flitp == end) - flitp = start; - - /* CPL_TX_PKT */ - cpl = (void *)flitp; - cpl->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | - V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(pi->adapter->pf)); - cpl->pack = 0; - cpl->len = htobe16(m->m_pkthdr.len); - cpl->ctrl1 = htobe64(ctrl); - - flitp += sizeof(*cpl); - if (flitp == end) - flitp = start; - - /* SGL for this frame */ - dst = (caddr_t)flitp; - txpkts->nflits += write_sgl_to_txd(eq, sgl, &dst); - txpkts->flitp = (void *)dst; + txsd = &txq->sdesc[eq->pidx]; + txsd->m = m0; + txsd->desc_used = ndesc; - KASSERT(((uintptr_t)dst & 0xf) == 0, - ("%s: SGL ends at %p (not a 16 byte boundary)", __func__, dst)); + return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will - * add a 0 filled flit at the end. It returns 1 in that case. + * add a 0 filled flit at the end. */ -static int -write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) +static void +write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { - __be64 *flitp, *end; + struct sge_eq *eq = &txq->eq; + struct sglist *gl = txq->gl; + struct sglist_seg *seg; + __be64 *flitp, *wrap; struct ulptx_sgl *usgl; - bus_dma_segment_t *seg; - int i, padded; - - KASSERT(sgl->nsegs > 0 && sgl->nflits > 0, - ("%s: bad SGL - nsegs=%d, nflits=%d", - __func__, sgl->nsegs, sgl->nflits)); + int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); + MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); + MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); + get_pkt_gl(m, gl); + nsegs = gl->sg_nseg; + MPASS(nsegs > 0); + + nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); - end = flitp + sgl->nflits; - seg = &sgl->seg[0]; + wrap = (__be64 *)(&eq->desc[eq->sidx]); + seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* @@ -4062,58 +4251,60 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | - V_ULPTX_NSGE(sgl->nsegs)); - usgl->len0 = htobe32(seg->ds_len); - usgl->addr0 = htobe64(seg->ds_addr); + V_ULPTX_NSGE(nsegs)); + usgl->len0 = htobe32(seg->ss_len); + usgl->addr0 = htobe64(seg->ss_paddr); seg++; - if ((uintptr_t)end <= (uintptr_t)eq->spg) { + if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ - for (i = 0; i < sgl->nsegs - 1; i++, seg++) { - usgl->sge[i / 2].len[i & 1] = htobe32(seg->ds_len); - usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ds_addr); + for (i = 0; i < nsegs - 1; i++, seg++) { + usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); + usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); + flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the 
rest flit by flit */ flitp = (void *)(usgl + 1); - for (i = 0; i < sgl->nflits - 2; i++) { - if ((uintptr_t)flitp == (uintptr_t)eq->spg) + for (i = 0; i < nflits - 2; i++) { + if (flitp == wrap) flitp = (void *)eq->desc; - *flitp++ = get_flit(seg, sgl->nsegs - 1, i); + *flitp++ = get_flit(seg, nsegs - 1, i); } - end = flitp; } - if ((uintptr_t)end & 0xf) { - *(uint64_t *)end = 0; - end++; - padded = 1; - } else - padded = 0; + if (nflits & 1) { + MPASS(((uintptr_t)flitp) & 0xf); + *flitp++ = 0; + } - if ((uintptr_t)end == (uintptr_t)eq->spg) + MPASS((((uintptr_t)flitp) & 0xf) == 0); + if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else - *to = (void *)end; - - return (padded); + *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { - if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) { + + MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); + MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); + + if (__predict_true((uintptr_t)(*to) + len <= + (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { - int portion = (uintptr_t)eq->spg - (uintptr_t)(*to); + int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; @@ -4124,21 +4315,21 @@ copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) } static inline void -ring_eq_db(struct adapter *sc, struct sge_eq *eq) +ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { - u_int db, pending; + u_int db; + + MPASS(n > 0); db = eq->doorbells; - pending = eq->pending; - if (pending > 1) + if (n > 1) clrbit(&db, DOORBELL_WCWR); - eq->pending = 0; wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: - *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending)); - return; + *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); + break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; @@ -4149,69 +4340,84 @@ ring_eq_db(struct adapter *sc, struct sge_eq *eq) * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ - KASSERT(eq->udb_qid == 0 && pending == 1, + KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", - __func__, eq->doorbells, pending, eq->pidx, eq)); + __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); - i = eq->pidx ? 
eq->pidx - 1 : eq->cap - 1; + i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); - return; + break; } case DOORBELL_UDBWC: - *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(pending)); + *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); - return; + break; case DOORBELL_KDB: t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), - V_QID(eq->cntxt_id) | V_PIDX(pending)); - return; + V_QID(eq->cntxt_id) | V_PIDX(n)); + break; } + + IDXINCR(eq->dbidx, n, eq->sidx); } -static inline int -reclaimable(struct sge_eq *eq) +static inline u_int +reclaimable_tx_desc(struct sge_eq *eq) { - unsigned int cidx; + uint16_t hw_cidx; - cidx = eq->spg->cidx; /* stable snapshot */ - cidx = be16toh(cidx); + hw_cidx = read_hw_cidx(eq); + return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); +} + +static inline u_int +total_available_tx_desc(struct sge_eq *eq) +{ + uint16_t hw_cidx, pidx; + + hw_cidx = read_hw_cidx(eq); + pidx = eq->pidx; - if (cidx >= eq->cidx) - return (cidx - eq->cidx); + if (pidx == hw_cidx) + return (eq->sidx - 1); else - return (cidx + eq->cap - eq->cidx); + return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); +} + +static inline uint16_t +read_hw_cidx(struct sge_eq *eq) +{ + struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; + uint16_t cidx = spg->cidx; /* stable snapshot */ + + return (be16toh(cidx)); } /* - * There are "can_reclaim" tx descriptors ready to be reclaimed. Reclaim as - * many as possible but stop when there are around "n" mbufs to free. - * - * The actual number reclaimed is provided as the return value. + * Reclaim 'n' descriptors approximately. */ -static int -reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n) +static u_int +reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; - struct tx_maps *txmaps; - struct tx_map *txm; - unsigned int reclaimed, maps; struct sge_eq *eq = &txq->eq; + u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); + MPASS(n > 0); - if (can_reclaim == 0) - can_reclaim = reclaimable(eq); - - maps = reclaimed = 0; - while (can_reclaim && maps < n) { + reclaimed = 0; + can_reclaim = reclaimable_tx_desc(eq); + while (can_reclaim && reclaimed < n) { int ndesc; + struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; @@ -4221,73 +4427,37 @@ reclaim_tx_descs(struct sge_txq *txq, int can_reclaim, int n) ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); - maps += txsd->credits; - + for (m = txsd->m; m != NULL; m = nextpkt) { + nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + } reclaimed += ndesc; can_reclaim -= ndesc; - - eq->cidx += ndesc; - if (__predict_false(eq->cidx >= eq->cap)) - eq->cidx -= eq->cap; - } - - txmaps = &txq->txmaps; - txm = &txmaps->maps[txmaps->map_cidx]; - if (maps) - prefetch(txm->m); - - eq->avail += reclaimed; - KASSERT(eq->avail < eq->cap, /* avail tops out at (cap - 1) */ - ("%s: too many descriptors available", __func__)); - - txmaps->map_avail += maps; - KASSERT(txmaps->map_avail <= txmaps->map_total, - ("%s: too many maps available", __func__)); - - while (maps--) { - struct tx_map *next; - - next = txm + 1; - if (__predict_false(txmaps->map_cidx + 1 == txmaps->map_total)) - next = txmaps->maps; - prefetch(next->m); - - bus_dmamap_unload(txq->tx_tag, txm->map); - m_freem(txm->m); - txm->m = NULL; - - txm = next; - if (__predict_false(++txmaps->map_cidx == txmaps->map_total)) - txmaps->map_cidx = 0; + IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void 
-write_eqflush_wr(struct sge_eq *eq)
+tx_reclaim(void *arg, int n)
 {
-	struct fw_eq_flush_wr *wr;
+	struct sge_txq *txq = arg;
+	struct sge_eq *eq = &txq->eq;

-	EQ_LOCK_ASSERT_OWNED(eq);
-	KASSERT(eq->avail > 0, ("%s: no descriptors left.", __func__));
-	KASSERT(!(eq->flags & EQ_CRFLUSHED), ("%s: flushed already", __func__));
-
-	wr = (void *)&eq->desc[eq->pidx];
-	bzero(wr, sizeof(*wr));
-	wr->opcode = FW_EQ_FLUSH_WR;
-	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(sizeof(*wr) / 16) |
-	    F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
-
-	eq->flags |= (EQ_CRFLUSHED | EQ_STALLED);
-	eq->pending++;
-	eq->avail--;
-	if (++eq->pidx == eq->cap)
-		eq->pidx = 0;
+	do {
+		if (TXQ_TRYLOCK(txq) == 0)
+			break;
+		n = reclaim_tx_descs(txq, 32);
+		if (eq->cidx == eq->pidx)
+			eq->equeqidx = eq->pidx;
+		TXQ_UNLOCK(txq);
+	} while (n > 0);
 }

 static __be64
-get_flit(bus_dma_segment_t *sgl, int nsegs, int idx)
+get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;

@@ -4295,16 +4465,16 @@
 	case 0: {
 		__be64 rc;

-		rc = htobe32(sgl[i].ds_len);
+		rc = htobe32(segs[i].ss_len);
 		if (i + 1 < nsegs)
-			rc |= (uint64_t)htobe32(sgl[i + 1].ds_len) << 32;
+			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;

 		return (rc);
 	}
 	case 1:
-		return htobe64(sgl[i].ds_addr);
+		return (htobe64(segs[i].ss_paddr));
 	case 2:
-		return htobe64(sgl[i + 1].ds_addr);
+		return (htobe64(segs[i + 1].ss_paddr));
 	}

 	return (0);
@@ -4499,6 +4669,27 @@ add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 	mtx_unlock(&sc->sfl_lock);
 }

+static void
+handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
+{
+	struct sge_wrq *wrq = (void *)eq;
+
+	atomic_readandclear_int(&eq->equiq);
+	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
+}
+
+static void
+handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
+{
+	struct sge_txq *txq = (void *)eq;
+
+	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
+
+	atomic_readandclear_int(&eq->equiq);
+	mp_ring_check_drainage(txq->r, 0);
+	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
+}
+
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
@@ -4508,22 +4699,15 @@ handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
+	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
+	    &handle_wrq_egr_update, &handle_eth_egr_update,
+	    &handle_wrq_egr_update};

 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));

 	eq = s->eqmap[qid - s->eq_start];
-	EQ_LOCK(eq);
-	KASSERT(eq->flags & EQ_CRFLUSHED,
-	    ("%s: unsolicited egress update", __func__));
-	eq->flags &= ~EQ_CRFLUSHED;
-	eq->egr_update++;
-
-	if (__predict_false(eq->flags & EQ_DOOMED))
-		wakeup_one(eq);
-	else if (eq->flags & EQ_STALLED && can_resume_tx(eq))
-		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
-	EQ_UNLOCK(eq);
+	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);

 	return (0);
 }
diff --git a/sys/modules/cxgbe/if_cxgbe/Makefile b/sys/modules/cxgbe/if_cxgbe/Makefile
index e4828f7..a66e45a 100644
--- a/sys/modules/cxgbe/if_cxgbe/Makefile
+++ b/sys/modules/cxgbe/if_cxgbe/Makefile
@@ -15,6 +15,7 @@ SRCS+= pci_if.h
 SRCS+= t4_hw.c
 SRCS+= t4_l2t.c
 SRCS+= t4_main.c
+SRCS+= t4_mp_ring.c
 SRCS+= t4_netmap.c
 SRCS+= t4_sge.c
 SRCS+= t4_tracer.c
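The pidx/cidx arithmetic throughout the new t4_sge.c code relies on the
IDXDIFF and IDXINCR ring-index helpers, whose definitions fall outside this
hunk. The standalone sketch below shows plausible semantics for them; the
macro bodies here are illustrative, not the commit's actual definitions.

#include <assert.h>

/* Distance from 'tail' forward to 'head' in a ring of 'wrap' slots. */
#define IDXDIFF(head, tail, wrap) \
	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))

/* Advance 'idx' by 'incr' slots, wrapping at 'wrap'. */
#define IDXINCR(idx, incr, wrap) do {	\
	(idx) = ((idx) + (incr)) % (wrap);	\
} while (0)

int
main(void)
{
	unsigned int pidx = 1020, cidx = 4, sidx = 1024;

	/* 1020, 1021, 1022, 1023, 0, 1, 2, 3: 8 descriptors in flight. */
	assert(IDXDIFF(cidx, pidx, sidx) == 8);

	IDXINCR(pidx, 6, sidx);	/* produce 6 more, wrapping past sidx */
	assert(pidx == 2);
	return (0);
}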