-rw-r--r--   sys/dev/cxgbe/adapter.h        |   74
-rw-r--r--   sys/dev/cxgbe/common/t4_hw.h   |    1
-rw-r--r--   sys/dev/cxgbe/t4_main.c        |    2
-rw-r--r--   sys/dev/cxgbe/t4_sge.c         | 1130
4 files changed, 632 insertions(+), 575 deletions(-)
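For context on the new struct cluster_layout below: each rx cluster is carved into region1 (inline mbufs), region2 (the DMA payload area whose size is what gets programmed into the SGE_FL_BUFFER_SIZE registers), and region3 (the struct cluster_metadata at the tail). The following is a minimal standalone sketch, not driver code, of roughly how find_best_refill_source() sizes these regions in the buffer-packing case; the MSIZE/CACHE_LINE_SIZE values and the 4KB cluster are assumptions for illustration.

#include <stdio.h>

#define MSIZE		 256	/* assumed mbuf size (FreeBSD amd64) */
#define CACHE_LINE_SIZE	 64	/* assumed */
#define CL_METADATA_SIZE CACHE_LINE_SIZE

int
main(void)
{
	int cl_size = 4096;	/* cluster from the 4KB (jumbop) zone */
	/*
	 * hw buffer size with room for two inline mbufs plus metadata,
	 * matching the new MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE entry
	 * in sge_flbuf_sizes[].
	 */
	int hw_size = cl_size - 2 * MSIZE - CL_METADATA_SIZE;
	int spare = cl_size - hw_size;
	int region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
	int region3 = spare - region1;

	printf("cluster %d = region1 %d (mbufs) + payload %d + region3 %d (metadata)\n",
	    cl_size, region1, hw_size, region3);
	return (0);
}

With the assumed values this prints "cluster 4096 = region1 512 (mbufs) + payload 3520 + region3 64 (metadata)", which satisfies the CTASSERT-style invariant in the patch that region1 + payload + region3 add up to the zone's cluster size.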
diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 9a9b3db..64b71ef 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -134,10 +134,11 @@ enum { RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */ #if MJUMPAGESIZE != MCLBYTES - FL_BUF_SIZES_MAX = 5, /* cluster, jumbop, jumbo9k, jumbo16k, extra */ + SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ #else - FL_BUF_SIZES_MAX = 4, /* cluster, jumbo9k, jumbo16k, extra */ + SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */ #endif + CL_METADATA_SIZE = CACHE_LINE_SIZE, CTRL_EQ_QSIZE = 128, @@ -241,15 +242,28 @@ struct port_info { uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */ }; -struct fl_sdesc { - bus_dmamap_t map; - caddr_t cl; - uint8_t tag_idx; /* the fl->tag entry this map comes from */ +/* Where the cluster came from, how it has been carved up. */ +struct cluster_layout { + int8_t zidx; + int8_t hwidx; + uint16_t region1; /* mbufs laid out within this region */ + /* region2 is the DMA region */ + uint16_t region3; /* cluster_metadata within this region */ +}; + +struct cluster_metadata { + u_int refcount; #ifdef INVARIANTS - __be64 ba_hwtag; + struct fl_sdesc *sd; /* For debug only. Could easily be stale */ #endif }; +struct fl_sdesc { + caddr_t cl; + uint8_t nmbuf; + struct cluster_layout cll; +}; + struct tx_desc { __be64 flit[8]; }; @@ -368,17 +382,19 @@ struct sge_eq { uint32_t unstalled; /* recovered from stall */ }; -struct fl_buf_info { - u_int size; - int type; - int hwtag:4; /* tag in low 4 bits of the pa. */ - uma_zone_t zone; +struct sw_zone_info { + uma_zone_t zone; /* zone that this cluster comes from */ + int size; /* size of cluster: 2K, 4K, 9K, 16K, etc. */ + int type; /* EXT_xxx type of the cluster */ + int8_t head_hwidx; + int8_t tail_hwidx; +}; + +struct hw_buf_info { + int8_t zidx; /* backpointer to zone; -ve means unused */ + int8_t next; /* next hwidx for this zone; -1 means no more */ + int size; }; -#define FL_BUF_SIZES(sc) (sc->sge.fl_buf_sizes) -#define FL_BUF_SIZE(sc, x) (sc->sge.fl_buf_info[x].size) -#define FL_BUF_TYPE(sc, x) (sc->sge.fl_buf_info[x].type) -#define FL_BUF_HWTAG(sc, x) (sc->sge.fl_buf_info[x].hwtag) -#define FL_BUF_ZONE(sc, x) (sc->sge.fl_buf_info[x].zone) enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ @@ -392,9 +408,8 @@ enum { struct sge_fl { bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; - bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are - valid */ - uint8_t tag_idx; + struct cluster_layout cll_def; /* default refill zone, layout */ + struct cluster_layout cll_alt; /* alternate refill zone, layout */ struct mtx fl_lock; char lockname[16]; int flags; @@ -411,9 +426,17 @@ struct sge_fl { uint32_t needed; /* # of buffers needed to fill up fl. 
*/ uint32_t lowat; /* # of buffers <= this means fl needs help */ uint32_t pending; /* # of bufs allocated since last doorbell */ - u_int dmamap_failed; - struct mbuf *mstash[8]; TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ + + struct mbuf *m0; + struct mbuf **pnext; + u_int remaining; + + uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */ + uint64_t mbuf_inlined; /* # of mbuf created within clusters */ + uint64_t cl_allocated; /* # of clusters allocated */ + uint64_t cl_recycled; /* # of clusters recycled */ + uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ }; /* txq: SGE egress queue + what's needed for Ethernet NIC */ @@ -547,8 +570,11 @@ struct sge { struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ - u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE); - struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX]; + int pack_boundary; + int8_t safe_hwidx1; /* may not have room for metadata */ + int8_t safe_hwidx2; /* with room for metadata and maybe more */ + struct sw_zone_info sw_zone_info[SW_ZONE_SIZES]; + struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES]; }; struct rss_header; diff --git a/sys/dev/cxgbe/common/t4_hw.h b/sys/dev/cxgbe/common/t4_hw.h index b0b82bd..34f462c 100644 --- a/sys/dev/cxgbe/common/t4_hw.h +++ b/sys/dev/cxgbe/common/t4_hw.h @@ -87,6 +87,7 @@ enum { SGE_NTIMERS = 6, /* # of interrupt holdoff timer values */ SGE_NCOUNTERS = 4, /* # of interrupt packet counter values */ SGE_MAX_IQ_SIZE = 65520, + SGE_FLBUF_SIZES = 16, }; struct sge_qstat { /* data written to SGE queue status entries */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 714a00c..d399742 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -494,6 +494,8 @@ CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS); CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES); +CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); + static int t4_probe(device_t dev) { diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 5ec1b58..d461723 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kdb.h> #include <sys/malloc.h> #include <sys/queue.h> +#include <sys/sbuf.h> #include <sys/taskqueue.h> #include <sys/time.h> #include <sys/sysctl.h> @@ -52,6 +53,8 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet/tcp.h> #include <machine/md_var.h> +#include <vm/vm.h> +#include <vm/pmap.h> #include "common/common.h" #include "common/t4_regs.h" @@ -124,6 +127,27 @@ static int t4_fl_pack; static int t5_fl_pack; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); +/* + * Allow the driver to create mbuf(s) in a cluster allocated for rx. + * 0: never; always allocate mbufs from the zone_mbuf UMA zone. + * 1: ok to create mbuf(s) within a cluster if there is room. + */ +static int allow_mbufs_in_cluster = 1; +TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster); + +/* + * Largest rx cluster size that the driver is allowed to allocate. + */ +static int largest_rx_cluster = MJUM16BYTES; +TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); + +/* + * Size of cluster allocation that's most likely to succeed. The driver will + * fall back to this size if it fails to allocate clusters larger than this. 
+ */ +static int safest_rx_cluster = PAGE_SIZE; +TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); + /* Used to track coalesced tx work request */ struct txpkts { uint64_t *flitp; /* ptr to flit where next pkt should start */ @@ -140,9 +164,7 @@ struct sgl { }; static int service_iq(struct sge_iq *, int); -static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, uint32_t, - int *); -static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, uint32_t, +static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t, int *); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int, @@ -158,6 +180,8 @@ static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *); +static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, + struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_mgmtq(struct adapter *); @@ -191,7 +215,8 @@ static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); -static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int); +static void find_best_refill_source(struct adapter *, struct sge_fl *, int); +static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int); @@ -216,6 +241,7 @@ static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); +static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. @@ -264,7 +290,7 @@ t4_sge_modload(void) /* T5's pack boundary is independent of the pad boundary. 
*/ if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) - t5_fl_pack = max(pad, 64); + t5_fl_pack = max(pad, CACHE_LINE_SIZE); else t5_fl_pack = fl_pack; @@ -313,14 +339,18 @@ t4_tweak_chip_settings(struct adapter *sc) int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); - int sw_flbuf_sizes[] = { + static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, + MJUMPAGESIZE - CL_METADATA_SIZE, + MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, - MJUMPAGESIZE - MSIZE + MCLBYTES - MSIZE - CL_METADATA_SIZE, + MJUM9BYTES - CL_METADATA_SIZE, + MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, @@ -358,9 +388,11 @@ t4_tweak_chip_settings(struct adapter *sc) V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); - for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) { + KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, + ("%s: hw buffer size table too big", __func__)); + for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i), - sw_flbuf_sizes[i]); + sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | @@ -415,6 +447,18 @@ t4_tweak_chip_settings(struct adapter *sc) } /* + * SGE wants the buffer to be at least 64B and then a multiple of the pad + * boundary or 16, whichever is greater. + */ +static inline int +hwsz_ok(int hwsz) +{ + int mask = max(fl_pad, 16) - 1; + + return (hwsz >= 64 && (hwsz & mask) == 0); +} + +/* * XXX: driver really should be able to deal with unexpected settings. */ int @@ -424,7 +468,7 @@ t4_read_chip_settings(struct adapter *sc) int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); - uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = { + static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, @@ -432,6 +476,8 @@ t4_read_chip_settings(struct adapter *sc) MJUM9BYTES, MJUM16BYTES }; + struct sw_zone_info *swz, *safe_swz; + struct hw_buf_info *hwb; m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | @@ -462,6 +508,7 @@ t4_read_chip_settings(struct adapter *sc) rc = EINVAL; } } + s->pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack; v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | @@ -477,45 +524,93 @@ t4_read_chip_settings(struct adapter *sc) rc = EINVAL; } - /* - * Make a list of SGE FL buffer sizes programmed in the chip and tally - * it with the FL buffer sizes that we'd like to use. - */ - n = 0; - for (i = 0; i < nitems(sge_flbuf_sizes); i++) { + /* Filter out unusable hw buffer sizes entirely (mark with -2). */ + hwb = &s->hw_buf_info[0]; + for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i)); - sge_flbuf_sizes[i] = r; - if (r == MJUMPAGESIZE - MSIZE && - (sc->flags & BUF_PACKING_OK) == 0) { - sc->flags |= BUF_PACKING_OK; - FL_BUF_HWTAG(sc, n) = i; - FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE; - FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE); - FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE); - n++; - } + hwb->size = r; + hwb->zidx = hwsz_ok(r) ? 
-1 : -2; + hwb->next = -1; } - for (i = 0; i < nitems(sw_flbuf_sizes); i++) { - for (j = 0; j < nitems(sge_flbuf_sizes); j++) { - if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j]) + + /* + * Create a sorted list in decreasing order of hw buffer sizes (and so + * increasing order of spare area) for each software zone. + */ + n = 0; /* no usable buffer size to begin with */ + swz = &s->sw_zone_info[0]; + safe_swz = NULL; + for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { + int8_t head = -1, tail = -1; + + swz->size = sw_buf_sizes[i]; + swz->zone = m_getzone(swz->size); + swz->type = m_gettype(swz->size); + + if (swz->size == safest_rx_cluster) + safe_swz = swz; + + hwb = &s->hw_buf_info[0]; + for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { + if (hwb->zidx != -1 || hwb->size > swz->size) continue; - FL_BUF_HWTAG(sc, n) = j; - FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i]; - FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]); - FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]); + hwb->zidx = i; + if (head == -1) + head = tail = j; + else if (hwb->size < s->hw_buf_info[tail].size) { + s->hw_buf_info[tail].next = j; + tail = j; + } else { + int8_t *cur; + struct hw_buf_info *t; + + for (cur = &head; *cur != -1; cur = &t->next) { + t = &s->hw_buf_info[*cur]; + if (hwb->size == t->size) { + hwb->zidx = -2; + break; + } + if (hwb->size > t->size) { + hwb->next = *cur; + *cur = j; + break; + } + } + } + } + swz->head_hwidx = head; + swz->tail_hwidx = tail; + + if (tail != -1) { n++; - break; + if (swz->size - s->hw_buf_info[tail].size >= + CL_METADATA_SIZE) + sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; - } else if (n == 1 && (sc->flags & BUF_PACKING_OK)) { - device_printf(sc->dev, - "no usable SGE FL buffer size when not packing buffers.\n"); - rc = EINVAL; } - FL_BUF_SIZES(sc) = n; + + s->safe_hwidx1 = -1; + s->safe_hwidx2 = -1; + if (safe_swz != NULL) { + s->safe_hwidx1 = safe_swz->head_hwidx; + for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { + int spare; + + hwb = &s->hw_buf_info[i]; + spare = safe_swz->size - hwb->size; + if (spare < CL_METADATA_SIZE) + continue; + if (s->safe_hwidx2 == -1 || + spare == CL_METADATA_SIZE + MSIZE) + s->safe_hwidx2 = i; + if (spare >= CL_METADATA_SIZE + MSIZE) + break; + } + } r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD); s->counter_val[0] = G_THRESHOLD_0(r); @@ -627,6 +722,10 @@ t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", + CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", + "freelist buffer sizes"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)"); @@ -644,8 +743,7 @@ t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, "pack multiple frames in one fl buffer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, - NULL, is_t5(sc) ? 
t5_fl_pack : t4_fl_pack, - "payload pack boundary (bytes)"); + NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)"); } int @@ -765,7 +863,7 @@ port_intr_iq(struct port_info *pi, int idx) #ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) { idx %= pi->nrxq + pi->nofldrxq; - + if (idx >= pi->nrxq) { idx -= pi->nrxq; iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq; @@ -796,29 +894,28 @@ port_intr_iq(struct port_info *pi, int idx) return (iq); } +/* Maximum payload that can be delivered with a single iq descriptor */ static inline int -mtu_to_bufsize(int mtu) +mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { - int bufsize; - - /* large enough for a frame even when VLAN extraction is disabled */ - bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; - bufsize = roundup2(bufsize + fl_pktshift, fl_pad); - - return (bufsize); -} + int payload; #ifdef TCP_OFFLOAD -static inline int -mtu_to_bufsize_toe(struct adapter *sc, int mtu) -{ - - if (sc->tt.rx_coalesce) - return (G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2))); + if (toe) { + payload = sc->tt.rx_coalesce ? + G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; + } else { +#endif + /* large enough even when hw VLAN extraction is disabled */ + payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + + mtu; +#ifdef TCP_OFFLOAD + } +#endif + payload = roundup2(payload, fl_pad); - return (mtu); + return (payload); } -#endif int t4_setup_port_queues(struct port_info *pi) @@ -837,7 +934,7 @@ t4_setup_port_queues(struct port_info *pi) struct ifnet *ifp = pi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); - int bufsize, pack; + int maxp, pack, mtu = ifp->if_mtu; oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); @@ -858,7 +955,7 @@ t4_setup_port_queues(struct port_info *pi) * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. */ - bufsize = mtu_to_bufsize(ifp->if_mtu); + maxp = mtu_to_max_payload(sc, mtu, 0); pack = enable_buffer_packing(sc); for_each_rxq(pi, i, rxq) { @@ -867,7 +964,7 @@ t4_setup_port_queues(struct port_info *pi) snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(pi->dev), i); - init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name); + init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); if (sc->flags & INTR_DIRECT #ifdef TCP_OFFLOAD @@ -883,8 +980,7 @@ t4_setup_port_queues(struct port_info *pi) } #ifdef TCP_OFFLOAD - bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu); - pack = 0; /* XXX: think about this some more */ + maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(pi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, @@ -892,8 +988,7 @@ t4_setup_port_queues(struct port_info *pi) snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(pi->dev), i); - init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack, - name); + init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); if (sc->flags & INTR_DIRECT || (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) { @@ -1170,10 +1265,7 @@ service_iq(struct sge_iq *iq, int budget) ("%s: data for an iq (%p) with no freelist", __func__, iq)); - m0 = fl->flags & FL_BUF_PACKING ? 
- get_fl_payload1(sc, fl, lq, &fl_bufs_used) : - get_fl_payload2(sc, fl, lq, &fl_bufs_used); - + m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used); if (__predict_false(m0 == NULL)) goto process_iql; #ifdef T4_PKT_TIMESTAMP @@ -1246,6 +1338,14 @@ service_iq(struct sge_iq *iq, int budget) break; } + if (fl_bufs_used >= 16) { + FL_LOCK(fl); + fl->needed += fl_bufs_used; + refill_fl(sc, fl, 32); + FL_UNLOCK(fl); + fl_bufs_used = 0; + } + iq_next(iq); if (++ndescs == limit) { t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), @@ -1262,14 +1362,6 @@ service_iq(struct sge_iq *iq, int budget) } #endif - if (fl_bufs_used > 0) { - FL_LOCK(fl); - fl->needed += fl_bufs_used; - refill_fl(sc, fl, fl->cap / 8); - FL_UNLOCK(fl); - fl_bufs_used = 0; - } - if (budget) return (EINPROGRESS); } @@ -1312,7 +1404,7 @@ process_iql: FL_LOCK(fl); fl->needed += fl_bufs_used; - starved = refill_fl(sc, fl, fl->cap / 4); + starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); @@ -1321,74 +1413,28 @@ process_iql: return (0); } -static int -fill_mbuf_stash(struct sge_fl *fl) -{ - int i; - - for (i = 0; i < nitems(fl->mstash); i++) { - if (fl->mstash[i] == NULL) { - struct mbuf *m; - if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL) - return (ENOBUFS); - fl->mstash[i] = m; - } - } - return (0); -} - -static struct mbuf * -get_mbuf_from_stash(struct sge_fl *fl) +static inline int +cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { - int i; - - for (i = 0; i < nitems(fl->mstash); i++) { - if (fl->mstash[i] != NULL) { - struct mbuf *m; + int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; - m = fl->mstash[i]; - fl->mstash[i] = NULL; - return (m); - } else - fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT); - } + if (rc) + MPASS(cll->region3 >= CL_METADATA_SIZE); - return (m_get(M_NOWAIT, MT_NOINIT)); + return (rc); } -static void -return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m) +static inline struct cluster_metadata * +cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, + caddr_t cl) { - int i; - if (m == NULL) - return; + if (cl_has_metadata(fl, cll)) { + struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; - for (i = 0; i < nitems(fl->mstash); i++) { - if (fl->mstash[i] == NULL) { - fl->mstash[i] = m; - return; - } + return ((struct cluster_metadata *)(cl + swz->size) - 1); } - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0); - m_free(m); -} - -/* buf can be any address within the buffer */ -static inline u_int * -find_buf_refcnt(caddr_t buf) -{ - uintptr_t ptr = (uintptr_t)buf; - - return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int))); -} - -static inline struct mbuf * -find_buf_mbuf(caddr_t buf) -{ - uintptr_t ptr = (uintptr_t)buf; - - return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1))); + return (NULL); } static int @@ -1396,179 +1442,117 @@ rxb_free(struct mbuf *m, void *arg1, void *arg2) { uma_zone_t zone = arg1; caddr_t cl = arg2; -#ifdef notyet - u_int refcount; - refcount = *find_buf_refcnt(cl); - KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__, - cl - MSIZE, refcount)); -#endif - cl -= MSIZE; uma_zfree(zone, cl); return (EXT_FREE_OK); } +/* + * The mbuf returned by this function could be allocated from zone_mbuf or + * constructed in spare room in the cluster. 
+ * + * The mbuf carries the payload in one of these ways + * a) frame inside the mbuf (mbuf from zone_mbuf) + * b) m_cljset (for clusters without metadata) zone_mbuf + * c) m_extaddref (cluster with metadata) inline mbuf + * d) m_extaddref (cluster with metadata) zone_mbuf + */ static struct mbuf * -get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, - int *fl_bufs_used) +get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags) { - struct mbuf *m0, *m; + struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; - unsigned int nbuf, len; - int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack; - - /* - * No assertion for the fl lock because we don't need it. This routine - * is called only from the rx interrupt handler and it only updates - * fl->cidx. (Contrast that with fl->pidx/fl->needed which could be - * updated in the rx interrupt handler or the starvation helper routine. - * That's why code that manipulates fl->pidx/fl->needed needs the fl - * lock but this routine does not). - */ + struct cluster_layout *cll = &sd->cll; + struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; + struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; + struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); + int len, padded_len; + caddr_t payload; - KASSERT(fl->flags & FL_BUF_PACKING, - ("%s: buffer packing disabled for fl %p", __func__, fl)); + len = min(total, hwb->size - fl->rx_offset); + padded_len = roundup2(len, fl_pad); + payload = sd->cl + cll->region1 + fl->rx_offset; - len = G_RSPD_LEN(len_newbuf); - - if ((len_newbuf & F_RSPD_NEWBUF) == 0) { - KASSERT(fl->rx_offset > 0, - ("%s: packed frame but driver at offset=0", __func__)); + if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { - /* A packed frame is guaranteed to fit entirely in this buf. */ - KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len, - ("%s: packing error. bufsz=%u, offset=%u, len=%u", - __func__, FL_BUF_SIZE(sc, sd->tag_idx), fl->rx_offset, - len)); + /* + * Copy payload into a freshly allocated mbuf. + */ - m0 = get_mbuf_from_stash(fl); - if (m0 == NULL || - m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) { - return_mbuf_to_stash(fl, m0); + m = flags & M_PKTHDR ? + m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); + if (m == NULL) return (NULL); - } - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, - BUS_DMASYNC_POSTREAD); - if (sc->sc_do_rxcopy && (len < RX_COPY_THRESHOLD)) { + fl->mbuf_allocated++; #ifdef T4_PKT_TIMESTAMP - /* Leave room for a timestamp */ - m0->m_data += 8; + /* Leave room for a timestamp */ + m->m_data += 8; #endif - bcopy(sd->cl + fl->rx_offset, mtod(m0, caddr_t), len); - m0->m_pkthdr.len = len; - m0->m_len = len; - } else { - m0->m_pkthdr.len = len; - m0->m_len = len; - m_extaddref(m0, sd->cl + fl->rx_offset, - roundup2(m0->m_len, fl_pad), - find_buf_refcnt(sd->cl), rxb_free, - FL_BUF_ZONE(sc, sd->tag_idx), sd->cl); - } - fl->rx_offset += len; - fl->rx_offset = roundup2(fl->rx_offset, fl_pad); - fl->rx_offset = roundup2(fl->rx_offset, pack_boundary); - if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) { - fl->rx_offset = 0; - (*fl_bufs_used) += 1; - if (__predict_false(++fl->cidx == fl->cap)) - fl->cidx = 0; - } + /* copy data to mbuf */ + bcopy(payload, mtod(m, caddr_t), len); - return (m0); - } + } else if (sd->nmbuf * MSIZE < cll->region1) { - KASSERT(len_newbuf & F_RSPD_NEWBUF, - ("%s: only new buffer handled here", __func__)); + /* + * There's spare room in the cluster for an mbuf. 
Create one + * and associate it with the payload that's in the cluster too. + */ - nbuf = 0; + MPASS(clm != NULL); + m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); + /* No bzero required */ + if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA, flags | M_NOFREE)) + return (NULL); + fl->mbuf_inlined++; + m_extaddref(m, payload, padded_len, &clm->refcount, rxb_free, + swz->zone, sd->cl); + sd->nmbuf++; - /* - * Move to the start of the next buffer if we are still in the middle of - * some buffer. This is the case where there was some room left in the - * previous buffer but not enough to fit this frame in its entirety. - */ - if (fl->rx_offset > 0) { - KASSERT(roundup2(len, fl_pad) > FL_BUF_SIZE(sc, sd->tag_idx) - - fl->rx_offset, ("%s: frame (%u bytes) should have fit at " - "cidx %u offset %u bufsize %u", __func__, len, fl->cidx, - fl->rx_offset, FL_BUF_SIZE(sc, sd->tag_idx))); - nbuf++; - fl->rx_offset = 0; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; - } - } + } else { - m0 = find_buf_mbuf(sd->cl); - if (m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE)) - goto done; - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD); - m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - m_extaddref(m0, sd->cl, roundup2(m0->m_len, fl_pad), - find_buf_refcnt(sd->cl), rxb_free, FL_BUF_ZONE(sc, sd->tag_idx), - sd->cl); - m0->m_pkthdr.len = len; - - fl->rx_offset = roundup2(m0->m_len, fl_pad); - fl->rx_offset = roundup2(fl->rx_offset, pack_boundary); - if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) { - fl->rx_offset = 0; - nbuf++; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; + /* + * Grab an mbuf from zone_mbuf and associate it with the + * payload in the cluster. + */ + + m = flags & M_PKTHDR ? 
+ m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); + if (m == NULL) + return (NULL); + fl->mbuf_allocated++; + if (clm != NULL) + m_extaddref(m, payload, padded_len, &clm->refcount, + rxb_free, swz->zone, sd->cl); + else { + m_cljset(m, sd->cl, swz->type); + sd->cl = NULL; /* consumed, not a recycle candidate */ } } + if (flags & M_PKTHDR) + m->m_pkthdr.len = total; + m->m_len = len; - m = m0; - len -= m->m_len; + if (fl->flags & FL_BUF_PACKING) { + fl->rx_offset += roundup2(padded_len, sc->sge.pack_boundary); + MPASS(fl->rx_offset <= hwb->size); + if (fl->rx_offset < hwb->size) + return (m); /* without advancing the cidx */ + } - while (len > 0) { - m->m_next = find_buf_mbuf(sd->cl); - m = m->m_next; - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, - BUS_DMASYNC_POSTREAD); - - /* m_init for !M_PKTHDR can't fail so don't bother */ - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, M_NOFREE); - m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - m_extaddref(m, sd->cl, roundup2(m->m_len, fl_pad), - find_buf_refcnt(sd->cl), rxb_free, - FL_BUF_ZONE(sc, sd->tag_idx), sd->cl); - - fl->rx_offset = roundup2(m->m_len, fl_pad); - fl->rx_offset = roundup2(fl->rx_offset, pack_boundary); - if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) { - fl->rx_offset = 0; - nbuf++; - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; - } - } + if (__predict_false(++fl->cidx == fl->cap)) + fl->cidx = 0; + fl->rx_offset = 0; - len -= m->m_len; - } -done: - (*fl_bufs_used) += nbuf; - return (m0); + return (m); } static struct mbuf * -get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, +get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, int *fl_bufs_used) { - struct mbuf *m0, *m; - struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; - unsigned int nbuf, len; + struct mbuf *m0, *m, **pnext; + u_int nbuf, len; /* * No assertion for the fl lock because we don't need it. This routine @@ -1579,87 +1563,54 @@ get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf, * lock but this routine does not). */ - KASSERT((fl->flags & FL_BUF_PACKING) == 0, - ("%s: buffer packing enabled for fl %p", __func__, fl)); - if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0)) - panic("%s: cannot handle packed frames", __func__); + nbuf = 0; len = G_RSPD_LEN(len_newbuf); - - /* - * We never want to run out of mbufs in between a frame when a frame - * spans multiple fl buffers. If the fl's mbuf stash isn't full and - * can't be filled up to the brim then fail early. 
- */ - if (len > FL_BUF_SIZE(sc, sd->tag_idx) && fill_mbuf_stash(fl) != 0) - return (NULL); - - m0 = get_mbuf_from_stash(fl); - if (m0 == NULL || - m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) { - return_mbuf_to_stash(fl, m0); - return (NULL); + if (__predict_false(fl->m0 != NULL)) { + MPASS(len == fl->m0->m_pkthdr.len); + MPASS(fl->remaining < len); + + m0 = fl->m0; + pnext = fl->pnext; + len = fl->remaining; + fl->m0 = NULL; + goto get_segment; } - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD); - - if (sc->sc_do_rxcopy && (len < RX_COPY_THRESHOLD)) { -#ifdef T4_PKT_TIMESTAMP - /* Leave room for a timestamp */ - m0->m_data += 8; -#endif - /* copy data to mbuf, buffer will be recycled */ - bcopy(sd->cl, mtod(m0, caddr_t), len); - m0->m_len = len; - } else { - bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map); - m_cljset(m0, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx)); - sd->cl = NULL; /* consumed */ - m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - } - m0->m_pkthdr.len = len; - - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; + if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { + nbuf++; + fl->rx_offset = 0; + if (__predict_false(++fl->cidx == fl->cap)) + fl->cidx = 0; } - m = m0; - len -= m->m_len; - nbuf = 1; /* # of fl buffers used */ + /* + * Payload starts at rx_offset in the current hw buffer. Its length is + * 'len' and it may span multiple hw buffers. + */ + m0 = get_scatter_segment(sc, fl, len, M_PKTHDR); + len -= m0->m_len; + pnext = &m0->m_next; while (len > 0) { - /* Can't fail, we checked earlier that the stash was full. */ - m->m_next = get_mbuf_from_stash(fl); - m = m->m_next; - - bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, - BUS_DMASYNC_POSTREAD); - - /* m_init for !M_PKTHDR can't fail so don't bother */ - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0); - if (len <= MLEN) { - bcopy(sd->cl, mtod(m, caddr_t), len); - m->m_len = len; - } else { - bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map); - m_cljset(m, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx)); - sd->cl = NULL; /* consumed */ - m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx)); - } - - sd++; - if (__predict_false(++fl->cidx == fl->cap)) { - sd = fl->sdesc; - fl->cidx = 0; + nbuf++; +get_segment: + MPASS(fl->rx_offset == 0); + m = get_scatter_segment(sc, fl, len, 0); + if (m == NULL) { + fl->m0 = m0; + fl->pnext = pnext; + fl->remaining = len; + return (NULL); } - + *pnext = m; + pnext = &m->m_next; len -= m->m_len; - nbuf++; } + *pnext = NULL; + if (fl->rx_offset == 0) + nbuf++; (*fl_bufs_used) += nbuf; - return (m0); } @@ -2016,23 +1967,23 @@ t4_update_fl_bufsize(struct ifnet *ifp) struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; - int i, bufsize; + int i, maxp, mtu = ifp->if_mtu; - bufsize = mtu_to_bufsize(ifp->if_mtu); + maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(pi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); - set_fl_tag_idx(sc, fl, bufsize); + find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD - bufsize = mtu_to_bufsize_toe(pi->adapter, ifp->if_mtu); + maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(pi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); - set_fl_tag_idx(sc, fl, bufsize); + find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif @@ -2066,7 +2017,7 @@ init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, } static inline void -init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int bufsize, int pack, +init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, 
int maxp, int pack, char *name) { @@ -2074,7 +2025,8 @@ init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int bufsize, int pack, strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (pack) fl->flags |= FL_BUF_PACKING; - set_fl_tag_idx(sc, fl, bufsize); + find_best_refill_source(sc, fl, maxp); + find_safe_refill_source(sc, fl); } static inline void @@ -2203,24 +2155,6 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); - for (i = 0; i < FL_BUF_SIZES(sc); i++) { - - /* - * A freelist buffer must be 16 byte aligned as the SGE - * uses the low 4 bits of the bus addr to figure out the - * buffer size. - */ - rc = bus_dma_tag_create(sc->dmat, 16, 0, - BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, - FL_BUF_SIZE(sc, i), 1, FL_BUF_SIZE(sc, i), - BUS_DMA_ALLOCNOW, NULL, NULL, &fl->tag[i]); - if (rc != 0) { - device_printf(sc->dev, - "failed to create fl DMA tag[%d]: %d\n", - i, rc); - return (rc); - } - } len = fl->qsize * RX_FL_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); @@ -2337,7 +2271,7 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, static int free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl) { - int i, rc; + int rc; struct adapter *sc = iq->adapter; device_t dev; @@ -2369,29 +2303,48 @@ free_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl) if (fl->sdesc) free_fl_sdesc(sc, fl); - for (i = 0; i < nitems(fl->mstash); i++) { - struct mbuf *m = fl->mstash[i]; - - if (m != NULL) { - m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0); - m_free(m); - } - } - if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); - for (i = 0; i < FL_BUF_SIZES(sc); i++) { - if (fl->tag[i]) - bus_dma_tag_destroy(fl->tag[i]); - } - bzero(fl, sizeof(*fl)); } return (0); } +static void +add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, + struct sge_fl *fl) +{ + struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); + + oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, + "freelist"); + children = SYSCTL_CHILDREN(oid); + + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", + CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", + "SGE context id of the freelist"); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, + 0, "consumer index"); + if (fl->flags & FL_BUF_PACKING) { + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", + CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); + } + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, + 0, "producer index"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", + CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", + CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", + CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", + CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); + SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", + CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); +} + static int alloc_fwq(struct adapter *sc) { @@ -2532,22 +2485,7 @@ alloc_rxq(struct port_info *pi, struct sge_rxq *rxq, int intr_idx, int idx, CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); - children = 
SYSCTL_CHILDREN(oid); - oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "fl", CTLFLAG_RD, - NULL, "freelist"); - children = SYSCTL_CHILDREN(oid); - - SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id", - CTLTYPE_INT | CTLFLAG_RD, &rxq->fl.cntxt_id, 0, sysctl_uint16, "I", - "SGE context id of the queue"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, - &rxq->fl.cidx, 0, "consumer index"); - if (rxq->fl.flags & FL_BUF_PACKING) { - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "rx_offset", - CTLFLAG_RD, &rxq->fl.rx_offset, 0, "packing rx offset"); - } - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, - &rxq->fl.pidx, 0, "producer index"); + add_fl_sysctls(&pi->ctx, oid, &rxq->fl); return (rc); } @@ -2602,18 +2540,7 @@ alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); - children = SYSCTL_CHILDREN(oid); - oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "fl", CTLFLAG_RD, - NULL, "freelist"); - children = SYSCTL_CHILDREN(oid); - - SYSCTL_ADD_PROC(&pi->ctx, children, OID_AUTO, "cntxt_id", - CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->fl.cntxt_id, 0, sysctl_uint16, - "I", "SGE context id of the queue"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, - &ofld_rxq->fl.cidx, 0, "consumer index"); - SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, - &ofld_rxq->fl.pidx, 0, "producer index"); + add_fl_sysctls(&pi->ctx, oid, &ofld_rxq->fl); return (rc); } @@ -3100,103 +3027,83 @@ refill_fl(struct adapter *sc, struct sge_fl *fl, int nbufs) { __be64 *d = &fl->desc[fl->pidx]; struct fl_sdesc *sd = &fl->sdesc[fl->pidx]; - bus_dma_tag_t tag; - bus_addr_t pa; + uintptr_t pa; caddr_t cl; - int rc; + struct cluster_layout *cll = &fl->cll_def; /* default layout */ + struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; + struct cluster_metadata *clm; FL_LOCK_ASSERT_OWNED(fl); -#ifdef INVARIANTS - if (fl->flags & FL_BUF_PACKING) - KASSERT(sd->tag_idx == 0, - ("%s: expected tag 0 but found tag %d at pidx %u instead", - __func__, sd->tag_idx, fl->pidx)); -#endif if (nbufs > fl->needed) nbufs = fl->needed; + nbufs -= (fl->pidx + nbufs) % 8; while (nbufs--) { if (sd->cl != NULL) { - KASSERT(*d == sd->ba_hwtag, - ("%s: recyling problem at pidx %d", - __func__, fl->pidx)); - - if (fl->flags & FL_BUF_PACKING) { - u_int *refcount = find_buf_refcnt(sd->cl); - - if (atomic_fetchadd_int(refcount, -1) == 1) { - *refcount = 1; /* reinstate */ - d++; - goto recycled; - } - sd->cl = NULL; /* gave up my reference */ - } else { + if (sd->nmbuf == 0) { /* - * This happens when a frame small enough to fit - * entirely in an mbuf was received in cl last - * time. We'd held on to cl and can reuse it - * now. Note that we reuse a cluster of the old - * size if fl->tag_idx is no longer the same as - * sd->tag_idx. + * Fast recycle without involving any atomics on + * the cluster's metadata (if the cluster has + * metadata). This happens when all frames + * received in the cluster were small enough to + * fit within a single mbuf each. */ - d++; - goto recycled; + fl->cl_fast_recycled++; + goto recycled_fast; } - } - - if (__predict_false(fl->tag_idx != sd->tag_idx)) { - bus_dmamap_t map; - bus_dma_tag_t newtag = fl->tag[fl->tag_idx]; - bus_dma_tag_t oldtag = fl->tag[sd->tag_idx]; /* - * An MTU change can get us here. Discard the old map - * which was created with the old tag, but only if - * we're able to get a new one. 
+ * Cluster is guaranteed to have metadata. Clusters + * without metadata always take the fast recycle path + * when they're recycled. */ - rc = bus_dmamap_create(newtag, 0, &map); - if (rc == 0) { - bus_dmamap_destroy(oldtag, sd->map); - sd->map = map; - sd->tag_idx = fl->tag_idx; - } - } - - tag = fl->tag[sd->tag_idx]; + clm = cl_metadata(sc, fl, &sd->cll, sd->cl); + MPASS(clm != NULL); - cl = uma_zalloc(FL_BUF_ZONE(sc, sd->tag_idx), M_NOWAIT); - if (cl == NULL) - break; - if (fl->flags & FL_BUF_PACKING) { - *find_buf_refcnt(cl) = 1; - cl += MSIZE; + if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { + fl->cl_recycled++; + goto recycled; + } + sd->cl = NULL; /* gave up my reference */ } + MPASS(sd->cl == NULL); +alloc: + cl = uma_zalloc(swz->zone, M_NOWAIT); + if (__predict_false(cl == NULL)) { + if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || + fl->cll_def.zidx == fl->cll_alt.zidx) + break; - rc = bus_dmamap_load(tag, sd->map, cl, - FL_BUF_SIZE(sc, sd->tag_idx), oneseg_dma_callback, &pa, 0); - if (rc != 0 || pa == 0) { - fl->dmamap_failed++; - if (fl->flags & FL_BUF_PACKING) - cl -= MSIZE; - uma_zfree(FL_BUF_ZONE(sc, sd->tag_idx), cl); - break; + /* fall back to the safe zone */ + cll = &fl->cll_alt; + swz = &sc->sge.sw_zone_info[cll->zidx]; + goto alloc; } + fl->cl_allocated++; + pa = pmap_kextract((vm_offset_t)cl); + pa += cll->region1; sd->cl = cl; - *d++ = htobe64(pa | FL_BUF_HWTAG(sc, sd->tag_idx)); - + sd->cll = *cll; + *d = htobe64(pa | cll->hwidx); + clm = cl_metadata(sc, fl, cll, cl); + if (clm != NULL) { +recycled: #ifdef INVARIANTS - sd->ba_hwtag = htobe64(pa | FL_BUF_HWTAG(sc, sd->tag_idx)); + clm->sd = sd; #endif - -recycled: + clm->refcount = 1; + } + sd->nmbuf = 0; +recycled_fast: fl->pending++; fl->needed--; + d++; sd++; - if (++fl->pidx == fl->cap) { + if (__predict_false(++fl->pidx == fl->cap)) { fl->pidx = 0; sd = fl->sdesc; d = fl->desc; @@ -3237,53 +3144,33 @@ refill_sfl(void *arg) static int alloc_fl_sdesc(struct sge_fl *fl) { - struct fl_sdesc *sd; - bus_dma_tag_t tag; - int i, rc; fl->sdesc = malloc(fl->cap * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); - tag = fl->tag[fl->tag_idx]; - sd = fl->sdesc; - for (i = 0; i < fl->cap; i++, sd++) { - - sd->tag_idx = fl->tag_idx; - rc = bus_dmamap_create(tag, 0, &sd->map); - if (rc != 0) - goto failed; - } - return (0); -failed: - while (--i >= 0) { - sd--; - bus_dmamap_destroy(tag, sd->map); - } - KASSERT(sd == fl->sdesc, ("%s: EDOOFUS", __func__)); - - free(fl->sdesc, M_CXGBE); - fl->sdesc = NULL; - - return (rc); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; + struct cluster_metadata *clm; + struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->cap; i++, sd++) { + if (sd->cl == NULL) + continue; - if (sd->cl) { - bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map); - uma_zfree(FL_BUF_ZONE(sc, sd->tag_idx), sd->cl); - sd->cl = NULL; + cll = &sd->cll; + clm = cl_metadata(sc, fl, cll, sd->cl); + if (sd->nmbuf == 0 || + (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1)) { + uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); } - - bus_dmamap_destroy(fl->tag[sd->tag_idx], sd->map); + sd->cl = NULL; } free(fl->sdesc, M_CXGBE); @@ -4107,7 +3994,7 @@ write_eqflush_wr(struct sge_eq *eq) eq->pending++; eq->avail--; if (++eq->pidx == eq->cap) - eq->pidx = 0; + eq->pidx = 0; } static __be64 @@ -4134,52 +4021,167 @@ get_flit(bus_dma_segment_t *sgl, int nsegs, int idx) return (0); } -/* - * Find an SGE FL buffer size to use for the given bufsize. 
Look for the the - * smallest size that is large enough to hold bufsize or pick the largest size - * if all sizes are less than bufsize. - */ static void -set_fl_tag_idx(struct adapter *sc, struct sge_fl *fl, int bufsize) +find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { - int i, largest, best, delta, start; + int8_t zidx, hwidx, idx; + uint16_t region1, region3; + int spare, spare_needed, n; + struct sw_zone_info *swz; + struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; - if (fl->flags & FL_BUF_PACKING) { - fl->tag_idx = 0; /* first tag is the one for packing */ - return; - } + /* + * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize + * large enough for the max payload and cluster metadata. Otherwise + * settle for the largest bufsize that leaves enough room in the cluster + * for metadata. + * + * Without buffer packing: Look for the smallest zone which has a + * bufsize large enough for the max payload. Settle for the largest + * bufsize available if there's nothing big enough for max payload. + */ + spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; + swz = &sc->sge.sw_zone_info[0]; + hwidx = -1; + for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { + if (swz->size > largest_rx_cluster) { + if (__predict_true(hwidx != -1)) + break; - start = sc->flags & BUF_PACKING_OK ? 1 : 0; - delta = FL_BUF_SIZE(sc, start) - bufsize; - if (delta == 0) { - fl->tag_idx = start; /* ideal fit, look no further */ - return; - } - best = start; - largest = start; + /* + * This is a misconfiguration. largest_rx_cluster is + * preventing us from finding a refill source. See + * dev.t5nex.<n>.buffer_sizes to figure out why. + */ + device_printf(sc->dev, "largest_rx_cluster=%u leaves no" + " refill source for fl %p (dma %u). Ignored.\n", + largest_rx_cluster, fl, maxp); + } + for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { + hwb = &hwb_list[idx]; + spare = swz->size - hwb->size; + if (spare < spare_needed) + continue; - for (i = start + 1; i < FL_BUF_SIZES(sc); i++) { - int d, fl_buf_size; + hwidx = idx; /* best option so far */ + if (hwb->size >= maxp) { - fl_buf_size = FL_BUF_SIZE(sc, i); - d = fl_buf_size - bufsize; + if ((fl->flags & FL_BUF_PACKING) == 0) + goto done; /* stop looking (not packing) */ - if (d == 0) { - fl->tag_idx = i; /* ideal fit, look no further */ - return; + if (swz->size >= safest_rx_cluster) + goto done; /* stop looking (packing) */ + } + break; /* keep looking, next zone */ } - if (fl_buf_size > FL_BUF_SIZE(sc, largest)) - largest = i; - if (d > 0 && (delta < 0 || delta > d)) { - delta = d; - best = i; + } +done: + /* A usable hwidx has been located. */ + MPASS(hwidx != -1); + hwb = &hwb_list[hwidx]; + zidx = hwb->zidx; + swz = &sc->sge.sw_zone_info[zidx]; + region1 = 0; + region3 = swz->size - hwb->size; + + /* + * Stay within this zone and see if there is a better match when mbuf + * inlining is allowed. Remember that the hwidx's are sorted in + * decreasing order of size (so in increasing order of spare area). 
+ */ + for (idx = hwidx; idx != -1; idx = hwb->next) { + hwb = &hwb_list[idx]; + spare = swz->size - hwb->size; + + if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) + break; + if (spare < CL_METADATA_SIZE + MSIZE) + continue; + n = (spare - CL_METADATA_SIZE) / MSIZE; + if (n > howmany(hwb->size, maxp)) + break; + + hwidx = idx; + if (fl->flags & FL_BUF_PACKING) { + region1 = n * MSIZE; + region3 = spare - region1; + } else { + region1 = MSIZE; + region3 = spare - region1; + break; } } - if (delta > 0) - fl->tag_idx = best; /* Found a buf bigger than bufsize */ + KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, + ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); + KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, + ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); + KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == + sc->sge.sw_zone_info[zidx].size, + ("%s: bad buffer layout for fl %p, maxp %d. " + "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, + sc->sge.sw_zone_info[zidx].size, region1, + sc->sge.hw_buf_info[hwidx].size, region3)); + if (fl->flags & FL_BUF_PACKING || region1 > 0) { + KASSERT(region3 >= CL_METADATA_SIZE, + ("%s: no room for metadata. fl %p, maxp %d; " + "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, + sc->sge.sw_zone_info[zidx].size, region1, + sc->sge.hw_buf_info[hwidx].size, region3)); + KASSERT(region1 % MSIZE == 0, + ("%s: bad mbuf region for fl %p, maxp %d. " + "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, + sc->sge.sw_zone_info[zidx].size, region1, + sc->sge.hw_buf_info[hwidx].size, region3)); + } + + fl->cll_def.zidx = zidx; + fl->cll_def.hwidx = hwidx; + fl->cll_def.region1 = region1; + fl->cll_def.region3 = region3; +} + +static void +find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) +{ + struct sge *s = &sc->sge; + struct hw_buf_info *hwb; + struct sw_zone_info *swz; + int spare; + int8_t hwidx; + + if (fl->flags & FL_BUF_PACKING) + hwidx = s->safe_hwidx2; /* with room for metadata */ + else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { + hwidx = s->safe_hwidx2; + hwb = &s->hw_buf_info[hwidx]; + swz = &s->sw_zone_info[hwb->zidx]; + spare = swz->size - hwb->size; + + /* no good if there isn't room for an mbuf as well */ + if (spare < CL_METADATA_SIZE + MSIZE) + hwidx = s->safe_hwidx1; + } else + hwidx = s->safe_hwidx1; + + if (hwidx == -1) { + /* No fallback source */ + fl->cll_alt.hwidx = -1; + fl->cll_alt.zidx = -1; + + return; + } + + hwb = &s->hw_buf_info[hwidx]; + swz = &s->sw_zone_info[hwb->zidx]; + spare = swz->size - hwb->size; + fl->cll_alt.hwidx = hwidx; + fl->cll_alt.zidx = hwb->zidx; + if (allow_mbufs_in_cluster) + fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else - fl->tag_idx = largest; /* No buf large enough for bufsize */ + fl->cll_alt.region1 = 0; + fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void @@ -4256,3 +4258,29 @@ sysctl_uint16(SYSCTL_HANDLER_ARGS) return sysctl_handle_int(oidp, &i, 0, req); } + +static int +sysctl_bufsizes(SYSCTL_HANDLER_ARGS) +{ + struct sge *s = arg1; + struct hw_buf_info *hwb = &s->hw_buf_info[0]; + struct sw_zone_info *swz = &s->sw_zone_info[0]; + int i, rc; + struct sbuf sb; + char c; + + sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); + for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { + if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) + c = '*'; + else + c = '\0'; + + sbuf_printf(&sb, "%u%c ", hwb->size, c); + } + sbuf_trim(&sb); + sbuf_finish(&sb); + rc = sysctl_handle_string(oidp, 
sbuf_data(&sb), sbuf_len(&sb), req); + sbuf_delete(&sb); + return (rc); +}
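A note on the new t4_read_chip_settings() logic above: for each software zone it chains the usable hardware buffer sizes through hw_buf_info.next in decreasing order of size (equivalently, increasing order of spare room in the cluster), starting at head_hwidx. Below is a minimal standalone sketch of that insertion, with made-up sizes and without the duplicate-size (zidx = -2) handling in the real code.

#include <stdio.h>

struct hwb {
	int	size;	/* hw buffer size read from SGE_FL_BUFFER_SIZE[i] */
	int	zidx;	/* owning software zone, -1 if unclaimed */
	int	next;	/* next (smaller) hw size in the same zone */
};

int
main(void)
{
	/* Made-up hw buffer sizes for a hypothetical 4KB software zone. */
	struct hwb h[] = {
		{ 2048, -1, -1 },
		{ 4096, -1, -1 },
		{ 4032, -1, -1 },
		{ 3520, -1, -1 },
	};
	int zone_size = 4096, head = -1, tail = -1, i, *cur;

	for (i = 0; i < 4; i++) {
		if (h[i].size > zone_size)
			continue;	/* doesn't fit in this zone */
		h[i].zidx = 0;
		if (head == -1)
			head = tail = i;
		else if (h[i].size < h[tail].size) {
			h[tail].next = i;	/* smallest so far: append */
			tail = i;
		} else {
			/* insert mid-list, keeping sizes in decreasing order */
			for (cur = &head; *cur != -1; cur = &h[*cur].next) {
				if (h[i].size > h[*cur].size) {
					h[i].next = *cur;
					*cur = i;
					break;
				}
			}
		}
	}
	for (i = head; i != -1; i = h[i].next)
		printf("%d ", h[i].size);	/* prints: 4096 4032 3520 2048 */
	printf("\n");
	return (0);
}

find_best_refill_source() then walks such a chain from head_hwidx and settles on the first entry that leaves enough spare room for the cluster metadata (and, in its second pass, for inline mbufs when allow_mbufs_in_cluster is set).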