summary refs log tree commit diff stats
path: root/sys/dev/mxge/if_mxge.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/dev/mxge/if_mxge.c')
-rw-r--r-- sys/dev/mxge/if_mxge.c 355
1 files changed, 219 insertions, 136 deletions
diff --git a/sys/dev/mxge/if_mxge.c b/sys/dev/mxge/if_mxge.c
index ee7aacf..f4fb50c 100644
--- a/sys/dev/mxge/if_mxge.c
+++ b/sys/dev/mxge/if_mxge.c
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp.h>
#include <machine/bus.h>
+#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
@@ -1073,6 +1074,27 @@ mxge_set_multicast_list(mxge_softc_t *sc)
}
static int
+mxge_max_mtu(mxge_softc_t *sc)
+{ /* Work out the largest frame size the receive path may be configured
+ for. mxge_attach() stores the result in sc->max_mtu. Returns either
+ MXGE_MAX_ETHER_MTU - MXGEFW_PAD or MJUMPAGESIZE - MXGEFW_PAD. */
+ mxge_cmd_t cmd;
+ int status;
+
+ if (MJUMPAGESIZE - MXGEFW_PAD > MXGE_MAX_ETHER_MTU)
+ return MXGE_MAX_ETHER_MTU - MXGEFW_PAD;
+
+ /* Try to set nbufs to see if we can use virtually contiguous
+ jumbos: send MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS with data0 = 0
+ as a probe. A zero status means the command was accepted, so
+ the full MXGE_MAX_ETHER_MTU limit applies. */
+ cmd.data0 = 0;
+ status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
+ &cmd);
+ if (status == 0)
+ return MXGE_MAX_ETHER_MTU - MXGEFW_PAD;
+
+ /* otherwise (the probe command failed), we're limited to what
+ fits in a single MJUMPAGESIZE buffer */
+ return MJUMPAGESIZE - MXGEFW_PAD;
+}
+
+static int
mxge_reset(mxge_softc_t *sc)
{
@@ -1139,6 +1161,9 @@ mxge_reset(mxge_softc_t *sc)
sc->rdma_tags_available = 15;
sc->fw_stats->valid = 0;
sc->fw_stats->send_done_count = 0;
+ sc->lro_bad_csum = 0;
+ sc->lro_queued = 0;
+ sc->lro_flushed = 0;
status = mxge_update_mac_address(sc);
mxge_change_promisc(sc, 0);
mxge_change_pause(sc, sc->pause);
@@ -1364,6 +1389,19 @@ mxge_add_sysctls(mxge_softc_t *sc)
CTLFLAG_RW, &mxge_verbose,
0, "verbose printing");
+ /* lro */
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "lro_cnt", CTLFLAG_RW, &sc->lro_cnt,
+ 0, "number of lro merge queues");
+
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
+ 0, "number of lro merge queues flushed");
+
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "lro_queued", CTLFLAG_RD, &sc->lro_queued,
+ 0, "number of frames appended to lro merge queues");
+
}
/* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
@@ -1883,169 +1921,135 @@ done:
static int
mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{ /* Post-patch (+) behaviour: allocate a packet-header cluster mbuf of
rx->cl_size bytes, DMA-load it through 'map', remember it in
rx->info[idx].m and write one big-endian shadow descriptor per DMA
segment starting at slot idx. Returns 0 on success, ENOBUFS when
the mbuf cannot be allocated, or the error from
bus_dmamap_load_mbuf_sg(). */
- bus_dma_segment_t seg;
+ bus_dma_segment_t seg[3]; /* NOTE(review): presumably sized to match the 3-segment DMA tag created in mxge_alloc_rings -- confirm */
struct mbuf *m;
mxge_rx_buf_t *rx = &sc->rx_big;
- int cnt, err;
+ int cnt, err, i;
- m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
+ m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
if (m == NULL) {
rx->alloc_fail++;
err = ENOBUFS;
goto done;
}
- m->m_len = sc->big_bytes;
+ m->m_len = rx->cl_size; /* map the whole cluster, not just one frame's worth */
err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
- &seg, &cnt, BUS_DMA_NOWAIT);
+ seg, &cnt, BUS_DMA_NOWAIT);
if (err != 0) {
m_free(m);
goto done;
}
rx->info[idx].m = m;
- rx->shadow[idx].addr_low =
- htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
- rx->shadow[idx].addr_high =
- htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
+
+ for (i = 0; i < cnt; i++) { /* NOTE(review): fills cnt slots while the loop after done: walks rx->nbufs slots; presumably cnt <= rx->nbufs -- confirm */
+ rx->shadow[idx + i].addr_low =
+ htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
+ rx->shadow[idx + i].addr_high =
+ htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
+ }
+
done: /* also reached on failure: the slots are still walked and submitted below */
- if ((idx & 7) == 7) {
- if (rx->wc_fifo == NULL)
- mxge_submit_8rx(&rx->lanai[idx - 7],
- &rx->shadow[idx - 7]);
- else {
- mb();
- mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
+ for (i = 0; i < rx->nbufs; i++) {
+ if ((idx & 7) == 7) { /* last slot of an aligned group of 8: submit shadow[idx - 7 .. idx] */
+ if (rx->wc_fifo == NULL)
+ mxge_submit_8rx(&rx->lanai[idx - 7],
+ &rx->shadow[idx - 7]);
+ else {
+ mb();
+ mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
+ }
}
- }
+ idx++;
+ }
return err;
}
-static inline void
+/*
+ * Myri10GE hardware checksums are not valid if the sender
+ * padded the frame with non-zero padding. This is because
+ * the firmware just does a simple 16-bit 1s complement
+ * checksum across the entire frame, excluding the first 14
+ * bytes. It is best simply to check the checksum and
+ * tell the stack about it only if the checksum is good.
+ *
+ * Returns 1 for frames that are not IPv4 TCP or UDP. Otherwise
+ * returns the firmware-supplied sum combined with the pseudo-header
+ * via in_pseudo() and then inverted; the receive paths treat a
+ * return value of 0 as "checksum is good". This function no longer
+ * touches m->m_pkthdr itself.
+ */
+
+static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
struct ether_header *eh;
struct ip *ip;
+ uint16_t c;
eh = mtod(m, struct ether_header *);
/* only deal with IPv4 TCP & UDP for now */
if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
- return;
+ return 1;
ip = (struct ip *)(eh + 1);
if (__predict_false(ip->ip_p != IPPROTO_TCP &&
ip->ip_p != IPPROTO_UDP))
- return;
+ return 1;
- /*
- * Myri10GE hardware checksums are not valid if the sender
- * padded the frame with non-zero padding. This is because
- * the firmware just does a simple 16-bit 1s complement
- * checksum across the entire frame, excluding the first 14
- * bytes. It is easiest to simply to assume the worst, and
- * only apply hardware checksums to non-padded frames. This
- * is what nearly every other OS does by default.
- */
-
- if (__predict_true(m->m_pkthdr.len ==
- (ntohs(ip->ip_len) + ETHER_HDR_LEN))) {
- m->m_pkthdr.csum_data = csum;
- m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
- }
+ c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htonl(ntohs(csum) + ntohs(ip->ip_len) +
+ - (ip->ip_hl << 2) + ip->ip_p)); /* firmware sum + (ip_len minus IP header length) + protocol number */
+ c ^= 0xffff; /* invert so that 0 is the value callers test for */
+ return (c);
}
-static inline void
-mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
+
+static inline void
+mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
{ /* Post-patch (+) behaviour: take the single mbuf at the current big-ring
position, replace it via mxge_get_buf_big(), and hand the received
frame (len bytes) either to mxge_lro_rx() or to ifp->if_input. The
removed (-) code chained one mbuf per ring slot instead. */
struct ifnet *ifp;
- struct mbuf *m = 0; /* -Wunitialized */
- struct mbuf *m_prev = 0; /* -Wunitialized */
- struct mbuf *m_head = 0;
- bus_dmamap_t old_map;
+ struct mbuf *m;
mxge_rx_buf_t *rx;
+ bus_dmamap_t old_map;
int idx;
+ uint16_t tcpudp_csum; /* only receives mxge_rx_csum()'s result for the comparison below */
-
- rx = &sc->rx_big;
ifp = sc->ifp;
- while (len > 0) {
- idx = rx->cnt & rx->mask;
- rx->cnt++;
- /* save a pointer to the received mbuf */
- m = rx->info[idx].m;
- /* try to replace the received mbuf */
- if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
- goto drop;
- }
- /* unmap the received buffer */
- old_map = rx->info[idx].map;
- bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
- bus_dmamap_unload(rx->dmat, old_map);
-
- /* swap the bus_dmamap_t's */
- rx->info[idx].map = rx->extra_map;
- rx->extra_map = old_map;
-
- /* chain multiple segments together */
- if (!m_head) {
- m_head = m;
- /* mcp implicitly skips 1st bytes so that
- * packet is properly aligned */
- m->m_data += MXGEFW_PAD;
- m->m_pkthdr.len = len;
- m->m_len = sc->big_bytes - MXGEFW_PAD;
- } else {
- m->m_len = sc->big_bytes;
- m->m_flags &= ~M_PKTHDR;
- m_prev->m_next = m;
- }
- len -= m->m_len;
- m_prev = m;
+ rx = &sc->rx_big;
+ idx = rx->cnt & rx->mask;
+ rx->cnt += rx->nbufs; /* every frame advances the ring by nbufs slots */
+ /* save a pointer to the received mbuf */
+ m = rx->info[idx].m;
+ /* try to replace the received mbuf */
+ if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
+ /* drop the frame -- the old mbuf is re-cycled: rx->info[idx].m
+ and its map are left as they were and only the error
+ counter is bumped */
+ ifp->if_ierrors++;
+ return;
}
- /* trim trailing garbage from the last mbuf in the chain. If
- * there is any garbage, len will be negative */
- m->m_len += len;
+ /* unmap the received buffer */
+ old_map = rx->info[idx].map;
+ bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
+ bus_dmamap_unload(rx->dmat, old_map);
- m_head->m_pkthdr.rcvif = ifp;
+ /* swap the bus_dmamap_t's: the replacement buffer was loaded
+ through extra_map, so that map now belongs to this slot and the
+ old one becomes the spare */
+ rx->info[idx].map = rx->extra_map;
+ rx->extra_map = old_map;
+
+ /* mcp implicitly skips 1st 2 bytes so that packet is properly
+ * aligned */
+ m->m_data += MXGEFW_PAD;
+
+ m->m_pkthdr.rcvif = ifp;
+ m->m_len = m->m_pkthdr.len = len;
ifp->if_ipackets++;
/* if the checksum is valid, mark it in the mbuf header */
- if (sc->csum_flag)
- mxge_rx_csum(m_head, csum);
-
+ if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
+ if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
+ return;
+ /* otherwise, it was a UDP frame, or a TCP frame which
+ we could not do LRO on. Tell the stack that the
+ checksum is good. This point is reached when
+ sc->lro_cnt is 0 or mxge_lro_rx() returned non-zero. */
+ m->m_pkthdr.csum_data = 0xffff;
+ m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
+ }
/* pass the frame up the stack */
- (*ifp->if_input)(ifp, m_head);
- return;
-
-drop:
- /* drop the frame -- the old mbuf(s) are re-cycled by running
- every slot through the allocator */
- if (m_head) {
- len -= sc->big_bytes;
- m_freem(m_head);
- } else {
- len -= (sc->big_bytes + MXGEFW_PAD);
- }
- while ((int)len > 0) {
- idx = rx->cnt & rx->mask;
- rx->cnt++;
- m = rx->info[idx].m;
- if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
- m_freem(m);
- /* unmap the received buffer */
- old_map = rx->info[idx].map;
- bus_dmamap_sync(rx->dmat, old_map,
- BUS_DMASYNC_POSTREAD);
- bus_dmamap_unload(rx->dmat, old_map);
-
- /* swap the bus_dmamap_t's */
- rx->info[idx].map = rx->extra_map;
- rx->extra_map = old_map;
- }
- len -= sc->big_bytes;
- }
-
- ifp->if_ierrors++;
-
+ (*ifp->if_input)(ifp, m);
}
static inline void
@@ -2056,6 +2060,7 @@ mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
mxge_rx_buf_t *rx;
bus_dmamap_t old_map;
int idx;
+ uint16_t tcpudp_csum;
ifp = sc->ifp;
rx = &sc->rx_small;
@@ -2087,8 +2092,15 @@ mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
m->m_len = m->m_pkthdr.len = len;
ifp->if_ipackets++;
/* if the checksum is valid, mark it in the mbuf header */
- if (sc->csum_flag)
- mxge_rx_csum(m, csum);
+ if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
+ if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
+ return;
+ /* otherwise, it was a UDP frame, or a TCP frame which
+ we could not do LRO on. Tell the stack that the
+ checksum is good */
+ m->m_pkthdr.csum_data = 0xffff;
+ m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
+ }
/* pass the frame up the stack */
(*ifp->if_input)(ifp, m);
@@ -2098,6 +2110,7 @@ static inline void
mxge_clean_rx_done(mxge_softc_t *sc)
{
mxge_rx_done_t *rx_done = &sc->rx_done;
+ struct lro_entry *lro;
int limit = 0;
uint16_t length;
uint16_t checksum;
@@ -2106,7 +2119,7 @@ mxge_clean_rx_done(mxge_softc_t *sc)
while (rx_done->entry[rx_done->idx].length != 0) {
length = ntohs(rx_done->entry[rx_done->idx].length);
rx_done->entry[rx_done->idx].length = 0;
- checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
+ checksum = rx_done->entry[rx_done->idx].checksum;
if (length <= (MHLEN - MXGEFW_PAD))
mxge_rx_done_small(sc, length, checksum);
else
@@ -2117,7 +2130,11 @@ mxge_clean_rx_done(mxge_softc_t *sc)
/* limit potential for livelock */
if (__predict_false(++limit > 2 * mxge_max_intr_slots))
break;
-
+ }
+ while(!SLIST_EMPTY(&sc->lro_active)) {
+ lro = SLIST_FIRST(&sc->lro_active);
+ SLIST_REMOVE_HEAD(&sc->lro_active, next);
+ mxge_lro_flush(sc, lro);
}
}
@@ -2447,8 +2464,8 @@ mxge_alloc_rings(mxge_softc_t *sc)
BUS_SPACE_MAXADDR, /* low */
BUS_SPACE_MAXADDR, /* high */
NULL, NULL, /* filter */
- 4096, /* maxsize */
- 1, /* num segs */
+ 3*4096, /* maxsize */
+ 3, /* num segs */
4096, /* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
NULL, NULL, /* lock */
@@ -2512,14 +2529,56 @@ abort_with_nothing:
return err;
}
+static void
+mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
+{ /* Pick the big-receive-ring parameters for a given MTU. Outputs, as
+ used by mxge_open(): *big_buf_size is sent to the firmware with
+ MXGEFW_CMD_SET_BIG_BUFFER_SIZE, *cl_size becomes rx_big.cl_size
+ (the cluster size mxge_get_buf_big() allocates), and *nbufs
+ becomes rx_big.nbufs (sent with
+ MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS). */
+ int bufsize = mtu + ETHER_HDR_LEN + 4 + MXGEFW_PAD;
+
+ if (bufsize < MCLBYTES) {
+ /* easy, everything fits in a single buffer */
+ *big_buf_size = MCLBYTES;
+ *cl_size = MCLBYTES;
+ *nbufs = 1;
+ return;
+ }
+
+ if (bufsize < MJUMPAGESIZE) {
+ /* still easy, everything still fits in a single buffer */
+ *big_buf_size = MJUMPAGESIZE;
+ *cl_size = MJUMPAGESIZE;
+ *nbufs = 1;
+ return;
+ }
+ /* now we need to use virtually contiguous buffers: one
+ MJUM9BYTES cluster presented to the firmware as several
+ 4096-byte buffers.
+ NOTE(review): nbufs below is derived from mtu, whereas the
+ fit tests above use bufsize (mtu plus header and pad bytes).
+ For an mtu just below a multiple of 4096 the two can
+ disagree -- confirm that nbufs * 4096 always covers bufsize. */
+ *cl_size = MJUM9BYTES;
+ *big_buf_size = 4096;
+ *nbufs = mtu / 4096 + 1;
+ /* needs to be a power of two, so round up (3 is the only
+ non-power-of-two value handled here) */
+ if (*nbufs == 3)
+ *nbufs = 4;
+}
+
static int
mxge_open(mxge_softc_t *sc)
{
mxge_cmd_t cmd;
- int i, err;
+ int i, err, big_bytes;
bus_dmamap_t map;
bus_addr_t bus;
+ struct lro_entry *lro_entry;
+ SLIST_INIT(&sc->lro_free);
+ SLIST_INIT(&sc->lro_active);
+
+ for (i = 0; i < sc->lro_cnt; i++) {
+ lro_entry = (struct lro_entry *)
+ malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (lro_entry == NULL) {
+ sc->lro_cnt = i;
+ break;
+ }
+ SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
+ }
/* Copy the MAC address in case it was overridden */
bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
@@ -2532,13 +2591,20 @@ mxge_open(mxge_softc_t *sc)
bzero(sc->rx_done.entry,
mxge_max_intr_slots * sizeof(*sc->rx_done.entry));
- if (MCLBYTES >=
- sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
- sc->big_bytes = MCLBYTES;
- else
- sc->big_bytes = MJUMPAGESIZE;
-
+ mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
+ &sc->rx_big.cl_size, &sc->rx_big.nbufs);
+ cmd.data0 = sc->rx_big.nbufs;
+ err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
+ &cmd);
+ /* error is only meaningful if we're trying to set
+ MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
+ if (err && sc->rx_big.nbufs > 1) {
+ device_printf(sc->dev,
+ "Failed to set alway-use-n to %d\n",
+ sc->rx_big.nbufs);
+ return EIO;
+ }
/* get the lanai pointers to the send and receive rings */
err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
@@ -2580,6 +2646,10 @@ mxge_open(mxge_softc_t *sc)
}
}
for (i = 0; i <= sc->rx_big.mask; i++) {
+ sc->rx_big.shadow[i].addr_low = 0xffffffff;
+ sc->rx_big.shadow[i].addr_high = 0xffffffff;
+ }
+ for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
map = sc->rx_big.info[i].map;
err = mxge_get_buf_big(sc, map, i);
if (err) {
@@ -2592,12 +2662,12 @@ mxge_open(mxge_softc_t *sc)
/* Give the firmware the mtu and the big and small buffer
sizes. The firmware wants the big buf size to be a power
of two. Luckily, FreeBSD's clusters are powers of two */
- cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
+ cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + 4;
err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
cmd.data0 = MHLEN - MXGEFW_PAD;
err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
&cmd);
- cmd.data0 = sc->big_bytes;
+ cmd.data0 = big_bytes;
err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
if (err != 0) {
@@ -2651,6 +2721,7 @@ abort:
static int
mxge_close(mxge_softc_t *sc)
{
+ struct lro_entry *lro_entry;
mxge_cmd_t cmd;
int err, old_down_cnt;
@@ -2671,6 +2742,10 @@ mxge_close(mxge_softc_t *sc)
mxge_free_mbufs(sc);
+ while (!SLIST_EMPTY(&sc->lro_free)) {
+ lro_entry = SLIST_FIRST(&sc->lro_free);
+ SLIST_REMOVE_HEAD(&sc->lro_free, next);
+ }
return 0;
}
@@ -2833,8 +2908,7 @@ mxge_change_mtu(mxge_softc_t *sc, int mtu)
real_mtu = mtu + ETHER_HDR_LEN;
- if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
- real_mtu < 60)
+ if ((real_mtu > sc->max_mtu) || real_mtu < 60)
return EINVAL;
mtx_lock(&sc->driver_mtx);
old_mtu = ifp->if_mtu;
@@ -2981,6 +3055,7 @@ mxge_fetch_tunables(mxge_softc_t *sc)
TUNABLE_INT_FETCH("hw.mxge.verbose",
&mxge_verbose);
TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
+ TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
if (bootverbose)
mxge_verbose = 1;
@@ -2989,6 +3064,7 @@ mxge_fetch_tunables(mxge_softc_t *sc)
if (mxge_ticks == 0)
mxge_ticks = hz;
sc->pause = mxge_flow_control;
+
}
static int
@@ -3145,8 +3221,14 @@ mxge_attach(device_t dev)
/* hook into the network stack */
if_initname(ifp, device_get_name(dev), device_get_unit(dev));
ifp->if_baudrate = 100000000;
- ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
- IFCAP_JUMBO_MTU;
+ ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4;
+ sc->max_mtu = mxge_max_mtu(sc);
+ if (sc->max_mtu >= 9000)
+ ifp->if_capabilities |= IFCAP_JUMBO_MTU;
+ else
+ device_printf(dev, "MTU limited to %d. Install "
+ "latest firmware for 9000 byte jumbo support",
+ sc->max_mtu - ETHER_HDR_LEN);
ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
ifp->if_capenable = ifp->if_capabilities;
sc->csum_flag = 1;
@@ -3157,7 +3239,8 @@ mxge_attach(device_t dev)
ifp->if_start = mxge_start;
ether_ifattach(ifp, sc->mac_addr);
/* ether_ifattach sets mtu to 1500 */
- ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
+ if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
+ ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
/* Initialise the ifmedia structure */
ifmedia_init(&sc->media, 0, mxge_media_change,
OpenPOWER on IntegriCloud