From 3ec63ec821cd2663036e1b5fe134f37875691ae5 Mon Sep 17 00:00:00 2001
From: adrian
Date: Wed, 21 Oct 2015 01:41:18 +0000
Subject: arge: don't do the rx fixup copy and just offset the mbuf by 2 bytes

The existing code meets the "alignment" requirement for the L3 payload
by offsetting the mbuf by sizeof(uint64_t) and then calling an RX fixup
routine to copy the frame backwards by 2 bytes.  This DWORD aligns the
L3 payload so TCP, etc. doesn't panic on unaligned access.

This is ... slow.

For arge MACs that support 1-byte TX/RX address alignment, we can do
the "other" hack: offset the RX address of the mbuf by 2 bytes so the
L3 payload again ends up (hopefully) DWORD aligned.  This is much
cheaper: since both TX and RX are 1-byte-alignment ready (thanks to
the previous commit) there is no bounce buffering and no RX fixup
copying going on at all.

This brings bridging performance up from 180 Mbit/sec to 410 Mbit/sec.
Around 10% of CPU cycles are still spent in _bus_dmamap_sync(); I'll
investigate that later.

Tested:

* QCA955x SoC (AP135 reference board), bridging arge0/arge1 by
  programming the switch to have two vlangroups in dot1q mode:

# ifconfig bridge0 inet 192.168.2.20/24
# etherswitchcfg config vlan_mode dot1q
# etherswitchcfg vlangroup0 members 0,1,2,3,4
# etherswitchcfg vlangroup1 vlan 2 members 5,6
# etherswitchcfg port5 pvid 2
# etherswitchcfg port6 pvid 2
# ifconfig arge1 up
# ifconfig bridge0 addm arge1
---
 sys/mips/atheros/if_arge.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/sys/mips/atheros/if_arge.c b/sys/mips/atheros/if_arge.c
index 446cd64..7a3efff 100644
--- a/sys/mips/atheros/if_arge.c
+++ b/sys/mips/atheros/if_arge.c
@@ -2165,6 +2165,7 @@ arge_newbuf(struct arge_softc *sc, int idx)
 	bus_dmamap_t map;
 	int nsegs;
 
+	/* XXX TODO: should just allocate an explicit 2KiB buffer */
 	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return (ENOBUFS);
@@ -2174,7 +2175,15 @@ arge_newbuf(struct arge_softc *sc, int idx)
 	 * Add extra space to "adjust" (copy) the packet back to be aligned
 	 * for purposes of IPv4/IPv6 header contents.
 	 */
-	m_adj(m, sizeof(uint64_t));
+	if (sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE)
+		m_adj(m, sizeof(uint64_t));
+	/*
+	 * If it's a 1-byte aligned buffer, then just offset it two bytes
+	 * and that will give us a hopefully correctly DWORD aligned
+	 * L3 payload - and we won't have to undo it afterwards.
+	 */
+	else if (sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_1BYTE)
+		m_adj(m, sizeof(uint16_t));
 
 	if (bus_dmamap_load_mbuf_sg(sc->arge_cdata.arge_rx_tag,
 	    sc->arge_cdata.arge_rx_sparemap, m, segs, &nsegs, 0) != 0) {
@@ -2186,6 +2195,11 @@ arge_newbuf(struct arge_softc *sc, int idx)
 	rxd = &sc->arge_cdata.arge_rxdesc[idx];
 	if (rxd->rx_m != NULL) {
 		bus_dmamap_unload(sc->arge_cdata.arge_rx_tag, rxd->rx_dmamap);
+		/* XXX TODO: free rx_m? */
+		device_printf(sc->arge_dev,
+		    "%s: ring[%d] rx_m wasn't free?\n",
+		    __func__,
+		    idx);
 	}
 	map = rxd->rx_dmamap;
 	rxd->rx_dmamap = sc->arge_cdata.arge_rx_sparemap;
@@ -2205,6 +2219,13 @@ arge_newbuf(struct arge_softc *sc, int idx)
 	return (0);
 }
 
+/*
+ * Move the data backwards 16 bits to (hopefully!) ensure the
+ * IPv4/IPv6 payload is aligned.
+ *
+ * This is required for earlier hardware where the RX path
+ * requires DWORD aligned buffers.
+ */
 static __inline void
 arge_fixup_rx(struct mbuf *m)
 {
@@ -2344,7 +2365,13 @@ arge_rx_locked(struct arge_softc *sc)
 		    BUS_DMASYNC_POSTREAD);
 
 		m = rxd->rx_m;
-		arge_fixup_rx(m);
+		/*
+		 * If the MAC requires 4 byte alignment then the RX setup
+		 * routine will have pre-offset things; so un-offset it here.
+		 */
+		if (sc->arge_hw_flags & ARGE_HW_FLG_RX_DESC_ALIGN_4BYTE)
+			arge_fixup_rx(m);
+
 		m->m_pkthdr.rcvif = ifp;
 		/* Skip 4 bytes of CRC */
 		m->m_pkthdr.len = m->m_len = packet_len - ETHER_CRC_LEN;
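
For reviewers who want the alignment arithmetic spelled out: an Ethernet
header is 14 bytes, so a frame DMA'd to a DWORD aligned address leaves the
IPv4/IPv6 header at offset 14, i.e. 2 (mod 4).  Below is a small
stand-alone sketch of the two strategies this patch switches between.  It
is illustrative only; rx_dma_offset() and rx_fixup_copy() are hypothetical
names, not functions from if_arge.c.

/*
 * Illustrative sketch only; rx_dma_offset() and rx_fixup_copy() are
 * hypothetical names, not if_arge.c functions.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define	ETHER_HDR_LEN	14	/* dst MAC (6) + src MAC (6) + ethertype (2) */

/*
 * Cheap path (MACs with 1-byte RX address alignment): start the DMA
 * 2 bytes into a DWORD aligned buffer, so the L3 header lands at
 * offset 16 and is DWORD aligned with no copying at all.
 */
static uint8_t *
rx_dma_offset(uint8_t *buf)
{

	return (buf + 2);
}

/*
 * Slow path (MACs that need DWORD aligned DMA addresses): the frame
 * arrives at a DWORD aligned address, so the L3 header sits at
 * offset 14 (mod 4 == 2).  Copy the whole frame backwards 2 bytes,
 * which is the per-packet cost arge_fixup_rx() pays.
 */
static uint8_t *
rx_fixup_copy(uint8_t *frame, size_t len)
{

	memmove(frame - 2, frame, len);
	return (frame - 2);
}

int
main(void)
{
	/* A DWORD aligned cluster, as m_getcl() would hand back. */
	static uint8_t buf[2048] __attribute__((aligned(4)));

	/* Cheap path: L3 header at buf + 2 + 14 = buf + 16, aligned. */
	uint8_t *f = rx_dma_offset(buf);
	assert(((uintptr_t)(f + ETHER_HDR_LEN) & 3) == 0);

	/*
	 * Slow path: frame lands at buf + 8 (the old sizeof(uint64_t)
	 * offset), leaving the L3 header misaligned at buf + 22 until
	 * the per-packet copy pulls it back to buf + 20.
	 */
	f = buf + 8;
	assert(((uintptr_t)(f + ETHER_HDR_LEN) & 3) != 0);
	f = rx_fixup_copy(f, 64);	/* pretend a 64-byte frame arrived */
	assert(((uintptr_t)(f + ETHER_HDR_LEN) & 3) == 0);

	return (0);
}

With the cheap path the only remaining per-packet RX work is the
bus_dmamap_sync() of the descriptor and buffer, which matches the ~10%
of CPU cycles noted in the commit message.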