summary refs log tree commit diff stats
diff options
context:
space:
mode:
author adrian <adrian@FreeBSD.org> 2016-06-19 03:45:32 +0000
committer adrian <adrian@FreeBSD.org> 2016-06-19 03:45:32 +0000
commit fa16c812f5b570ef8994dae9eace7bb20d997f87 (patch)
tree e060d0a33f334d75ebcfff5ae49a0bb607020fc2
parent ba73a9e6514c95236c4f7b7b65336c567e2538ba (diff)
download FreeBSD-src-fa16c812f5b570ef8994dae9eace7bb20d997f87.zip
FreeBSD-src-fa16c812f5b570ef8994dae9eace7bb20d997f87.tar.gz
[ath] add support for batching frames to the general TX queues.
It turns out the frame scheduling policies (e.g. DBA_GATED) operate on a single TX FIFO entry. ASAP scheduling is fine; those frames always go out. DBA-gated sets the TX queue ready when the DBA timer fires, which triggers a beacon transmit. Normally this is used for content-after-beacon queue (CABQ) work, which needs to burst out immediately after a beacon (e.g. broadcast, multicast, etc. frames). This is a general policy that you can use for any queue, and Sam's TDMA code uses it. When DBA_GATED is used together with something like, say, an 11e TX burst window, it only operates on a single TX FIFO entry. If you have a single frame per TX FIFO entry and, say, a 2.5ms long burst window (e.g. TDMA!), then it'll only burst a single frame every 2.5ms. If there's no gating (e.g. ASAP) then the burst window is fine, and multiple TX FIFO slots get used. The CABQ code does pack in a list of frames (i.e. the whole cabq), but up until this commit the normal TX queues didn't. This showed up when I started to debug TDMA on the AR9380 and later chips. This commit doesn't fix the TDMA case - that's still broken here, because all I'm doing here is allowing 'some' frames to be burst, but I'm certainly not filling the whole TX FIFO slot entry with frames. Doing that 'properly' kind of requires me to take into account how long packets should take to transmit and, say, queue 1.5 or so times that per TX FIFO slot, because if you partially transmit a slot, then when it's next gated it'll just finish that TX FIFO slot and not advance to the next one. Now, I /also/ think queuing a new packet restarts DMA, but you have to push new frames into the TX FIFO. I need to experiment some more with this because if it's really the case, I will be able to do TDMA support without the egregious hacks I have in my local tree. Sam's TDMA code for previous chips would just kick the TXE bit to push along DMA again, but we can't do that for EDMA chips - we /have/ to push a new frame into the TX FIFO to restart DMA. Ugh.
Tested:
* AR9380, STA mode
* AR9380, hostap mode
* AR9580, hostap mode

Approved by: re (gjb)
-rw-r--r-- sys/dev/ath/if_ath_tx_edma.c 209
1 files changed, 158 insertions, 51 deletions
diff --git a/sys/dev/ath/if_ath_tx_edma.c b/sys/dev/ath/if_ath_tx_edma.c
index 44835c5..c972019 100644
--- a/sys/dev/ath/if_ath_tx_edma.c
+++ b/sys/dev/ath/if_ath_tx_edma.c
@@ -138,79 +138,186 @@ MALLOC_DECLARE(M_ATHDEV);
static void ath_edma_tx_processq(struct ath_softc *sc, int dosched);
-/*
- * Push some frames into the TX FIFO if we have space.
- */
+#ifdef ATH_DEBUG_ALQ
static void
-ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
+ath_tx_alq_edma_push(struct ath_softc *sc, int txq, int nframes,
+ int fifo_depth, int frame_cnt)
+{
+ struct if_ath_alq_tx_fifo_push aq;
+
+ aq.txq = htobe32(txq);
+ aq.nframes = htobe32(nframes);
+ aq.fifo_depth = htobe32(fifo_depth);
+ aq.frame_cnt = htobe32(frame_cnt);
+
+ if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TX_FIFO_PUSH,
+ sizeof(aq),
+ (const char *) &aq);
+}
+#endif /* ATH_DEBUG_ALQ */
+
+static void
+ath_tx_edma_push_staging_list(struct ath_softc *sc, struct ath_txq *txq,
+ int limit)
{
struct ath_buf *bf, *bf_last;
- int i = 0;
+ struct ath_buf *bfi, *bfp;
+ int i, sqdepth;
+ TAILQ_HEAD(axq_q_f_s, ath_buf) sq;
ATH_TXQ_LOCK_ASSERT(txq);
- DPRINTF(sc, ATH_DEBUG_TX_PROC, "%s: Q%d: called\n",
- __func__,
- txq->axq_qnum);
+ /*
+ * Don't bother doing any work if it's full.
+ */
+ if (txq->axq_fifo_depth >= HAL_TXFIFO_DEPTH)
+ return;
- TAILQ_FOREACH(bf, &txq->axq_q, bf_list) {
- if (txq->axq_fifo_depth >= HAL_TXFIFO_DEPTH)
- break;
+ if (TAILQ_EMPTY(&txq->axq_q))
+ return;
- /*
- * We have space in the FIFO - so let's push a frame
- * into it.
- */
+ TAILQ_INIT(&sq);
- /*
- * Remove it from the normal list
- */
+ /*
+ * First pass - walk the txq, move up to 'limit' entries
+ * from it onto our local staging list (sq).
+ */
+ sqdepth = 0;
+ for (i = 0; i < limit; i++) {
+ /* Grab the head entry */
+ bf = ATH_TXQ_FIRST(txq);
+ if (bf == NULL)
+ break;
ATH_TXQ_REMOVE(txq, bf, bf_list);
- /*
- * XXX for now, we only dequeue a frame at a time, so
- * that's only one buffer. Later on when we just
- * push this staging _list_ into the queue, we'll
- * set bf_last to the end pointer in the list.
- */
- bf_last = bf;
- DPRINTF(sc, ATH_DEBUG_TX_PROC,
- "%s: Q%d: depth=%d; pushing %p->%p\n",
- __func__,
- txq->axq_qnum,
- txq->axq_fifo_depth,
- bf,
- bf_last);
+ /* Queue it into our staging list */
+ TAILQ_INSERT_TAIL(&sq, bf, bf_list);
+ sqdepth++;
+ }
- /*
- * Append it to the FIFO staging list
- */
- ATH_TXQ_INSERT_TAIL(&txq->fifo, bf, bf_list);
+ /*
+ * Ok, so now we have a staging list of up to 'limit'
+ * frames from the txq. Now let's wrap that up
+ * into its own list and pass that to the hardware
+ * as one FIFO entry.
+ */
- /*
- * Set fifo start / fifo end flags appropriately
- *
- */
- bf->bf_flags |= ATH_BUF_FIFOPTR;
- bf_last->bf_flags |= ATH_BUF_FIFOEND;
+ bf = TAILQ_FIRST(&sq);
+ bf_last = TAILQ_LAST(&sq, axq_q_s);
- /*
- * Push _into_ the FIFO.
- */
- ath_hal_puttxbuf(sc->sc_ah, txq->axq_qnum, bf->bf_daddr);
+ /*
+ * Ok, so here's the gymnastics required to make this
+ * all sensible.
+ */
+
+ /*
+ * Tag the first/last buffer appropriately.
+ */
+ bf->bf_flags |= ATH_BUF_FIFOPTR;
+ bf_last->bf_flags |= ATH_BUF_FIFOEND;
+
+ /*
+ * Walk the descriptor list and link them appropriately.
+ */
+ bfp = NULL;
+ TAILQ_FOREACH(bfi, &sq, bf_list) {
+ if (bfp != NULL) {
+ ath_hal_settxdesclink(sc->sc_ah, bfp->bf_lastds,
+ bfi->bf_daddr);
+ }
+ bfp = bfi;
+ }
+
+ i = 0;
+ TAILQ_FOREACH(bfi, &sq, bf_list) {
#ifdef ATH_DEBUG
if (sc->sc_debug & ATH_DEBUG_XMIT_DESC)
- ath_printtxbuf(sc, bf, txq->axq_qnum, i, 0);
+ ath_printtxbuf(sc, bfi, txq->axq_qnum, i, 0);
#endif/* ATH_DEBUG */
#ifdef ATH_DEBUG_ALQ
if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_EDMA_TXDESC))
- ath_tx_alq_post(sc, bf);
+ ath_tx_alq_post(sc, bfi);
#endif /* ATH_DEBUG_ALQ */
- txq->axq_fifo_depth++;
i++;
}
- if (i > 0)
- ath_hal_txstart(sc->sc_ah, txq->axq_qnum);
+
+ /*
+ * We now need to push this set of frames onto the tail
+ * of the FIFO queue. We don't adjust the aggregate
+ * count, only the queue depth counter(s).
+ * We also need to blank the link pointer now.
+ */
+
+ TAILQ_CONCAT(&txq->fifo.axq_q, &sq, bf_list);
+ /* Bump total queue tracking in FIFO queue */
+ txq->fifo.axq_depth += sqdepth;
+
+ /* Bump FIFO queue */
+ txq->axq_fifo_depth++;
+ DPRINTF(sc, ATH_DEBUG_XMIT,
+ "%s: queued %d packets; depth=%d, fifo depth=%d\n",
+ __func__, sqdepth, txq->fifo.axq_depth, txq->axq_fifo_depth);
+
+ /* Push the first entry into the hardware */
+ ath_hal_puttxbuf(sc->sc_ah, txq->axq_qnum, bf->bf_daddr);
+
+ /* Push start on the DMA if it's not already started */
+ ath_hal_txstart(sc->sc_ah, txq->axq_qnum);
+
+#ifdef ATH_DEBUG_ALQ
+ ath_tx_alq_edma_push(sc, txq->axq_qnum, sqdepth,
+ txq->axq_fifo_depth,
+ txq->fifo.axq_depth);
+#endif /* ATH_DEBUG_ALQ */
+}
+
+/*
+ * Push some frames into the TX FIFO if we have space.
+ */
+static void
+ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
+{
+
+ ATH_TXQ_LOCK_ASSERT(txq);
+
+ DPRINTF(sc, ATH_DEBUG_TX_PROC, "%s: Q%d: called\n",
+ __func__,
+ txq->axq_qnum);
+
+ /*
+ * For now, push up to 4 frames per TX FIFO slot.
+ * If more are in the hardware queue then they'll
+ * get populated when we try to send another frame
+ * or complete a frame - so at most there'll be
+ * 32 non-AMPDU frames per TXQ.
+ *
+ * Note that the hardware staging queue will limit
+ * how many frames in total we will have pushed into
+ * here.
+ *
+ * Later on, we'll want to push less frames into
+ * the TX FIFO since we don't want to necessarily
+ * fill tens or hundreds of milliseconds of potential
+ * frames.
+ *
+ * However, we need more frames right now because of
+ * how the MAC implements the frame scheduling policy.
+ * It only ungates a single FIFO entry at a time,
+ * and will run that until CHNTIME expires or the
+ * end of that FIFO entry descriptor list is reached.
+ * So for TDMA we suffer a big performance penalty -
+ * single TX FIFO entries mean the MAC only sends out
+ * one frame per DBA event, which turned out on average
+ * 6ms per TX frame.
+ *
+ * So, for aggregates it's okay - it'll push two at a
+ * time and this will just do them more efficiently.
+ * For non-aggregates it'll do 4 at a time, up to the
+ * non-aggr limit (non_aggr, which is 32.) They should
+ * be time based rather than a hard count, but I also
+ * do need sleep.
+ */
+ ath_tx_edma_push_staging_list(sc, txq, 4);
}
/*
OpenPOWER on IntegriCloud