author     rrs <rrs@FreeBSD.org>  2013-02-07 15:20:54 +0000
committer  rrs <rrs@FreeBSD.org>  2013-02-07 15:20:54 +0000
commit     75ad250e9798bf02f5b9ad805d88a8852c3e9545 (patch)
tree       3ab01606aaa34364e5d367c3b493191e97cde393 /sys
parent     305acfc1e8b3cbdb6e01a52a8fcf8ae561eae70d (diff)
This fixes an out-of-order problem with several of the newer drivers.

The basic problem was that a driver would pull an mbuf off the drbr ring and then, when handing it to its xmit() routine, encounter a full transmit ring. The lower-layer xmit() function would return an error, and the driver would then append the mbuf back onto the ring, reordering it. For TCP this is a horrible scenario, sure to bring on a fast retransmit.

The fix is to use drbr_peek() to obtain the mbuf pointer without removing it from the ring. If xmit() fails we call either the new drbr_putback() or drbr_advance() method. drbr_advance() moves the consumer index forward; we use it on failure when the xmit() function has freed the mbuf, and we always call it after a successful send. drbr_putback() copies the mbuf back into the head slot of the ring. Note that drbr_putback() *cannot* be used after a drbr_dequeue(), only after a drbr_peek(). Most of the time the putback would not actually need to copy anything, since the mbuf in the slot is most likely unchanged, but an xmit() function may replace the mbuf via a pullup or other call, so for the single-consumer case the optimal approach is to always store it back. If we ever do a multiple-consumer version (for lagg?) the putback will need a test and an atomic, possibly as a separate putback_mc() in the buf ring.

Reviewed by:	jhb@freebsd.org, jlv@freebsd.org
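For reference, below is a minimal sketch of the peek/advance/putback pattern the drivers are converted to. It is not part of the commit; example_mq_start_locked() and hw_xmit() are hypothetical names, with hw_xmit() standing in for a driver encapsulation routine (em_xmit(), igb_xmit(), ...) that may modify or free the mbuf and therefore takes a struct mbuf **.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

/*
 * Hypothetical encap routine.  On failure it either leaves *mp intact
 * (hardware ring full) or frees the mbuf and sets *mp to NULL
 * (unsendable frame).  This stub simply pretends the ring is full.
 */
static int
hw_xmit(struct mbuf **mp)
{
	(void)mp;
	return (ENOBUFS);
}

static int
example_mq_start_locked(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
{
	struct mbuf *next;
	int err = 0;

	/* New work always goes through the ring so ordering is preserved. */
	if (m != NULL && (err = drbr_enqueue(ifp, br, m)) != 0)
		return (err);

	/* Peek, try to send, and only then consume the slot. */
	while ((next = drbr_peek(ifp, br)) != NULL) {
		if ((err = hw_xmit(&next)) != 0) {
			if (next == NULL) {
				/* hw_xmit() freed the mbuf; skip past it. */
				drbr_advance(ifp, br);
			} else {
				/*
				 * Transmit ring full: leave the (possibly
				 * modified) mbuf at the head of the drbr
				 * ring.  Re-enqueueing it here is what used
				 * to reorder frames.
				 */
				drbr_putback(ifp, br, next);
			}
			break;
		}
		/* Sent successfully; now it is safe to consume the slot. */
		drbr_advance(ifp, br);
	}
	return (err);
}

The key difference from the old drbr_dequeue() loop is that the ring slot is only consumed (drbr_advance()) once the hardware has accepted the frame, so a full transmit ring never forces a reordering re-enqueue.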
Diffstat (limited to 'sys')
-rw-r--r--  sys/dev/bxe/if_bxe.c               28
-rw-r--r--  sys/dev/e1000/if_em.c              25
-rw-r--r--  sys/dev/e1000/if_igb.c             33
-rw-r--r--  sys/dev/ixgbe/ixgbe.c              23
-rw-r--r--  sys/dev/ixgbe/ixv.c                24
-rw-r--r--  sys/dev/oce/oce_if.c               18
-rw-r--r--  sys/net/if_var.h                   53
-rw-r--r--  sys/ofed/drivers/net/mlx4/en_tx.c  20
-rw-r--r--  sys/sys/buf_ring.h                 48
9 files changed, 189 insertions, 83 deletions
diff --git a/sys/dev/bxe/if_bxe.c b/sys/dev/bxe/if_bxe.c
index 52d2c1b..9566b27 100644
--- a/sys/dev/bxe/if_bxe.c
+++ b/sys/dev/bxe/if_bxe.c
@@ -9506,24 +9506,15 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
BXE_FP_LOCK_ASSERT(fp);
- if (m == NULL) {
- /* No new work, check for pending frames. */
- next = drbr_dequeue(ifp, fp->br);
- } else if (drbr_needs_enqueue(ifp, fp->br)) {
- /* Both new and pending work, maintain packet order. */
+ if (m != NULL) {
rc = drbr_enqueue(ifp, fp->br, m);
if (rc != 0) {
fp->tx_soft_errors++;
goto bxe_tx_mq_start_locked_exit;
}
- next = drbr_dequeue(ifp, fp->br);
- } else
- /* New work only, nothing pending. */
- next = m;
-
+ }
/* Keep adding entries while there are frames to send. */
- while (next != NULL) {
-
+ while ((next = drbr_peek(ifp, fp->br)) != NULL) {
/* The transmit mbuf now belongs to us, keep track of it. */
fp->tx_mbuf_alloc++;
@@ -9537,23 +9528,22 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
if (__predict_false(rc != 0)) {
fp->tx_encap_failures++;
/* Very Bad Frames(tm) may have been dropped. */
- if (next != NULL) {
+ if (next == NULL) {
+ drbr_advance(ifp, fp->br);
+ } else {
+ drbr_putback(ifp, fp->br, next);
/*
* Mark the TX queue as full and save
* the frame.
*/
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
fp->tx_frame_deferred++;
-
- /* This may reorder frame. */
- rc = drbr_enqueue(ifp, fp->br, next);
fp->tx_mbuf_alloc--;
}
-
/* Stop looking for more work. */
break;
}
-
+ drbr_advance(ifp, fp->br);
/* The transmit frame was enqueued successfully. */
tx_count++;
@@ -9574,8 +9564,6 @@ bxe_tx_mq_start_locked(struct ifnet *ifp,
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
break;
}
-
- next = drbr_dequeue(ifp, fp->br);
}
/* No TX packets were dequeued. */
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index 3344697..a431266 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -905,22 +905,24 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
}
enq = 0;
- if (m == NULL) {
- next = drbr_dequeue(ifp, txr->br);
- } else if (drbr_needs_enqueue(ifp, txr->br)) {
- if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+ if (m != NULL) {
+ err = drbr_enqueue(ifp, txr->br, m);
+ if (err) {
return (err);
- next = drbr_dequeue(ifp, txr->br);
- } else
- next = m;
+ }
+ }
/* Process the queue */
- while (next != NULL) {
+ while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = em_xmit(txr, &next)) != 0) {
- if (next != NULL)
- err = drbr_enqueue(ifp, txr->br, next);
- break;
+ if (next == NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next);
+ }
+ break;
}
+ drbr_advance(ifp, txr->br);
enq++;
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -928,7 +930,6 @@ em_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
ETHER_BPF_MTAP(ifp, next);
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
- next = drbr_dequeue(ifp, txr->br);
}
if (enq > 0) {
diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 2921a91..8c6b429 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -350,6 +350,16 @@ TUNABLE_INT("hw.igb.max_interrupt_rate", &igb_max_interrupt_rate);
SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
&igb_max_interrupt_rate, 0, "Maximum interrupts per second");
+#if __FreeBSD_version >= 800000
+/*
+** Tuneable number of buffers in the buf-ring (drbr_xxx)
+*/
+static int igb_buf_ring_size = IGB_BR_SIZE;
+TUNABLE_INT("hw.igb.buf_ring_size", &igb_buf_ring_size);
+SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN,
+ &igb_buf_ring_size, 0, "Size of the bufring");
+#endif
+
/*
** Header split causes the packet header to
** be dma'd to a seperate mbuf from the payload.
@@ -965,12 +975,13 @@ igb_mq_start(struct ifnet *ifp, struct mbuf *m)
** out-of-order delivery, but
** settle for it if that fails
*/
- if (m)
+ if (m != NULL)
drbr_enqueue(ifp, txr->br, m);
err = igb_mq_start_locked(ifp, txr);
IGB_TX_UNLOCK(txr);
} else {
- err = drbr_enqueue(ifp, txr->br, m);
+ if (m != NULL)
+ err = drbr_enqueue(ifp, txr->br, m);
taskqueue_enqueue(que->tq, &txr->txq_task);
}
@@ -994,12 +1005,22 @@ igb_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
enq = 0;
/* Process the queue */
- while ((next = drbr_dequeue(ifp, txr->br)) != NULL) {
+ while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = igb_xmit(txr, &next)) != 0) {
- if (next != NULL)
- err = drbr_enqueue(ifp, txr->br, next);
+ if (next == NULL) {
+ /* It was freed, move forward */
+ drbr_advance(ifp, txr->br);
+ } else {
+ /*
+ * Still have one left, it may not be
+ * the same since the transmit function
+ * may have changed it.
+ */
+ drbr_putback(ifp, txr->br, next);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enq++;
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -3301,7 +3322,7 @@ igb_allocate_queues(struct adapter *adapter)
}
#if __FreeBSD_version >= 800000
/* Allocate a buf ring */
- txr->br = buf_ring_alloc(IGB_BR_SIZE, M_DEVBUF,
+ txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF,
M_WAITOK, &txr->tx_mtx);
#endif
}
diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c
index cfcc342..0b77437 100644
--- a/sys/dev/ixgbe/ixgbe.c
+++ b/sys/dev/ixgbe/ixgbe.c
@@ -832,22 +832,24 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
}
enqueued = 0;
- if (m == NULL) {
- next = drbr_dequeue(ifp, txr->br);
- } else if (drbr_needs_enqueue(ifp, txr->br)) {
- if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+ if (m != NULL) {
+ err = drbr_enqueue(ifp, txr->br, m);
+ if (err) {
return (err);
- next = drbr_dequeue(ifp, txr->br);
- } else
- next = m;
+ }
+ }
/* Process the queue */
- while (next != NULL) {
+ while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = ixgbe_xmit(txr, &next)) != 0) {
- if (next != NULL)
- err = drbr_enqueue(ifp, txr->br, next);
+ if (next == NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enqueued++;
/* Send a copy of the frame to the BPF listener */
ETHER_BPF_MTAP(ifp, next);
@@ -855,7 +857,6 @@ ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
break;
if (txr->tx_avail < IXGBE_TX_OP_THRESHOLD)
ixgbe_txeof(txr);
- next = drbr_dequeue(ifp, txr->br);
}
if (enqueued > 0) {
diff --git a/sys/dev/ixgbe/ixv.c b/sys/dev/ixgbe/ixv.c
index ef5f753..b3a3d64 100644
--- a/sys/dev/ixgbe/ixv.c
+++ b/sys/dev/ixgbe/ixv.c
@@ -620,22 +620,23 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
ixv_txeof(txr);
enqueued = 0;
- if (m == NULL) {
- next = drbr_dequeue(ifp, txr->br);
- } else if (drbr_needs_enqueue(ifp, txr->br)) {
- if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+ if (m != NULL) {
+ err = drbr_enqueue(ifp, txr->br, m);
+ if (err) {
return (err);
- next = drbr_dequeue(ifp, txr->br);
- } else
- next = m;
-
+ }
+ }
/* Process the queue */
- while (next != NULL) {
+ while ((next = drbr_peek(ifp, txr->br)) != NULL) {
if ((err = ixv_xmit(txr, &next)) != 0) {
- if (next != NULL)
- err = drbr_enqueue(ifp, txr->br, next);
+ if (next == NULL) {
+ drbr_advance(ifp, txr->br);
+ } else {
+ drbr_putback(ifp, txr->br, next);
+ }
break;
}
+ drbr_advance(ifp, txr->br);
enqueued++;
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -648,7 +649,6 @@ ixv_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr, struct mbuf *m)
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
break;
}
- next = drbr_dequeue(ifp, txr->br);
}
if (enqueued > 0) {
diff --git a/sys/dev/oce/oce_if.c b/sys/dev/oce/oce_if.c
index cba57bf..2b4c181 100644
--- a/sys/dev/oce/oce_if.c
+++ b/sys/dev/oce/oce_if.c
@@ -1166,29 +1166,27 @@ oce_multiq_transmit(struct ifnet *ifp, struct mbuf *m, struct oce_wq *wq)
return status;
}
- if (m == NULL)
- next = drbr_dequeue(ifp, br);
- else if (drbr_needs_enqueue(ifp, br)) {
+ if (m != NULL) {
if ((status = drbr_enqueue(ifp, br, m)) != 0)
return status;
- next = drbr_dequeue(ifp, br);
- } else
- next = m;
-
- while (next != NULL) {
+ }
+ while ((next = drbr_peek(ifp, br)) != NULL) {
if (oce_tx(sc, &next, queue_index)) {
- if (next != NULL) {
+ if (next == NULL) {
+ drbr_advance(ifp, br);
+ } else {
+ drbr_putback(ifp, br, next);
wq->tx_stats.tx_stops ++;
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
status = drbr_enqueue(ifp, br, next);
}
break;
}
+ drbr_advance(ifp, br);
ifp->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
ifp->if_omcasts++;
ETHER_BPF_MTAP(ifp, next);
- next = drbr_dequeue(ifp, br);
}
return status;
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index 2ec73e4..7fcbd38 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -622,6 +622,45 @@ drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
}
static __inline void
+drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new)
+{
+ /*
+ * The top of the list needs to be swapped
+ * for this one.
+ */
+#ifdef ALTQ
+ if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ /*
+		 * In the ALTQ case the peek actually
+		 * dequeued the mbuf, so put it back.
+ */
+ IFQ_DRV_PREPEND(&ifp->if_snd, new);
+ return;
+ }
+#endif
+ buf_ring_putback_sc(br, new);
+}
+
+static __inline struct mbuf *
+drbr_peek(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+ struct mbuf *m;
+ if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ /*
+ * Pull it off like a dequeue
+ * since drbr_advance() does nothing
+ * for altq and drbr_putback() will
+ * use the old prepend function.
+ */
+ IFQ_DEQUEUE(&ifp->if_snd, m);
+ return (m);
+ }
+#endif
+ return(buf_ring_peek(br));
+}
+
+static __inline void
drbr_flush(struct ifnet *ifp, struct buf_ring *br)
{
struct mbuf *m;
@@ -648,7 +687,7 @@ drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
#ifdef ALTQ
struct mbuf *m;
- if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_DEQUEUE(&ifp->if_snd, m);
return (m);
}
@@ -656,6 +695,18 @@ drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
return (buf_ring_dequeue_sc(br));
}
+static __inline void
+drbr_advance(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+ /* Nothing to do here since peek dequeues in altq case */
+ if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd))
+ return;
+#endif
+ return (buf_ring_advance_sc(br));
+}
+
+
static __inline struct mbuf *
drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
int (*func) (struct mbuf *, void *), void *arg)
diff --git a/sys/ofed/drivers/net/mlx4/en_tx.c b/sys/ofed/drivers/net/mlx4/en_tx.c
index e2e13c2..fee7afe 100644
--- a/sys/ofed/drivers/net/mlx4/en_tx.c
+++ b/sys/ofed/drivers/net/mlx4/en_tx.c
@@ -931,22 +931,21 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
}
enqueued = 0;
- if (m == NULL) {
- next = drbr_dequeue(dev, ring->br);
- } else if (drbr_needs_enqueue(dev, ring->br)) {
+ if (m != NULL) {
if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
return (err);
- next = drbr_dequeue(dev, ring->br);
- } else
- next = m;
-
+ }
/* Process the queue */
- while (next != NULL) {
+ while ((next = drbr_peek(ifp, ring->br)) != NULL) {
if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
- if (next != NULL)
- err = drbr_enqueue(dev, ring->br, next);
+ if (next == NULL) {
+ drbr_advance(ifp, ring->br);
+ } else {
+ drbr_putback(ifp, ring->br, next);
+ }
break;
}
+ drbr_advance(ifp, ring->br);
enqueued++;
dev->if_obytes += next->m_pkthdr.len;
if (next->m_flags & M_MCAST)
@@ -955,7 +954,6 @@ mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
ETHER_BPF_MTAP(dev, next);
if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
- next = drbr_dequeue(dev, ring->br);
}
if (enqueued > 0)
diff --git a/sys/sys/buf_ring.h b/sys/sys/buf_ring.h
index b660d9b..b1ccaec 100644
--- a/sys/sys/buf_ring.h
+++ b/sys/sys/buf_ring.h
@@ -208,6 +208,54 @@ buf_ring_dequeue_sc(struct buf_ring *br)
}
/*
+ * single-consumer advance after a peek
+ * use where it is protected by a lock
+ * e.g. a network driver's tx queue lock
+ */
+static __inline void
+buf_ring_advance_sc(struct buf_ring *br)
+{
+ uint32_t cons_head, cons_next;
+ uint32_t prod_tail;
+
+ cons_head = br->br_cons_head;
+ prod_tail = br->br_prod_tail;
+
+ cons_next = (cons_head + 1) & br->br_cons_mask;
+ if (cons_head == prod_tail)
+ return;
+ br->br_cons_head = cons_next;
+#ifdef DEBUG_BUFRING
+ br->br_ring[cons_head] = NULL;
+#endif
+ br->br_cons_tail = cons_next;
+}
+
+/*
+ * Used to return a buffer (most likely already there)
+ * to the top of the ring.  The caller should *not*
+ * have used any dequeue to pull it out of the ring
+ * but instead should have used the peek() function.
+ * This is normally used where the transmit queue
+ * of a driver is full, and an mbuf must be returned.
+ * Most likely what is in the ring buffer is what
+ * is being put back (since it was not removed), but
+ * sometimes the lower transmit function may have
+ * done a pullup or other call that will have
+ * changed it.  As an optimization we always put it
+ * back (since jhb says the store is probably cheaper);
+ * if we have to do a multi-queue version we will need
+ * the compare and an atomic.
+ */
+static __inline void
+buf_ring_putback_sc(struct buf_ring *br, void *new)
+{
+ KASSERT(br->br_cons_head != br->br_prod_tail,
+ ("Buf-Ring has none in putback")) ;
+ br->br_ring[br->br_cons_head] = new;
+}
+
+/*
* return a pointer to the first entry in the ring
* without modifying it, or NULL if the ring is empty
* race-prone if not protected by a lock