summaryrefslogtreecommitdiffstats
path: root/sys/dev/e1000
diff options
context:
space:
mode:
authorluigi <luigi@FreeBSD.org>2014-08-20 23:34:36 +0000
committerluigi <luigi@FreeBSD.org>2014-08-20 23:34:36 +0000
commit223d76dc5012ea77078296847800a3d6181c61e2 (patch)
treed5d5263ca0c34de806d5e9e07b0b85eab96545f9 /sys/dev/e1000
parentb63e85f63f1ee972ee2221c84e26cc35597b38f7 (diff)
downloadFreeBSD-src-223d76dc5012ea77078296847800a3d6181c61e2.zip
FreeBSD-src-223d76dc5012ea77078296847800a3d6181c61e2.tar.gz
MFC 270063: update of netmap code
(vtnet and cxgbe not merged yet because we need some other mfc first)
Diffstat (limited to 'sys/dev/e1000')
-rw-r--r--sys/dev/e1000/if_em.c8
-rw-r--r--sys/dev/e1000/if_igb.c6
-rw-r--r--sys/dev/e1000/if_lem.c246
3 files changed, 248 insertions, 12 deletions
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index cc8b34e..20321d0 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -3389,10 +3389,10 @@ em_setup_transmit_ring(struct tx_ring *txr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
+ addr = PNMB(na, slot + si, &paddr);
txr->tx_base[i].buffer_addr = htole64(paddr);
/* reload the map for netmap mode */
- netmap_load_map(txr->txtag, txbuf->map, addr);
+ netmap_load_map(na, txr->txtag, txbuf->map, addr);
}
#endif /* DEV_NETMAP */
@@ -4131,8 +4131,8 @@ em_setup_receive_ring(struct rx_ring *rxr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
- netmap_load_map(rxr->rxtag, rxbuf->map, addr);
+ addr = PNMB(na, slot + si, &paddr);
+ netmap_load_map(na, rxr->rxtag, rxbuf->map, addr);
/* Update descriptor */
rxr->rx_base[j].buffer_addr = htole64(paddr);
continue;
diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 15d71ce..484cba1 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -3531,7 +3531,7 @@ igb_setup_transmit_ring(struct tx_ring *txr)
if (slot) {
int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
/* no need to set the address */
- netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
}
#endif /* DEV_NETMAP */
/* clear the watch index */
@@ -4335,8 +4335,8 @@ igb_setup_receive_ring(struct rx_ring *rxr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + sj, &paddr);
- netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+ addr = PNMB(na, slot + sj, &paddr);
+ netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
/* Update descriptor */
rxr->rx_base[j].read.pkt_addr = htole64(paddr);
continue;
diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c
index bc25e18..04a984b 100644
--- a/sys/dev/e1000/if_lem.c
+++ b/sys/dev/e1000/if_lem.c
@@ -32,6 +32,15 @@
******************************************************************************/
/*$FreeBSD$*/
+/*
+ * Uncomment the following extensions for better performance in a VM,
+ * especially if you have support in the hypervisor.
+ * See http://info.iet.unipi.it/~luigi/netmap/
+ */
+// #define BATCH_DISPATCH
+// #define NIC_SEND_COMBINING
+// #define NIC_PARAVIRT /* enable virtio-like synchronization */
+
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -289,6 +298,10 @@ static int lem_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV);
static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
+/*
+ * increase lem_rxd and lem_txd to at least 2048 in netmap mode
+ * for better performance.
+ */
static int lem_rxd = EM_DEFAULT_RXD;
static int lem_txd = EM_DEFAULT_TXD;
static int lem_smart_pwr_down = FALSE;
@@ -458,6 +471,20 @@ lem_attach(device_t dev)
"max number of rx packets to process", &adapter->rx_process_limit,
lem_rx_process_limit);
+#ifdef NIC_SEND_COMBINING
+ /* Sysctls to control mitigation */
+ lem_add_rx_process_limit(adapter, "sc_enable",
+ "driver TDT mitigation", &adapter->sc_enable, 0);
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+ lem_add_rx_process_limit(adapter, "batch_enable",
+ "driver rx batch", &adapter->batch_enable, 0);
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+ lem_add_rx_process_limit(adapter, "rx_retries",
+ "driver rx retries", &adapter->rx_retries, 0);
+#endif /* NIC_PARAVIRT */
+
/* Sysctl for setting the interface flow control */
lem_set_flow_cntrl(adapter, "flow_control",
"flow control setting",
@@ -515,6 +542,49 @@ lem_attach(device_t dev)
*/
adapter->hw.mac.report_tx_early = 1;
+#ifdef NIC_PARAVIRT
+ device_printf(dev, "driver supports paravirt, subdev 0x%x\n",
+ adapter->hw.subsystem_device_id);
+ if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) {
+ uint64_t bus_addr;
+
+ device_printf(dev, "paravirt support on dev %p\n", adapter);
+ tsize = 4096; // XXX one page for the csb
+ if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) {
+ device_printf(dev, "Unable to allocate csb memory\n");
+ error = ENOMEM;
+ goto err_csb;
+ }
+ /* Setup the Base of the CSB */
+ adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr;
+ /* force the first kick */
+ adapter->csb->host_need_txkick = 1; /* txring empty */
+ adapter->csb->guest_need_rxkick = 1; /* no rx packets */
+ bus_addr = adapter->csb_mem.dma_paddr;
+ lem_add_rx_process_limit(adapter, "csb_on",
+ "enable paravirt.", &adapter->csb->guest_csb_on, 0);
+ lem_add_rx_process_limit(adapter, "txc_lim",
+ "txc_lim", &adapter->csb->host_txcycles_lim, 1);
+
+ /* some stats */
+#define PA_SC(name, var, val) \
+ lem_add_rx_process_limit(adapter, name, name, var, val)
+ PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1);
+ PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0);
+ PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0);
+ PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1);
+ PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0);
+ PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0);
+ PA_SC("tdt_int_count",&adapter->tdt_int_count, 0);
+ PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0);
+ /* tell the host where the block is */
+ E1000_WRITE_REG(&adapter->hw, E1000_CSBAH,
+ (u32)(bus_addr >> 32));
+ E1000_WRITE_REG(&adapter->hw, E1000_CSBAL,
+ (u32)bus_addr);
+ }
+#endif /* NIC_PARAVIRT */
+
tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
EM_DBA_ALIGN);
@@ -673,6 +743,11 @@ err_hw_init:
err_rx_desc:
lem_dma_free(adapter, &adapter->txdma);
err_tx_desc:
+#ifdef NIC_PARAVIRT
+ lem_dma_free(adapter, &adapter->csb_mem);
+err_csb:
+#endif /* NIC_PARAVIRT */
+
err_pci:
if (adapter->ifp != NULL)
if_free(adapter->ifp);
@@ -760,6 +835,12 @@ lem_detach(device_t dev)
adapter->rx_desc_base = NULL;
}
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) {
+ lem_dma_free(adapter, &adapter->csb_mem);
+ adapter->csb = NULL;
+ }
+#endif /* NIC_PARAVIRT */
lem_release_hw_control(adapter);
free(adapter->mta, M_DEVBUF);
EM_TX_LOCK_DESTROY(adapter);
@@ -869,6 +950,16 @@ lem_start_locked(struct ifnet *ifp)
}
if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+ if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb &&
+ adapter->csb->guest_csb_on &&
+ !(adapter->csb->guest_need_txkick & 1)) {
+ adapter->csb->guest_need_txkick = 1;
+ adapter->guest_need_kick_count++;
+ // XXX memory barrier
+ lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE
+ }
+#endif /* NIC_PARAVIRT */
return;
}
@@ -1715,6 +1806,37 @@ lem_xmit(struct adapter *adapter, struct mbuf **m_headp)
*/
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) {
+ adapter->csb->guest_tdt = i;
+ /* XXX memory barrier ? */
+ if (adapter->csb->guest_csb_on &&
+ !(adapter->csb->host_need_txkick & 1)) {
+ /* XXX maybe useless
+ * clean the ring. maybe do it before ?
+ * maybe a little bit of histeresys ?
+ */
+ if (adapter->num_tx_desc_avail <= 64) {// XXX
+ lem_txeof(adapter);
+ }
+ return (0);
+ }
+ }
+#endif /* NIC_PARAVIRT */
+
+#ifdef NIC_SEND_COMBINING
+ if (adapter->sc_enable) {
+ if (adapter->shadow_tdt & MIT_PENDING_INT) {
+ /* signal intr and data pending */
+ adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+ return (0);
+ } else {
+ adapter->shadow_tdt = MIT_PENDING_INT;
+ }
+ }
+#endif /* NIC_SEND_COMBINING */
+
if (adapter->hw.mac.type == e1000_82547 &&
adapter->link_duplex == HALF_DUPLEX)
lem_82547_move_tail(adapter);
@@ -1995,6 +2117,20 @@ lem_local_timer(void *arg)
lem_smartspeed(adapter);
+#ifdef NIC_PARAVIRT
+ /* recover space if needed */
+ if (adapter->csb && adapter->csb->guest_csb_on &&
+ (adapter->watchdog_check == TRUE) &&
+ (ticks - adapter->watchdog_time > EM_WATCHDOG) &&
+ (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) {
+ lem_txeof(adapter);
+ /*
+ * lem_txeof() normally (except when space in the queue
+ * runs low XXX) cleans watchdog_check so that
+ * we do not hung.
+ */
+ }
+#endif /* NIC_PARAVIRT */
/*
* We check the watchdog: the time since
* the last TX descriptor was cleaned.
@@ -2677,10 +2813,10 @@ lem_setup_transmit_structures(struct adapter *adapter)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
+ addr = PNMB(na, slot + si, &paddr);
adapter->tx_desc_base[i].buffer_addr = htole64(paddr);
/* reload the map for netmap mode */
- netmap_load_map(adapter->txtag, tx_buffer->map, addr);
+ netmap_load_map(na, adapter->txtag, tx_buffer->map, addr);
}
#endif /* DEV_NETMAP */
tx_buffer->next_eop = -1;
@@ -3055,6 +3191,16 @@ lem_txeof(struct adapter *adapter)
adapter->next_tx_to_clean = first;
adapter->num_tx_desc_avail = num_avail;
+#ifdef NIC_SEND_COMBINING
+ if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+ /* a tdt write is pending, do it */
+ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+ 0xffff & adapter->shadow_tdt);
+ adapter->shadow_tdt = MIT_PENDING_INT;
+ } else {
+ adapter->shadow_tdt = 0; // disable
+ }
+#endif /* NIC_SEND_COMBINING */
/*
* If we have enough room, clear IFF_DRV_OACTIVE to
* tell the stack that it is OK to send packets.
@@ -3062,6 +3208,12 @@ lem_txeof(struct adapter *adapter)
*/
if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) {
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) { // XXX also csb_on ?
+ adapter->csb->guest_need_txkick = 2; /* acked */
+ // XXX memory barrier
+ }
+#endif /* NIC_PARAVIRT */
if (adapter->num_tx_desc_avail == adapter->num_tx_desc) {
adapter->watchdog_check = FALSE;
return;
@@ -3247,8 +3399,8 @@ lem_setup_receive_structures(struct adapter *adapter)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
- netmap_load_map(adapter->rxtag, rx_buffer->map, addr);
+ addr = PNMB(na, slot + si, &paddr);
+ netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr);
/* Update descriptor */
adapter->rx_desc_base[i].buffer_addr = htole64(paddr);
continue;
@@ -3445,7 +3597,23 @@ lem_rxeof(struct adapter *adapter, int count, int *done)
int i, rx_sent = 0;
struct e1000_rx_desc *current_desc;
+#ifdef BATCH_DISPATCH
+ struct mbuf *mh = NULL, *mt = NULL;
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+ int retries = 0;
+ struct paravirt_csb* csb = adapter->csb;
+ int csb_mode = csb && csb->guest_csb_on;
+
+ //ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ if (csb_mode && csb->guest_need_rxkick)
+ csb->guest_need_rxkick = 0;
+#endif /* NIC_PARAVIRT */
EM_RX_LOCK(adapter);
+
+#ifdef BATCH_DISPATCH
+ batch_again:
+#endif /* BATCH_DISPATCH */
i = adapter->next_rx_desc_to_check;
current_desc = &adapter->rx_desc_base[i];
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
@@ -3458,19 +3626,45 @@ lem_rxeof(struct adapter *adapter, int count, int *done)
}
#endif /* DEV_NETMAP */
+#if 1 // XXX optimization ?
if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
if (done != NULL)
*done = rx_sent;
EM_RX_UNLOCK(adapter);
return (FALSE);
}
+#endif /* 0 */
while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) {
struct mbuf *m = NULL;
status = current_desc->status;
- if ((status & E1000_RXD_STAT_DD) == 0)
+ if ((status & E1000_RXD_STAT_DD) == 0) {
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ /* buffer not ready yet. Retry a few times before giving up */
+ if (++retries <= adapter->rx_retries) {
+ continue;
+ }
+ if (csb->guest_need_rxkick == 0) {
+ // ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ csb->guest_need_rxkick = 1;
+ // XXX memory barrier, status volatile ?
+ continue; /* double check */
+ }
+ }
+ /* no buffer ready, give up */
+#endif /* NIC_PARAVIRT */
break;
+ }
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if (csb->guest_need_rxkick)
+ // ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ csb->guest_need_rxkick = 0;
+ retries = 0;
+ }
+#endif /* NIC_PARAVIRT */
mp = adapter->rx_buffer_area[i].m_head;
/*
@@ -3595,11 +3789,36 @@ discard:
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ /* the buffer at i has been already replaced by lem_get_buf()
+ * so it is safe to set guest_rdt = i and possibly send a kick.
+ * XXX see if we can optimize it later.
+ */
+ csb->guest_rdt = i;
+ // XXX memory barrier
+ if (i == csb->host_rxkick_at)
+ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
+ }
+#endif /* NIC_PARAVIRT */
/* Advance our pointers to the next descriptor. */
if (++i == adapter->num_rx_desc)
i = 0;
/* Call into the stack */
if (m != NULL) {
+#ifdef BATCH_DISPATCH
+ if (adapter->batch_enable) {
+ if (mh == NULL)
+ mh = mt = m;
+ else
+ mt->m_nextpkt = m;
+ mt = m;
+ m->m_nextpkt = NULL;
+ rx_sent++;
+ current_desc = &adapter->rx_desc_base[i];
+ continue;
+ }
+#endif /* BATCH_DISPATCH */
adapter->next_rx_desc_to_check = i;
EM_RX_UNLOCK(adapter);
(*ifp->if_input)(ifp, m);
@@ -3610,10 +3829,27 @@ discard:
current_desc = &adapter->rx_desc_base[i];
}
adapter->next_rx_desc_to_check = i;
+#ifdef BATCH_DISPATCH
+ if (mh) {
+ EM_RX_UNLOCK(adapter);
+ while ( (mt = mh) != NULL) {
+ mh = mh->m_nextpkt;
+ mt->m_nextpkt = NULL;
+ if_input(ifp, mt);
+ }
+ EM_RX_LOCK(adapter);
+ i = adapter->next_rx_desc_to_check; /* in case of interrupts */
+ if (count > 0)
+ goto batch_again;
+ }
+#endif /* BATCH_DISPATCH */
/* Advance the E1000's Receive Queue #0 "Tail Pointer". */
if (--i < 0)
i = adapter->num_rx_desc - 1;
+#ifdef NIC_PARAVIRT
+ if (!csb_mode) /* filter out writes */
+#endif /* NIC_PARAVIRT */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
if (done != NULL)
*done = rx_sent;
OpenPOWER on IntegriCloud