author    luigi <luigi@FreeBSD.org>    2014-08-20 23:34:36 +0000
committer luigi <luigi@FreeBSD.org>    2014-08-20 23:34:36 +0000
commit    223d76dc5012ea77078296847800a3d6181c61e2 (patch)
tree      d5d5263ca0c34de806d5e9e07b0b85eab96545f9 /sys/dev
parent    b63e85f63f1ee972ee2221c84e26cc35597b38f7 (diff)
MFC 270063: update of netmap code
(vtnet and cxgbe not merged yet because we need some other MFC first)
Diffstat (limited to 'sys/dev')
-rw-r--r--  sys/dev/e1000/if_em.c               |   8
-rw-r--r--  sys/dev/e1000/if_igb.c              |   6
-rw-r--r--  sys/dev/e1000/if_lem.c              | 246
-rw-r--r--  sys/dev/ixgbe/ixgbe.c               |   6
-rw-r--r--  sys/dev/netmap/if_em_netmap.h       |  26
-rw-r--r--  sys/dev/netmap/if_igb_netmap.h      |  26
-rw-r--r--  sys/dev/netmap/if_lem_netmap.h      | 208
-rw-r--r--  sys/dev/netmap/if_re_netmap.h       |  40
-rw-r--r--  sys/dev/netmap/if_vtnet_netmap.h    | 434
-rw-r--r--  sys/dev/netmap/ixgbe_netmap.h       |  26
-rw-r--r--  sys/dev/netmap/netmap.c             | 683
-rw-r--r--  sys/dev/netmap/netmap_freebsd.c     | 149
-rw-r--r--  sys/dev/netmap/netmap_generic.c     | 122
-rw-r--r--  sys/dev/netmap/netmap_kern.h        | 416
-rw-r--r--  sys/dev/netmap/netmap_mbq.h         |   1
-rw-r--r--  sys/dev/netmap/netmap_mem2.c        | 251
-rw-r--r--  sys/dev/netmap/netmap_mem2.h        |  98
-rw-r--r--  sys/dev/netmap/netmap_monitor.c     | 498
-rw-r--r--  sys/dev/netmap/netmap_offloadings.c |   6
-rw-r--r--  sys/dev/netmap/netmap_pipe.c        |  53
-rw-r--r--  sys/dev/netmap/netmap_vale.c        | 814
21 files changed, 3320 insertions, 797 deletions
diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c
index cc8b34e..20321d0 100644
--- a/sys/dev/e1000/if_em.c
+++ b/sys/dev/e1000/if_em.c
@@ -3389,10 +3389,10 @@ em_setup_transmit_ring(struct tx_ring *txr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
+ addr = PNMB(na, slot + si, &paddr);
txr->tx_base[i].buffer_addr = htole64(paddr);
/* reload the map for netmap mode */
- netmap_load_map(txr->txtag, txbuf->map, addr);
+ netmap_load_map(na, txr->txtag, txbuf->map, addr);
}
#endif /* DEV_NETMAP */
@@ -4131,8 +4131,8 @@ em_setup_receive_ring(struct rx_ring *rxr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
- netmap_load_map(rxr->rxtag, rxbuf->map, addr);
+ addr = PNMB(na, slot + si, &paddr);
+ netmap_load_map(na, rxr->rxtag, rxbuf->map, addr);
/* Update descriptor */
rxr->rx_base[j].buffer_addr = htole64(paddr);
continue;
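
A note for reviewers: the recurring change throughout this commit is that PNMB(), NMB() and netmap_load_map() now take the struct netmap_adapter explicitly, because slot-to-buffer translation is no longer done against a single global buffer pool but against the adapter's own memory region. A minimal stand-alone sketch of what an adapter-scoped lookup does (the types and field names here are illustrative stand-ins, not the real netmap_kern.h definitions):

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative stand-ins for the adapter's buffer-pool fields. */
    struct toy_adapter {
            char     *buf_base;   /* virtual base of this adapter's pool */
            uint64_t  buf_paddr;  /* physical base, used for DMA */
            uint32_t  buf_size;   /* size of one netmap buffer */
    };

    struct toy_slot {
            uint32_t  buf_idx;    /* buffer index, as in struct netmap_slot */
    };

    /* Adapter-scoped analogue of PNMB(na, slot, &paddr). */
    static void *
    toy_pnmb(struct toy_adapter *na, struct toy_slot *slot, uint64_t *paddr)
    {
            size_t off = (size_t)slot->buf_idx * na->buf_size;

            *paddr = na->buf_paddr + off;
            return (na->buf_base + off);
    }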
diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c
index 15d71ce..484cba1 100644
--- a/sys/dev/e1000/if_igb.c
+++ b/sys/dev/e1000/if_igb.c
@@ -3531,7 +3531,7 @@ igb_setup_transmit_ring(struct tx_ring *txr)
if (slot) {
int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
/* no need to set the address */
- netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
}
#endif /* DEV_NETMAP */
/* clear the watch index */
@@ -4335,8 +4335,8 @@ igb_setup_receive_ring(struct rx_ring *rxr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + sj, &paddr);
- netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+ addr = PNMB(na, slot + sj, &paddr);
+ netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
/* Update descriptor */
rxr->rx_base[j].read.pkt_addr = htole64(paddr);
continue;
diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c
index bc25e18..04a984b 100644
--- a/sys/dev/e1000/if_lem.c
+++ b/sys/dev/e1000/if_lem.c
@@ -32,6 +32,15 @@
******************************************************************************/
/*$FreeBSD$*/
+/*
+ * Uncomment the following extensions for better performance in a VM,
+ * especially if you have support in the hypervisor.
+ * See http://info.iet.unipi.it/~luigi/netmap/
+ */
+// #define BATCH_DISPATCH
+// #define NIC_SEND_COMBINING
+// #define NIC_PARAVIRT /* enable virtio-like synchronization */
+
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -289,6 +298,10 @@ static int lem_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV);
static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
+/*
+ * increase lem_rxd and lem_txd to at least 2048 in netmap mode
+ * for better performance.
+ */
static int lem_rxd = EM_DEFAULT_RXD;
static int lem_txd = EM_DEFAULT_TXD;
static int lem_smart_pwr_down = FALSE;
@@ -458,6 +471,20 @@ lem_attach(device_t dev)
"max number of rx packets to process", &adapter->rx_process_limit,
lem_rx_process_limit);
+#ifdef NIC_SEND_COMBINING
+ /* Sysctls to control mitigation */
+ lem_add_rx_process_limit(adapter, "sc_enable",
+ "driver TDT mitigation", &adapter->sc_enable, 0);
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+ lem_add_rx_process_limit(adapter, "batch_enable",
+ "driver rx batch", &adapter->batch_enable, 0);
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+ lem_add_rx_process_limit(adapter, "rx_retries",
+ "driver rx retries", &adapter->rx_retries, 0);
+#endif /* NIC_PARAVIRT */
+
/* Sysctl for setting the interface flow control */
lem_set_flow_cntrl(adapter, "flow_control",
"flow control setting",
@@ -515,6 +542,49 @@ lem_attach(device_t dev)
*/
adapter->hw.mac.report_tx_early = 1;
+#ifdef NIC_PARAVIRT
+ device_printf(dev, "driver supports paravirt, subdev 0x%x\n",
+ adapter->hw.subsystem_device_id);
+ if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) {
+ uint64_t bus_addr;
+
+ device_printf(dev, "paravirt support on dev %p\n", adapter);
+ tsize = 4096; // XXX one page for the csb
+ if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) {
+ device_printf(dev, "Unable to allocate csb memory\n");
+ error = ENOMEM;
+ goto err_csb;
+ }
+ /* Setup the Base of the CSB */
+ adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr;
+ /* force the first kick */
+ adapter->csb->host_need_txkick = 1; /* txring empty */
+ adapter->csb->guest_need_rxkick = 1; /* no rx packets */
+ bus_addr = adapter->csb_mem.dma_paddr;
+ lem_add_rx_process_limit(adapter, "csb_on",
+ "enable paravirt.", &adapter->csb->guest_csb_on, 0);
+ lem_add_rx_process_limit(adapter, "txc_lim",
+ "txc_lim", &adapter->csb->host_txcycles_lim, 1);
+
+ /* some stats */
+#define PA_SC(name, var, val) \
+ lem_add_rx_process_limit(adapter, name, name, var, val)
+ PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1);
+ PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0);
+ PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0);
+ PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1);
+ PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0);
+ PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0);
+ PA_SC("tdt_int_count",&adapter->tdt_int_count, 0);
+ PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0);
+ /* tell the host where the block is */
+ E1000_WRITE_REG(&adapter->hw, E1000_CSBAH,
+ (u32)(bus_addr >> 32));
+ E1000_WRITE_REG(&adapter->hw, E1000_CSBAL,
+ (u32)bus_addr);
+ }
+#endif /* NIC_PARAVIRT */
+
tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
EM_DBA_ALIGN);
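
The NIC_PARAVIRT code above shares one page (the CSB, "communication status block") between the guest driver and the hypervisor backend; each side publishes "need kick" flags so the other can skip expensive register writes and interrupts while its peer is actively polling. A hedged sketch of the TX-side decision, using only fields this diff touches (the real struct paravirt_csb is defined in a header not shown here, so the layout below is a partial assumption):

    #include <stdint.h>

    /* Partial, assumed layout: only the fields used in this diff. */
    struct toy_csb {
            uint32_t guest_csb_on;       /* CSB mode enabled by the guest */
            uint32_t guest_tdt;          /* guest's shadow of the TX tail */
            uint32_t host_need_txkick;   /* host asks for a real TDT write */
    };

    /* Publish a new TX tail; ring the doorbell only if the host sleeps. */
    static void
    toy_publish_tdt(struct toy_csb *csb, uint32_t tdt,
        void (*write_tdt_reg)(uint32_t))
    {
            csb->guest_tdt = tdt;        /* always visible to the host */
            /* a memory barrier belongs here, as the XXX notes in this diff say */
            if (!csb->guest_csb_on || (csb->host_need_txkick & 1))
                    write_tdt_reg(tdt);  /* host is not polling: kick it */
    }

This mirrors the logic lem_xmit() gains further down: when the host advertises that it is polling (bit 0 of host_need_txkick clear), the guest skips the E1000_WRITE_REG() entirely.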
@@ -673,6 +743,11 @@ err_hw_init:
err_rx_desc:
lem_dma_free(adapter, &adapter->txdma);
err_tx_desc:
+#ifdef NIC_PARAVIRT
+ lem_dma_free(adapter, &adapter->csb_mem);
+err_csb:
+#endif /* NIC_PARAVIRT */
+
err_pci:
if (adapter->ifp != NULL)
if_free(adapter->ifp);
@@ -760,6 +835,12 @@ lem_detach(device_t dev)
adapter->rx_desc_base = NULL;
}
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) {
+ lem_dma_free(adapter, &adapter->csb_mem);
+ adapter->csb = NULL;
+ }
+#endif /* NIC_PARAVIRT */
lem_release_hw_control(adapter);
free(adapter->mta, M_DEVBUF);
EM_TX_LOCK_DESTROY(adapter);
@@ -869,6 +950,16 @@ lem_start_locked(struct ifnet *ifp)
}
if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+ if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb &&
+ adapter->csb->guest_csb_on &&
+ !(adapter->csb->guest_need_txkick & 1)) {
+ adapter->csb->guest_need_txkick = 1;
+ adapter->guest_need_kick_count++;
+ // XXX memory barrier
+ lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE
+ }
+#endif /* NIC_PARAVIRT */
return;
}
@@ -1715,6 +1806,37 @@ lem_xmit(struct adapter *adapter, struct mbuf **m_headp)
*/
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) {
+ adapter->csb->guest_tdt = i;
+ /* XXX memory barrier ? */
+ if (adapter->csb->guest_csb_on &&
+ !(adapter->csb->host_need_txkick & 1)) {
+ /* XXX maybe useless
+ * clean the ring. maybe do it before ?
+ * maybe a little bit of hysteresis ?
+ */
+ if (adapter->num_tx_desc_avail <= 64) {// XXX
+ lem_txeof(adapter);
+ }
+ return (0);
+ }
+ }
+#endif /* NIC_PARAVIRT */
+
+#ifdef NIC_SEND_COMBINING
+ if (adapter->sc_enable) {
+ if (adapter->shadow_tdt & MIT_PENDING_INT) {
+ /* signal intr and data pending */
+ adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+ return (0);
+ } else {
+ adapter->shadow_tdt = MIT_PENDING_INT;
+ }
+ }
+#endif /* NIC_SEND_COMBINING */
+
if (adapter->hw.mac.type == e1000_82547 &&
adapter->link_duplex == HALF_DUPLEX)
lem_82547_move_tail(adapter);
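
NIC_SEND_COMBINING above is a doorbell-coalescing scheme: while an interrupt round-trip is pending, lem_xmit() only records the would-be tail in shadow_tdt, and lem_txeof() (further down in this file) flushes the deferred value when the interrupt fires. A self-contained sketch of that two-flag state machine; the MIT_* encodings live in if_lem.h and the values below are assumptions:

    #include <stdint.h>

    #define MIT_PENDING_INT 0x10000  /* assumed: an intr round-trip pending */
    #define MIT_PENDING_TDT 0x30000  /* assumed: ... plus a deferred TDT */

    static uint32_t shadow_tdt;  /* flags in the high bits, TDT in the low 16 */

    /* Transmit side: defer the doorbell if a round-trip is in flight. */
    static int
    sc_defer_doorbell(uint16_t tdt)
    {
            if (shadow_tdt & MIT_PENDING_INT) {
                    shadow_tdt = MIT_PENDING_TDT | tdt;  /* latest tail */
                    return (1);              /* skip the register write */
            }
            shadow_tdt = MIT_PENDING_INT;    /* first write goes through */
            return (0);
    }

    /* Completion side: flush the deferred tail, keeping the cycle alive. */
    static void
    sc_txeof_flush(void (*write_tdt_reg)(uint16_t))
    {
            if ((shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
                    write_tdt_reg(shadow_tdt & 0xffff);
                    shadow_tdt = MIT_PENDING_INT;
            } else
                    shadow_tdt = 0;          /* idle again */
    }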
@@ -1995,6 +2117,20 @@ lem_local_timer(void *arg)
lem_smartspeed(adapter);
+#ifdef NIC_PARAVIRT
+ /* recover space if needed */
+ if (adapter->csb && adapter->csb->guest_csb_on &&
+ (adapter->watchdog_check == TRUE) &&
+ (ticks - adapter->watchdog_time > EM_WATCHDOG) &&
+ (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) {
+ lem_txeof(adapter);
+ /*
+ * lem_txeof() normally (except when space in the queue
+ * runs low XXX) cleans watchdog_check so that
+ * we do not hang.
+ */
+ }
+#endif /* NIC_PARAVIRT */
/*
* We check the watchdog: the time since
* the last TX descriptor was cleaned.
@@ -2677,10 +2813,10 @@ lem_setup_transmit_structures(struct adapter *adapter)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
+ addr = PNMB(na, slot + si, &paddr);
adapter->tx_desc_base[i].buffer_addr = htole64(paddr);
/* reload the map for netmap mode */
- netmap_load_map(adapter->txtag, tx_buffer->map, addr);
+ netmap_load_map(na, adapter->txtag, tx_buffer->map, addr);
}
#endif /* DEV_NETMAP */
tx_buffer->next_eop = -1;
@@ -3055,6 +3191,16 @@ lem_txeof(struct adapter *adapter)
adapter->next_tx_to_clean = first;
adapter->num_tx_desc_avail = num_avail;
+#ifdef NIC_SEND_COMBINING
+ if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+ /* a tdt write is pending, do it */
+ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+ 0xffff & adapter->shadow_tdt);
+ adapter->shadow_tdt = MIT_PENDING_INT;
+ } else {
+ adapter->shadow_tdt = 0; // disable
+ }
+#endif /* NIC_SEND_COMBINING */
/*
* If we have enough room, clear IFF_DRV_OACTIVE to
* tell the stack that it is OK to send packets.
@@ -3062,6 +3208,12 @@ lem_txeof(struct adapter *adapter)
*/
if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) {
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) { // XXX also csb_on ?
+ adapter->csb->guest_need_txkick = 2; /* acked */
+ // XXX memory barrier
+ }
+#endif /* NIC_PARAVIRT */
if (adapter->num_tx_desc_avail == adapter->num_tx_desc) {
adapter->watchdog_check = FALSE;
return;
@@ -3247,8 +3399,8 @@ lem_setup_receive_structures(struct adapter *adapter)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
- netmap_load_map(adapter->rxtag, rx_buffer->map, addr);
+ addr = PNMB(na, slot + si, &paddr);
+ netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr);
/* Update descriptor */
adapter->rx_desc_base[i].buffer_addr = htole64(paddr);
continue;
@@ -3445,7 +3597,23 @@ lem_rxeof(struct adapter *adapter, int count, int *done)
int i, rx_sent = 0;
struct e1000_rx_desc *current_desc;
+#ifdef BATCH_DISPATCH
+ struct mbuf *mh = NULL, *mt = NULL;
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+ int retries = 0;
+ struct paravirt_csb* csb = adapter->csb;
+ int csb_mode = csb && csb->guest_csb_on;
+
+ //ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ if (csb_mode && csb->guest_need_rxkick)
+ csb->guest_need_rxkick = 0;
+#endif /* NIC_PARAVIRT */
EM_RX_LOCK(adapter);
+
+#ifdef BATCH_DISPATCH
+ batch_again:
+#endif /* BATCH_DISPATCH */
i = adapter->next_rx_desc_to_check;
current_desc = &adapter->rx_desc_base[i];
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
@@ -3458,19 +3626,45 @@ lem_rxeof(struct adapter *adapter, int count, int *done)
}
#endif /* DEV_NETMAP */
+#if 1 // XXX optimization ?
if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
if (done != NULL)
*done = rx_sent;
EM_RX_UNLOCK(adapter);
return (FALSE);
}
+#endif /* 1 */
while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) {
struct mbuf *m = NULL;
status = current_desc->status;
- if ((status & E1000_RXD_STAT_DD) == 0)
+ if ((status & E1000_RXD_STAT_DD) == 0) {
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ /* buffer not ready yet. Retry a few times before giving up */
+ if (++retries <= adapter->rx_retries) {
+ continue;
+ }
+ if (csb->guest_need_rxkick == 0) {
+ // ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ csb->guest_need_rxkick = 1;
+ // XXX memory barrier, status volatile ?
+ continue; /* double check */
+ }
+ }
+ /* no buffer ready, give up */
+#endif /* NIC_PARAVIRT */
break;
+ }
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if (csb->guest_need_rxkick)
+ // ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ csb->guest_need_rxkick = 0;
+ retries = 0;
+ }
+#endif /* NIC_PARAVIRT */
mp = adapter->rx_buffer_area[i].m_head;
/*
@@ -3595,11 +3789,36 @@ discard:
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ /* the buffer at i has already been replaced by lem_get_buf()
+ * so it is safe to set guest_rdt = i and possibly send a kick.
+ * XXX see if we can optimize it later.
+ */
+ csb->guest_rdt = i;
+ // XXX memory barrier
+ if (i == csb->host_rxkick_at)
+ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
+ }
+#endif /* NIC_PARAVIRT */
/* Advance our pointers to the next descriptor. */
if (++i == adapter->num_rx_desc)
i = 0;
/* Call into the stack */
if (m != NULL) {
+#ifdef BATCH_DISPATCH
+ if (adapter->batch_enable) {
+ if (mh == NULL)
+ mh = mt = m;
+ else
+ mt->m_nextpkt = m;
+ mt = m;
+ m->m_nextpkt = NULL;
+ rx_sent++;
+ current_desc = &adapter->rx_desc_base[i];
+ continue;
+ }
+#endif /* BATCH_DISPATCH */
adapter->next_rx_desc_to_check = i;
EM_RX_UNLOCK(adapter);
(*ifp->if_input)(ifp, m);
@@ -3610,10 +3829,27 @@ discard:
current_desc = &adapter->rx_desc_base[i];
}
adapter->next_rx_desc_to_check = i;
+#ifdef BATCH_DISPATCH
+ if (mh) {
+ EM_RX_UNLOCK(adapter);
+ while ( (mt = mh) != NULL) {
+ mh = mh->m_nextpkt;
+ mt->m_nextpkt = NULL;
+ if_input(ifp, mt);
+ }
+ EM_RX_LOCK(adapter);
+ i = adapter->next_rx_desc_to_check; /* in case of interrupts */
+ if (count > 0)
+ goto batch_again;
+ }
+#endif /* BATCH_DISPATCH */
/* Advance the E1000's Receive Queue #0 "Tail Pointer". */
if (--i < 0)
i = adapter->num_rx_desc - 1;
+#ifdef NIC_PARAVIRT
+ if (!csb_mode) /* filter out writes */
+#endif /* NIC_PARAVIRT */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
if (done != NULL)
*done = rx_sent;
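
BATCH_DISPATCH, also enabled above, avoids one unlock/if_input/lock cycle per received packet: mbufs are chained through m_nextpkt under the RX lock, and the whole chain is delivered to the stack in one pass. A minimal sketch of the pattern, with an illustrative stand-in for struct mbuf:

    #include <stddef.h>

    /* Illustrative stand-in for struct mbuf; only m_nextpkt matters here. */
    struct pkt {
            struct pkt *m_nextpkt;
            /* ... payload ... */
    };

    static void
    batch_dispatch(struct pkt *(*rx_next)(void), void (*input)(struct pkt *))
    {
            struct pkt *mh = NULL, *mt = NULL, *m;

            /* Phase 1 (conceptually under the RX lock): chain, don't deliver. */
            while ((m = rx_next()) != NULL) {
                    m->m_nextpkt = NULL;
                    if (mh == NULL)
                            mh = m;
                    else
                            mt->m_nextpkt = m;
                    mt = m;
            }

            /* Phase 2 (lock dropped once): hand the whole chain to the stack. */
            while ((m = mh) != NULL) {
                    mh = m->m_nextpkt;
                    m->m_nextpkt = NULL;
                    input(m);
            }
    }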
diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c
index c27440a..75ab2eb 100644
--- a/sys/dev/ixgbe/ixgbe.c
+++ b/sys/dev/ixgbe/ixgbe.c
@@ -3079,7 +3079,7 @@ ixgbe_setup_transmit_ring(struct tx_ring *txr)
*/
if (slot) {
int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
- netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
}
#endif /* DEV_NETMAP */
/* Clear the EOP descriptor pointer */
@@ -4025,8 +4025,8 @@ ixgbe_setup_receive_ring(struct rx_ring *rxr)
uint64_t paddr;
void *addr;
- addr = PNMB(slot + sj, &paddr);
- netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+ addr = PNMB(na, slot + sj, &paddr);
+ netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
/* Update descriptor and the cached value */
rxr->rx_base[j].read.pkt_addr = htole64(paddr);
rxbuf->addr = htole64(paddr);
diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h
index 17b4c4f..15e9be5 100644
--- a/sys/dev/netmap/if_em_netmap.h
+++ b/sys/dev/netmap/if_em_netmap.h
@@ -113,10 +113,10 @@ em_netmap_reg(struct netmap_adapter *na, int onoff)
* Reconcile kernel and user view of the transmit ring.
*/
static int
-em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -128,7 +128,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -144,7 +144,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
struct e1000_tx_desc *curr = &txr->tx_base[nic_i];
@@ -153,12 +153,12 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i == 0 || nic_i == report_frequency) ?
E1000_TXD_CMD_RS : 0;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
curr->buffer_addr = htole64(paddr);
/* buffer has changed, reload map */
- netmap_reload_map(txr->txtag, txbuf->map, addr);
+ netmap_reload_map(na, txr->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -187,7 +187,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
- nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
@@ -208,10 +208,10 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Reconcile kernel and user view of the receive ring.
*/
static int
-em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -222,7 +222,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
if (head > lim)
return netmap_ring_reinit(kring);
@@ -271,18 +271,18 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
curr->buffer_addr = htole64(paddr);
- netmap_reload_map(rxr->rxtag, rxbuf->map, addr);
+ netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->status = 0;
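
The hunks in this and the following netmap headers all make the same API change: the nm_txsync/nm_rxsync callbacks now receive the struct netmap_kring directly instead of (adapter, ring number), and the adapter and ring id are recovered from the kring's na and ring_id fields. A compile-checkable sketch with stand-in types (the real definitions are in netmap_kern.h):

    /* Minimal stand-in types, mirroring only the fields this diff relies on. */
    struct toy_kring;

    struct toy_na {
            struct toy_kring *tx_rings;  /* array of TX krings */
    };

    struct toy_kring {
            struct toy_na *na;           /* back pointer (new API) */
            unsigned int   ring_id;      /* ring number (new API) */
    };

    /* Old convention: adapter plus ring number. */
    static int
    toy_txsync_old(struct toy_na *na, unsigned int ring_nr, int flags)
    {
            struct toy_kring *kring = &na->tx_rings[ring_nr];

            (void)kring; (void)flags;
            /* ... reconcile the netmap ring with NIC ring ring_nr ... */
            return (0);
    }

    /* New convention: the kring itself; everything else hangs off it. */
    static int
    toy_txsync_new(struct toy_kring *kring, int flags)
    {
            unsigned int ring_nr = kring->ring_id;

            (void)kring->na; (void)ring_nr; (void)flags;
            /* ... same body, with na and ring_nr taken from the kring ... */
            return (0);
    }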
diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h
index e1929f0..c738460 100644
--- a/sys/dev/netmap/if_igb_netmap.h
+++ b/sys/dev/netmap/if_igb_netmap.h
@@ -81,10 +81,10 @@ igb_netmap_reg(struct netmap_adapter *na, int onoff)
* Reconcile kernel and user view of the transmit ring.
*/
static int
-igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -96,7 +96,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
/* 82575 needs the queue index added */
u32 olinfo_status =
(adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
@@ -115,7 +115,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
union e1000_adv_tx_desc *curr =
@@ -125,11 +125,11 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i == 0 || nic_i == report_frequency) ?
E1000_ADVTXD_DCMD_RS : 0;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
- netmap_reload_map(txr->txtag, txbuf->map, addr);
+ netmap_reload_map(na, txr->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -171,7 +171,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
- nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
@@ -190,10 +190,10 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Reconcile kernel and user view of the receive ring.
*/
static int
-igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -204,7 +204,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
if (head > lim)
return netmap_ring_reinit(kring);
@@ -251,17 +251,17 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
- netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
+ netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->wb.upper.status_error = 0;
diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h
index 4fce5c9..272f02c 100644
--- a/sys/dev/netmap/if_lem_netmap.h
+++ b/sys/dev/netmap/if_lem_netmap.h
@@ -39,6 +39,7 @@
#include <vm/pmap.h> /* vtophys ? */
#include <dev/netmap/netmap_kern.h>
+extern int netmap_adaptive_io;
/*
* Register/unregister. We are already under netmap lock.
@@ -84,10 +85,10 @@ lem_netmap_reg(struct netmap_adapter *na, int onoff)
* Reconcile kernel and user view of the transmit ring.
*/
static int
-lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -98,6 +99,10 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+ struct paravirt_csb *csb = adapter->csb;
+ uint64_t *csbd = (uint64_t *)(csb + 1);
+#endif /* NIC_PARAVIRT */
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -108,12 +113,25 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* we have new packets to send */
+#ifdef NIC_PARAVIRT
+ int do_kick = 0;
+ uint64_t t = 0; // timestamp
+ int n = head - nm_i;
+ if (n < 0)
+ n += lim + 1;
+ if (csb) {
+ t = rdtsc(); /* last timestamp */
+ csbd[16] += t - csbd[0]; /* total Wg */
+ csbd[17] += n; /* Wg count */
+ csbd[0] = t;
+ }
+#endif /* NIC_PARAVIRT */
nic_i = netmap_idx_k2n(kring, nm_i);
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i];
@@ -122,12 +140,12 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i == 0 || nic_i == report_frequency) ?
E1000_TXD_CMD_RS : 0;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
curr->buffer_addr = htole64(paddr);
- netmap_reload_map(adapter->txtag, txbuf->map, addr);
+ netmap_reload_map(na, adapter->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -140,6 +158,7 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
+ // XXX might try an early kick
}
kring->nr_hwcur = head;
@@ -147,8 +166,38 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+#ifdef NIC_PARAVIRT
+ /* set unconditionally, then also kick if needed */
+ if (csb) {
+ t = rdtsc();
+ if (csb->host_need_txkick == 2) {
+ /* can compute an update of delta */
+ int64_t delta = t - csbd[3];
+ if (delta < 0)
+ delta = -delta;
+ if (csbd[8] == 0 || delta < csbd[8]) {
+ csbd[8] = delta;
+ csbd[9]++;
+ }
+ csbd[10]++;
+ }
+ csb->guest_tdt = nic_i;
+ csbd[18] += t - csbd[0]; // total wp
+ csbd[19] += n;
+ }
+ if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1))
+ do_kick = 1;
+ if (do_kick)
+#endif /* NIC_PARAVIRT */
/* (re)start the tx unit up to slot nic_i (excluded) */
E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i);
+#ifdef NIC_PARAVIRT
+ if (do_kick) {
+ uint64_t t1 = rdtsc();
+ csbd[20] += t1 - t; // total Np
+ csbd[21]++;
+ }
+#endif /* NIC_PARAVIRT */
}
/*
@@ -157,6 +206,93 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
kring->last_reclaim = ticks;
/* record completed transmissions using TDH */
+#ifdef NIC_PARAVIRT
+ /* host updates tdh unconditionally, and we have
+ * no side effects on reads, so we can read from there
+ * instead of exiting.
+ */
+ if (csb) {
+ static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0;
+ u_int x = adapter->next_tx_to_clean;
+ csbd[19]++; // XXX count reclaims
+ nic_i = csb->host_tdh;
+ if (csb->guest_csb_on) {
+ if (nic_i == x) {
+ bad++;
+ csbd[24]++; // failed reclaims
+ /* no progress, request kick and retry */
+ csb->guest_need_txkick = 1;
+ mb(); // XXX barrier
+ nic_i = csb->host_tdh;
+ } else {
+ good++;
+ }
+ if (nic_i != x) {
+ csb->guest_need_txkick = 2;
+ if (nic_i == csb->guest_tdt)
+ drain++;
+ else
+ nodrain++;
+#if 1
+ if (netmap_adaptive_io) {
+ /* new mechanism: last half ring (or so)
+ * released one slot at a time.
+ * This effectively makes the system spin.
+ *
+ * Take next_to_clean + 1 as a reference.
+ * tdh must be ahead or equal
+ * On entry, the logical order is
+ * x < tdh = nic_i
+ * We first push tdh up to avoid wraps.
+ * The limit is tdh-ll (half ring).
+ * if tdh-256 < x we report x;
+ * else we report tdh-256
+ */
+ u_int tdh = nic_i;
+ u_int ll = csbd[15];
+ u_int delta = lim/8;
+ if (netmap_adaptive_io == 2 || ll > delta)
+ csbd[15] = ll = delta;
+ else if (netmap_adaptive_io == 1 && ll > 1) {
+ csbd[15]--;
+ }
+
+ if (nic_i >= kring->nkr_num_slots) {
+ RD(5, "bad nic_i %d on input", nic_i);
+ }
+ x = nm_next(x, lim);
+ if (tdh < x)
+ tdh += lim + 1;
+ if (tdh <= x + ll) {
+ nic_i = x;
+ csbd[25]++; //report n + 1;
+ } else {
+ tdh = nic_i;
+ if (tdh < ll)
+ tdh += lim + 1;
+ nic_i = tdh - ll;
+ csbd[26]++; // report tdh - ll
+ }
+ }
+#endif
+ } else {
+ /* we stop, count whether we are idle or not */
+ int bh_active = csb->host_need_txkick & 2 ? 4 : 0;
+ csbd[27+ csb->host_need_txkick]++;
+ if (netmap_adaptive_io == 1) {
+ if (bh_active && csbd[15] > 1)
+ csbd[15]--;
+ else if (!bh_active && csbd[15] < lim/2)
+ csbd[15]++;
+ }
+ bad--;
+ fail++;
+ }
+ }
+ RD(1, "drain %d nodrain %d good %d retry %d fail %d",
+ drain, nodrain, good, bad, fail);
+ } else
+#endif /* NIC_PARAVIRT */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
@@ -176,10 +312,10 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Reconcile kernel and user view of the receive ring.
*/
static int
-lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -190,10 +326,21 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+ struct paravirt_csb *csb = adapter->csb;
+ uint32_t csb_mode = csb && csb->guest_csb_on;
+ uint32_t do_host_rxkick = 0;
+#endif /* NIC_PARAVIRT */
if (head > lim)
return netmap_ring_reinit(kring);
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ force_update = 1;
+ csb->guest_need_rxkick = 0;
+ }
+#endif /* NIC_PARAVIRT */
/* XXX check sync modes */
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -212,11 +359,28 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
uint32_t staterr = le32toh(curr->status);
int len;
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if ((staterr & E1000_RXD_STAT_DD) == 0) {
+ /* don't bother to retry if more than 1 pkt */
+ if (n > 1)
+ break;
+ csb->guest_need_rxkick = 1;
+ wmb();
+ staterr = le32toh(curr->status);
+ if ((staterr & E1000_RXD_STAT_DD) == 0) {
+ break;
+ } else { /* we are good */
+ csb->guest_need_rxkick = 0;
+ }
+ }
+ } else
+#endif /* NIC_PARAVIRT */
if ((staterr & E1000_RXD_STAT_DD) == 0)
break;
len = le16toh(curr->length) - 4; // CRC
if (len < 0) {
- D("bogus pkt size %d nic idx %d", len, nic_i);
+ RD(5, "bogus pkt (%d) size %d nic idx %d", n, len, nic_i);
len = 0;
}
ring->slot[nm_i].len = len;
@@ -228,6 +392,18 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nic_i = nm_next(nic_i, lim);
}
if (n) { /* update the state variables */
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if (n > 1) {
+ /* leave one spare buffer so we avoid rxkicks */
+ nm_i = nm_prev(nm_i, lim);
+ nic_i = nm_prev(nic_i, lim);
+ n--;
+ } else {
+ csb->guest_need_rxkick = 1;
+ }
+ }
+#endif /* NIC_PARAVIRT */
ND("%d new packets at nic %d nm %d tail %d",
n,
adapter->next_rx_desc_to_check,
@@ -249,23 +425,27 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i];
struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
curr->buffer_addr = htole64(paddr);
- netmap_reload_map(adapter->rxtag, rxbuf->map, addr);
+ netmap_reload_map(na, adapter->rxtag, rxbuf->map, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->status = 0;
bus_dmamap_sync(adapter->rxtag, rxbuf->map,
BUS_DMASYNC_PREREAD);
+#ifdef NIC_PARAVIRT
+ if (csb_mode && csb->host_rxkick_at == nic_i)
+ do_host_rxkick = 1;
+#endif /* NIC_PARAVIRT */
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
@@ -277,6 +457,12 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* so move nic_i back by one unit
*/
nic_i = nm_prev(nic_i, lim);
+#ifdef NIC_PARAVIRT
+ /* set unconditionally, then also kick if needed */
+ if (csb)
+ csb->guest_rdt = nic_i;
+ if (!csb_mode || do_host_rxkick)
+#endif /* NIC_PARAVIRT */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i);
}
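
The RX side of NIC_PARAVIRT mirrors the TX handshake above: the guest always publishes its new tail in csb->guest_rdt, and only writes the real RDT register when CSB mode is off or when the refill loop passed the slot the host marked in host_rxkick_at. A sketch with the same kind of partial, assumed CSB layout as before:

    #include <stdint.h>
    #include <stddef.h>

    struct toy_rx_csb {                  /* partial, assumed layout */
            uint32_t guest_csb_on;       /* CSB mode enabled */
            uint32_t guest_rdt;          /* guest's shadow of the RX tail */
            uint32_t host_rxkick_at;     /* slot where the host wants a kick */
    };

    /* End of the refill loop: publish the tail, kick only on request. */
    static void
    toy_publish_rdt(struct toy_rx_csb *csb, uint32_t rdt, int do_host_rxkick,
        void (*write_rdt_reg)(uint32_t))
    {
            int csb_mode = (csb != NULL) && csb->guest_csb_on;

            if (csb != NULL)
                    csb->guest_rdt = rdt;  /* the host polls this shadow copy */
            if (!csb_mode || do_host_rxkick)
                    write_rdt_reg(rdt);    /* host idle, or kick requested */
    }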
diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h
index 10abe4f..98f6143 100644
--- a/sys/dev/netmap/if_re_netmap.h
+++ b/sys/dev/netmap/if_re_netmap.h
@@ -65,10 +65,10 @@ re_netmap_reg(struct netmap_adapter *na, int onoff)
* Reconcile kernel and user view of the transmit ring.
*/
static int
-re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+re_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -96,14 +96,14 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[nic_i];
int cmd = slot->len | RL_TDESC_CMD_EOF |
RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (nic_i == lim) /* mark end of ring */
cmd |= RL_TDESC_CMD_EOR;
@@ -112,7 +112,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* buffer has changed, reload map */
desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
- netmap_reload_map(sc->rl_ldata.rl_tx_mtag,
+ netmap_reload_map(na, sc->rl_ldata.rl_tx_mtag,
txd[nic_i].tx_dmamap, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -169,10 +169,10 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* Reconcile kernel and user view of the receive ring.
*/
static int
-re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+re_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -240,12 +240,12 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[nic_i];
- int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN;
+ int cmd = NETMAP_BUF_SIZE(na) | RL_RDESC_CMD_OWN;
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (nic_i == lim) /* mark end of ring */
@@ -255,7 +255,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* buffer has changed, reload map */
desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
- netmap_reload_map(sc->rl_ldata.rl_rx_mtag,
+ netmap_reload_map(na, sc->rl_ldata.rl_rx_mtag,
rxd[nic_i].rx_dmamap, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
@@ -296,14 +296,10 @@ re_netmap_tx_init(struct rl_softc *sc)
struct netmap_adapter *na = NA(sc->rl_ifp);
struct netmap_slot *slot;
- if (!na || !(na->na_flags & NAF_NATIVE_ON)) {
- return;
- }
-
slot = netmap_reset(na, NR_TX, 0, 0);
- /* slot is NULL if we are not in netmap mode */
+ /* slot is NULL if we are not in native netmap mode */
if (!slot)
- return; // XXX cannot happen
+ return;
/* in netmap mode, overwrite addresses and maps */
txd = sc->rl_ldata.rl_tx_desc;
desc = sc->rl_ldata.rl_tx_list;
@@ -313,11 +309,11 @@ re_netmap_tx_init(struct rl_softc *sc)
for (i = 0; i < n; i++) {
uint64_t paddr;
int l = netmap_idx_n2k(&na->tx_rings[0], i);
- void *addr = PNMB(slot + l, &paddr);
+ void *addr = PNMB(na, slot + l, &paddr);
desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
- netmap_load_map(sc->rl_ldata.rl_tx_mtag,
+ netmap_load_map(na, sc->rl_ldata.rl_tx_mtag,
txd[i].tx_dmamap, addr);
}
}
@@ -344,15 +340,15 @@ re_netmap_rx_init(struct rl_softc *sc)
uint64_t paddr;
uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i);
- addr = PNMB(slot + nm_i, &paddr);
+ addr = PNMB(na, slot + nm_i, &paddr);
- netmap_reload_map(sc->rl_ldata.rl_rx_mtag,
+ netmap_reload_map(na, sc->rl_ldata.rl_rx_mtag,
sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr);
bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag,
sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD);
desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr));
desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr));
- cmdstat = NETMAP_BUF_SIZE;
+ cmdstat = NETMAP_BUF_SIZE(na);
if (nic_i == n - 1) /* mark the end of ring */
cmdstat |= RL_RDESC_CMD_EOR;
if (nic_i < max_avail)
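
Related to the PNMB() change noted earlier: NETMAP_BUF_SIZE and NETMAP_BUF_BASE also become per-adapter accessors, since buffer geometry now belongs to the adapter's memory allocator rather than to globals. The shape of such an accessor, continuing the illustrative toy_adapter from the earlier sketch:

    /* Illustrative only; the real NETMAP_BUF_SIZE(na) macro reads the
     * adapter's memory descriptor in netmap_kern.h. */
    static inline uint32_t
    toy_buf_size(const struct toy_adapter *na)
    {
            return (na->buf_size);
    }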
diff --git a/sys/dev/netmap/if_vtnet_netmap.h b/sys/dev/netmap/if_vtnet_netmap.h
new file mode 100644
index 0000000..63f4fa9
--- /dev/null
+++ b/sys/dev/netmap/if_vtnet_netmap.h
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2014 Vincenzo Maffione, Luigi Rizzo. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#include <net/netmap.h>
+#include <sys/selinfo.h>
+#include <vm/vm.h>
+#include <vm/pmap.h> /* vtophys ? */
+#include <dev/netmap/netmap_kern.h>
+
+
+#define SOFTC_T vtnet_softc
+
+/* Free all the unused buffers in the RX and TX virtqueues.
+ * This function is called when entering and exiting netmap mode.
+ * - buffers queued by the virtio driver return an mbuf pointer
+ * and need to be freed;
+ * - buffers queued by netmap return the txq/rxq pointer, and need no work
+ */
+static void
+vtnet_netmap_free_bufs(struct SOFTC_T* sc)
+{
+ int i, nmb = 0, n = 0, last;
+
+ for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
+ struct vtnet_rxq *rxq = &sc->vtnet_rxqs[i];
+ struct virtqueue *vq;
+ struct mbuf *m;
+ struct vtnet_txq *txq = &sc->vtnet_txqs[i];
+ struct vtnet_tx_header *txhdr;
+
+ last = 0;
+ vq = rxq->vtnrx_vq;
+ while ((m = virtqueue_drain(vq, &last)) != NULL) {
+ n++;
+ if (m != (void *)rxq)
+ m_freem(m);
+ else
+ nmb++;
+ }
+
+ last = 0;
+ vq = txq->vtntx_vq;
+ while ((txhdr = virtqueue_drain(vq, &last)) != NULL) {
+ n++;
+ if (txhdr != (void *)txq) {
+ m_freem(txhdr->vth_mbuf);
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+ } else
+ nmb++;
+ }
+ }
+ D("freed %d mbufs, %d netmap bufs on %d queues",
+ n - nmb, nmb, i);
+}
+
+/* Register and unregister. */
+static int
+vtnet_netmap_reg(struct netmap_adapter *na, int onoff)
+{
+ struct ifnet *ifp = na->ifp;
+ struct SOFTC_T *sc = ifp->if_softc;
+
+ VTNET_CORE_LOCK(sc);
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+ /* enable or disable flags and callbacks in na and ifp */
+ if (onoff) {
+ nm_set_native_flags(na);
+ } else {
+ nm_clear_native_flags(na);
+ }
+ /* drain queues so netmap and native drivers
+ * do not interfere with each other
+ */
+ vtnet_netmap_free_bufs(sc);
+ vtnet_init_locked(sc); /* also enable intr */
+ VTNET_CORE_UNLOCK(sc);
+ return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
+}
+
+
+/* Reconcile kernel and user view of the transmit ring. */
+static int
+vtnet_netmap_txsync(struct netmap_kring *kring, int flags)
+{
+ struct netmap_adapter *na = kring->na;
+ struct ifnet *ifp = na->ifp;
+ struct netmap_ring *ring = kring->ring;
+ u_int ring_nr = kring->ring_id;
+ u_int nm_i; /* index into the netmap ring */
+ u_int nic_i; /* index into the NIC ring */
+ u_int n;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = kring->rhead;
+
+ /* device-specific */
+ struct SOFTC_T *sc = ifp->if_softc;
+ struct vtnet_txq *txq = &sc->vtnet_txqs[ring_nr];
+ struct virtqueue *vq = txq->vtntx_vq;
+
+ /*
+ * First part: process new packets to send.
+ */
+ rmb();
+
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) { /* we have new packets to send */
+ struct sglist *sg = txq->vtntx_sg;
+
+ nic_i = netmap_idx_k2n(kring, nm_i);
+ for (n = 0; nm_i != head; n++) {
+ /* we use an empty header here */
+ static struct virtio_net_hdr_mrg_rxbuf hdr;
+ struct netmap_slot *slot = &ring->slot[nm_i];
+ u_int len = slot->len;
+ uint64_t paddr;
+ void *addr = PNMB(na, slot, &paddr);
+ int err;
+
+ NM_CHECK_ADDR_LEN(na, addr, len);
+
+ slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
+ /* Initialize the scatterlist, expose it to the hypervisor,
+ * and kick the hypervisor (if necessary).
+ */
+ sglist_reset(sg); // cheap
+ // if vtnet_hdr_size > 0 ...
+ err = sglist_append(sg, &hdr, sc->vtnet_hdr_size);
+ // XXX later, support multi segment
+ err = sglist_append_phys(sg, paddr, len);
+ /* use the txq as the cookie */
+ err = virtqueue_enqueue(vq, txq, sg, sg->sg_nseg, 0);
+ if (unlikely(err < 0)) {
+ D("virtqueue_enqueue failed");
+ break;
+ }
+
+ nm_i = nm_next(nm_i, lim);
+ nic_i = nm_next(nic_i, lim);
+ }
+ /* Update hwcur depending on where we stopped. */
+ kring->nr_hwcur = nm_i; /* note we might break early */
+
+ /* No more free TX slots? Ask the hypervisor for notifications,
+ * possibly only when a considerable amount of work has been
+ * done.
+ */
+ ND(3,"sent %d packets, hwcur %d", n, nm_i);
+ virtqueue_disable_intr(vq);
+ virtqueue_notify(vq);
+ } else {
+ if (ring->head != ring->tail)
+ ND(5, "pure notify ? head %d tail %d nused %d %d",
+ ring->head, ring->tail, virtqueue_nused(vq),
+ (virtqueue_dump(vq), 1));
+ virtqueue_notify(vq);
+ virtqueue_enable_intr(vq); // like postpone with 0
+ }
+
+
+ /* Free used slots. We only consider our own used buffers, recognized
+ * by the token we passed to virtqueue_add_outbuf.
+ */
+ n = 0;
+ for (;;) {
+ struct vtnet_tx_header *txhdr = virtqueue_dequeue(vq, NULL);
+ if (txhdr == NULL)
+ break;
+ if (likely(txhdr == (void *)txq)) {
+ n++;
+ if (virtqueue_nused(vq) < 32) { // XXX slow release
+ break;
+ }
+ } else { /* leftover from previous transmission */
+ m_freem(txhdr->vth_mbuf);
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+ }
+ }
+ if (n) {
+ kring->nr_hwtail += n;
+ if (kring->nr_hwtail > lim)
+ kring->nr_hwtail -= lim + 1;
+ }
+ if (nm_i != kring->nr_hwtail /* && vtnet_txq_below_threshold(txq) == 0*/) {
+ ND(3, "disable intr, hwcur %d", nm_i);
+ virtqueue_disable_intr(vq);
+ } else {
+ ND(3, "enable intr, hwcur %d", nm_i);
+ virtqueue_postpone_intr(vq, VQ_POSTPONE_SHORT);
+ }
+
+//out:
+ nm_txsync_finalize(kring);
+
+ return 0;
+}
+
+static int
+vtnet_refill_rxq(struct netmap_kring *kring, u_int nm_i, u_int head)
+{
+ struct netmap_adapter *na = kring->na;
+ struct ifnet *ifp = na->ifp;
+ struct netmap_ring *ring = kring->ring;
+ u_int ring_nr = kring->ring_id;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int n;
+
+ /* device-specific */
+ struct SOFTC_T *sc = ifp->if_softc;
+ struct vtnet_rxq *rxq = &sc->vtnet_rxqs[ring_nr];
+ struct virtqueue *vq = rxq->vtnrx_vq;
+
+ /* use a local sglist, default might be short */
+ struct sglist_seg ss[2];
+ struct sglist sg = { ss, 0, 0, 2 };
+
+ for (n = 0; nm_i != head; n++) {
+ static struct virtio_net_hdr_mrg_rxbuf hdr;
+ struct netmap_slot *slot = &ring->slot[nm_i];
+ uint64_t paddr;
+ void *addr = PNMB(na, slot, &paddr);
+ int err = 0;
+
+ if (addr == NETMAP_BUF_BASE(na)) { /* bad buf */
+ if (netmap_ring_reinit(kring))
+ return -1;
+ }
+
+ slot->flags &= ~NS_BUF_CHANGED;
+ sglist_reset(&sg); // cheap
+ err = sglist_append(&sg, &hdr, sc->vtnet_hdr_size);
+ err = sglist_append_phys(&sg, paddr, NETMAP_BUF_SIZE(na));
+ /* writable for the host */
+ err = virtqueue_enqueue(vq, rxq, &sg, 0, sg.sg_nseg);
+ if (err < 0) {
+ D("virtqueue_enqueue failed");
+ break;
+ }
+ nm_i = nm_next(nm_i, lim);
+ }
+ return nm_i;
+}
+
+/* Reconcile kernel and user view of the receive ring. */
+static int
+vtnet_netmap_rxsync(struct netmap_kring *kring, int flags)
+{
+ struct netmap_adapter *na = kring->na;
+ struct ifnet *ifp = na->ifp;
+ struct netmap_ring *ring = kring->ring;
+ u_int ring_nr = kring->ring_id;
+ u_int nm_i; /* index into the netmap ring */
+ // u_int nic_i; /* index into the NIC ring */
+ u_int n;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = nm_rxsync_prologue(kring);
+ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+
+ /* device-specific */
+ struct SOFTC_T *sc = ifp->if_softc;
+ struct vtnet_rxq *rxq = &sc->vtnet_rxqs[ring_nr];
+ struct virtqueue *vq = rxq->vtnrx_vq;
+
+ /* XXX netif_carrier_ok ? */
+
+ if (head > lim)
+ return netmap_ring_reinit(kring);
+
+ rmb();
+ /*
+ * First part: import newly received packets.
+ * Only accept our own buffers (matching the token). We should only get
+ * matching buffers, because of vtnet_netmap_free_bufs()
+ * and vtnet_netmap_init_rx_buffers().
+ */
+ if (netmap_no_pendintr || force_update) {
+ uint16_t slot_flags = kring->nkr_slot_flags;
+ struct netmap_adapter *token;
+
+ nm_i = kring->nr_hwtail;
+ n = 0;
+ for (;;) {
+ int len;
+ token = virtqueue_dequeue(vq, &len);
+ if (token == NULL)
+ break;
+ if (likely(token == (void *)rxq)) {
+ ring->slot[nm_i].len = len;
+ ring->slot[nm_i].flags = slot_flags;
+ nm_i = nm_next(nm_i, lim);
+ n++;
+ } else {
+ D("This should not happen");
+ }
+ }
+ kring->nr_hwtail = nm_i;
+ kring->nr_kflags &= ~NKR_PENDINTR;
+ }
+ ND("[B] h %d c %d hwcur %d hwtail %d",
+ ring->head, ring->cur, kring->nr_hwcur,
+ kring->nr_hwtail);
+
+ /*
+ * Second part: skip past packets that userspace has released.
+ */
+ nm_i = kring->nr_hwcur; /* netmap ring index */
+ if (nm_i != head) {
+ int err = vtnet_refill_rxq(kring, nm_i, head);
+ if (err < 0)
+ return 1;
+ kring->nr_hwcur = err;
+ virtqueue_notify(vq);
+ /* After draining the queue may need an intr from the hypervisor */
+ vtnet_rxq_enable_intr(rxq);
+ }
+
+ /* tell userspace that there might be new packets. */
+ nm_rxsync_finalize(kring);
+
+ ND("[C] h %d c %d t %d hwcur %d hwtail %d",
+ ring->head, ring->cur, ring->tail,
+ kring->nr_hwcur, kring->nr_hwtail);
+
+ return 0;
+}
+
+
+/* Make RX virtqueues buffers pointing to netmap buffers. */
+static int
+vtnet_netmap_init_rx_buffers(struct SOFTC_T *sc)
+{
+ struct ifnet *ifp = sc->vtnet_ifp;
+ struct netmap_adapter* na = NA(ifp);
+ unsigned int r;
+
+ if (!nm_native_on(na))
+ return 0;
+ for (r = 0; r < na->num_rx_rings; r++) {
+ struct netmap_kring *kring = &na->rx_rings[r];
+ struct vtnet_rxq *rxq = &sc->vtnet_rxqs[r];
+ struct virtqueue *vq = rxq->vtnrx_vq;
+ struct netmap_slot* slot;
+ int err = 0;
+
+ slot = netmap_reset(na, NR_RX, r, 0);
+ if (!slot) {
+ D("strange, null netmap ring %d", r);
+ return 0;
+ }
+ /* Add up to na->num_rx_desc-1 buffers to this RX virtqueue.
+ * It's important to leave one virtqueue slot free, otherwise
+ * we can run into ring->cur/ring->tail wraparounds.
+ */
+ err = vtnet_refill_rxq(kring, 0, na->num_rx_desc-1);
+ if (err < 0)
+ return 0;
+ virtqueue_notify(vq);
+ }
+
+ return 1;
+}
+
+/* Update the virtio-net device configuration. The number of queues can
+ * change dynamically, via 'ethtool --set-channels $IFNAME combined $N'.
+ * This is actually the only way virtio-net can currently enable
+ * the multiqueue mode.
+ * XXX note that we seem to lose packets if the netmap ring has more
+ * slots than the queue
+ */
+static int
+vtnet_netmap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
+ u_int *rxr, u_int *rxd)
+{
+ struct ifnet *ifp = na->ifp;
+ struct SOFTC_T *sc = ifp->if_softc;
+
+ *txr = *rxr = sc->vtnet_max_vq_pairs;
+ *rxd = 512; // sc->vtnet_rx_nmbufs;
+ *txd = *rxd; // XXX
+ D("vtnet config txq=%d, txd=%d rxq=%d, rxd=%d",
+ *txr, *txd, *rxr, *rxd);
+
+ return 0;
+}
+
+static void
+vtnet_netmap_attach(struct SOFTC_T *sc)
+{
+ struct netmap_adapter na;
+
+ bzero(&na, sizeof(na));
+
+ na.ifp = sc->vtnet_ifp;
+ na.num_tx_desc = 1024;// sc->vtnet_rx_nmbufs;
+ na.num_rx_desc = 1024; // sc->vtnet_rx_nmbufs;
+ na.nm_register = vtnet_netmap_reg;
+ na.nm_txsync = vtnet_netmap_txsync;
+ na.nm_rxsync = vtnet_netmap_rxsync;
+ na.nm_config = vtnet_netmap_config;
+ na.num_tx_rings = na.num_rx_rings = sc->vtnet_max_vq_pairs;
+ D("max rings %d", sc->vtnet_max_vq_pairs);
+ netmap_attach(&na);
+
+ D("virtio attached txq=%d, txd=%d rxq=%d, rxd=%d",
+ na.num_tx_rings, na.num_tx_desc,
+ na.num_rx_rings, na.num_rx_desc);
+}
+/* end of file */
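
One convention in this new file deserves a note: vtnet distinguishes netmap-owned buffers from native ones by the cookie passed to virtqueue_enqueue(). The netmap paths enqueue the queue pointer itself (rxq or txq) as the token, so vtnet_netmap_free_bufs() and the dequeue loops can classify each completion with a single pointer compare. A stand-alone sketch of that drain logic (toy_dequeue() stands in for virtqueue_dequeue()):

    #include <stddef.h>

    struct toy_queue {
            int id;                      /* any per-queue state */
    };

    /* Drain loop: netmap tokens match the queue, native ones carry data. */
    static void
    toy_drain(struct toy_queue *q, void *(*toy_dequeue)(struct toy_queue *),
        void (*free_native)(void *))
    {
            void *token;

            while ((token = toy_dequeue(q)) != NULL) {
                    if (token == (void *)q)
                            continue;    /* netmap buffer: nothing to free */
                    free_native(token);  /* an mbuf or a vtnet_tx_header */
            }
    }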
diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h
index a617cc4..3dc628a 100644
--- a/sys/dev/netmap/ixgbe_netmap.h
+++ b/sys/dev/netmap/ixgbe_netmap.h
@@ -153,10 +153,10 @@ ixgbe_netmap_reg(struct netmap_adapter *na, int onoff)
* methods should be handled by the individual drivers.
*/
static int
-ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+ixgbe_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -171,7 +171,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
int reclaim_tx;
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
@@ -223,7 +223,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i];
@@ -236,11 +236,11 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
__builtin_prefetch(&ring->slot[nm_i + 1]);
__builtin_prefetch(&txr->tx_buffers[nic_i + 1]);
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
- netmap_reload_map(txr->txtag, txbuf->map, addr);
+ netmap_reload_map(na, txr->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -309,7 +309,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* REPORT_STATUS in a few slots so TDH is the only
* good way.
*/
- nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr));
+ nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(kring->ring_id));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
@@ -341,10 +341,10 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
* of whether or not we received an interrupt.
*/
static int
-ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+ixgbe_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -355,7 +355,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
if (head > lim)
return netmap_ring_reinit(kring);
@@ -425,17 +425,17 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i];
struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
- netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
+ netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->wb.upper.status_error = 0;
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index e8b6c5a..0fd362f 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -124,6 +124,223 @@ ports attached to the switch)
*/
+
+/* --- internals ----
+ *
+ * Roadmap to the code that implements the above.
+ *
+ * > 1. a process/thread issues one or more open() on /dev/netmap, to create
+ * > select()able file descriptor on which events are reported.
+ *
+ * Internally, we allocate a netmap_priv_d structure, that will be
+ * initialized on ioctl(NIOCREGIF).
+ *
+ * os-specific:
+ * FreeBSD: netmap_open (netmap_freebsd.c). The priv is
+ * per-thread.
+ * linux: linux_netmap_open (netmap_linux.c). The priv is
+ * per-open.
+ *
+ * > 2. on each descriptor, the process issues an ioctl() to identify
+ * > the interface that should report events to the file descriptor.
+ *
+ * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
+ * Most important things happen in netmap_get_na() and
+ * netmap_do_regif(), called from there. Additional details can be
+ * found in the comments above those functions.
+ *
+ * In all cases, this action creates/takes-a-reference-to a
+ * netmap_*_adapter describing the port, and allocates a netmap_if
+ * and all necessary netmap rings, filling them with netmap buffers.
+ *
+ * In this phase, the sync callbacks for each ring are set (these are used
+ * in steps 5 and 6 below). The callbacks depend on the type of adapter.
+ * The adapter creation/initialization code puts them in the
+ * netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they
+ * are copied from there to the netmap_kring's during netmap_do_regif(), by
+ * the nm_krings_create() callback. All the nm_krings_create callbacks
+ * actually call netmap_krings_create() to perform this and the other
+ * common stuff. netmap_krings_create() also takes care of the host rings,
+ * if needed, by setting their sync callbacks appropriately.
+ *
+ * Additional actions depend on the kind of netmap_adapter that has been
+ * registered:
+ *
+ * - netmap_hw_adapter: [netmap.c]
+ * This is a system netdev/ifp with native netmap support.
+ * The ifp is detached from the host stack by redirecting:
+ * - transmissions (from the network stack) to netmap_transmit()
+ * - receive notifications to the nm_notify() callback for
+ * this adapter. The callback is normally netmap_notify(), unless
+ * the ifp is attached to a bridge using bwrap, in which case it
+ * is netmap_bwrap_intr_notify().
+ *
+ * - netmap_generic_adapter: [netmap_generic.c]
+ * A system netdev/ifp without native netmap support.
+ *
+ * (the decision about native/non native support is taken in
+ * netmap_get_hw_na(), called by netmap_get_na())
+ *
+ * - netmap_vp_adapter [netmap_vale.c]
+ * Returned by netmap_get_bdg_na().
+ * This is a persistent or ephemeral VALE port. Ephemeral ports
+ * are created on the fly if they don't already exist, and are
+ * always attached to a bridge.
+ * Persistent VALE ports must be created separately, and are
+ * then attached like normal NICs. The NIOCREGIF we are examining
+ * will find them only if they had previously been created and
+ * attached (see VALE_CTL below).
+ *
+ * - netmap_pipe_adapter [netmap_pipe.c]
+ * Returned by netmap_get_pipe_na().
+ * Both pipe ends are created, if they didn't already exist.
+ *
+ * - netmap_monitor_adapter [netmap_monitor.c]
+ * Returned by netmap_get_monitor_na().
+ * If successful, the nm_sync callbacks of the monitored adapter
+ * will be intercepted by the returned monitor.
+ *
+ * - netmap_bwrap_adapter [netmap_vale.c]
+ * Cannot be obtained in this way, see VALE_CTL below
+ *
+ *
+ * os-specific:
+ * linux: we first go through linux_netmap_ioctl() to
+ * adapt the FreeBSD interface to the linux one.
+ *
+ *
+ * > 3. on each descriptor, the process issues an mmap() request to
+ * > map the shared memory region within the process' address space.
+ * > The list of interesting queues is indicated by a location in
+ * > the shared memory region.
+ *
+ * os-specific:
+ * FreeBSD: netmap_mmap_single (netmap_freebsd.c).
+ * linux: linux_netmap_mmap (netmap_linux.c).
+ *
+ * > 4. using the functions in the netmap(4) userspace API, a process
+ * > can look up the occupation state of a queue, access memory buffers,
+ * > and retrieve received packets or enqueue packets to transmit.
+ *
+ * these actions do not involve the kernel.
+ *
+ * > 5. using some ioctl()s the process can synchronize the userspace view
+ * > of the queue with the actual status in the kernel. This includes both
+ * > receiving the notification of new packets, and transmitting new
+ * > packets on the output interface.
+ *
+ * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
+ * cases. They invoke the nm_sync callbacks on the netmap_kring
+ * structures, as initialized in step 2 and maybe later modified
+ * by a monitor. Monitors, however, will always call the original
+ * callback before doing anything else.
+ *
+ *
+ * > 6. select() or poll() can be used to wait for events on individual
+ * > transmit or receive queues (or all queues for a given interface).
+ *
+ * Implemented in netmap_poll(). This will call the same nm_sync()
+ * callbacks as in step 5 above.
+ *
+ * os-specific:
+ * linux: we first go through linux_netmap_poll() to adapt
+ * the FreeBSD interface to the linux one.
+ *
+ *
+ * ---- VALE_CTL -----
+ *
+ * VALE switches are controlled by issuing a NIOCREGIF with a non-null
+ * nr_cmd in the nmreq structure. These subcommands are handled by
+ * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
+ * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
+ * subcommands, respectively.
+ *
+ * Any network interface known to the system (including a persistent VALE
+ * port) can be attached to a VALE switch by issuing the
+ * NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
+ * look exactly like ephemeral VALE ports (as created in step 2 above). The
+ * attachment of other interfaces, instead, requires the creation of a
+ * netmap_bwrap_adapter. Moreover, the attached interface must be put in
+ * netmap mode. This may require the creation of a netmap_generic_adapter if
+ * we have no native support for the interface, or if generic adapters have
+ * been forced by sysctl.
+ *
+ * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
+ * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
+ * callback. In the case of the bwrap, the callback creates the
+ * netmap_bwrap_adapter. The initialization of the bwrap is then
+ * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
+ * callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
+ * A generic adapter for the wrapped ifp will be created if needed, when
+ * netmap_get_bdg_na() calls netmap_get_hw_na().
+ *
+ *
+ * ---- DATAPATHS -----
+ *
+ * -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
+ *
+ * na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
+ *
+ * - tx from netmap userspace:
+ * concurrently:
+ * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
+ * kring->nm_sync() == DEVICE_netmap_txsync()
+ * 2) device interrupt handler
+ * na->nm_notify() == netmap_notify()
+ * - rx from netmap userspace:
+ * concurrently:
+ * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
+ * kring->nm_sync() == DEVICE_netmap_rxsync()
+ * 2) device interrupt handler
+ * na->nm_notify() == netmap_notify()
+ * - tx from host stack
+ * concurrently:
+ * 1) host stack
+ * netmap_transmit()
+ * na->nm_notify == netmap_notify()
+ * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
+ * kring->nm_sync() == netmap_rxsync_from_host_compat
+ * netmap_rxsync_from_host(na, NULL, NULL)
+ * - tx to host stack
+ * ioctl(NIOCTXSYNC)/netmap_poll() in process context
+ * kring->nm_sync() == netmap_txsync_to_host_compat
+ * netmap_txsync_to_host(na)
+ * NM_SEND_UP()
+ * FreeBSD: na->if_input() == ?? XXX
+ * linux: netif_rx() with NM_MAGIC_PRIORITY_RX
+ *
+ *
+ *
+ * -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
+ *
+ *
+ *
+ * -= VALE PORT =-
+ *
+ *
+ *
+ * -= NETMAP PIPE =-
+ *
+ *
+ *
+ * -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
+ *
+ *
+ *
+ * -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
+ *
+ *
+ *
+ * -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
+ *
+ *
+ *
+ * -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
+ *
+ *
+ *
+ */
+
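As a concrete companion to steps 1-6 above, a minimal userspace session could look as follows. This is a sketch against the nmreq-based API used by this code; error handling is omitted and the interface name "em0" is only an example.

    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <string.h>
    #include <net/netmap.h>
    #include <net/netmap_user.h>

    static int
    nm_example(void)
    {
            int fd = open("/dev/netmap", O_RDWR);             /* step 1 */
            struct nmreq req;

            memset(&req, 0, sizeof(req));
            req.nr_version = NETMAP_API;
            strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
            ioctl(fd, NIOCREGIF, &req);                       /* step 2 */
            void *mem = mmap(NULL, req.nr_memsize,            /* step 3 */
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
            struct netmap_ring *txr = NETMAP_TXRING(nifp, 0); /* step 4 */
            /* fill txr->slot[], then advance txr->head and txr->cur */
            ioctl(fd, NIOCTXSYNC, NULL);                      /* step 5 */
            /* step 6: poll()/select() on fd instead of busy waiting */
            return 0;
    }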
/*
* OS-specific code that is used only within this file.
* Other OS-specific code that must be accessed by drivers
@@ -218,6 +435,10 @@ int netmap_txsync_retry = 2;
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
&netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
+int netmap_adaptive_io = 0;
+SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
+ &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
+
int netmap_flags = 0; /* debug flags */
int netmap_fwd = 0; /* force transparent mode */
int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
@@ -259,7 +480,7 @@ nm_kr_get(struct netmap_kring *kr)
* mark the ring as stopped, and run through the locks
* to make sure other users get to see it.
*/
-void
+static void
netmap_disable_ring(struct netmap_kring *kr)
{
kr->nkr_stopped = 1;
@@ -269,41 +490,59 @@ netmap_disable_ring(struct netmap_kring *kr)
nm_kr_put(kr);
}
+/* stop or enable a single tx ring */
+void
+netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
+{
+ if (stopped)
+ netmap_disable_ring(na->tx_rings + ring_id);
+ else
+ na->tx_rings[ring_id].nkr_stopped = 0;
+ /* notify that the stopped state has changed. This is currently
+ * only used by bwrap to propagate the state to its own krings
+ * (see netmap_bwrap_intr_notify).
+ */
+ na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
+}
+
+/* stop or enable a single rx ring */
+void
+netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
+{
+ if (stopped)
+ netmap_disable_ring(na->rx_rings + ring_id);
+ else
+ na->rx_rings[ring_id].nkr_stopped = 0;
+ /* notify that the stopped state has changed. This is currently
+ * only used by bwrap to propagate the state to its own krings
+ * (see netmap_bwrap_intr_notify).
+ */
+ na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
+}
+
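A sketch of the intended driver-side use of these two helpers, e.g. bracketing a per-ring reconfiguration (the reconfiguration step itself is hypothetical):

    netmap_set_txring(na, ring_id, 1 /* stopped */);
    /* ... the NIC descriptor ring can now be rewritten safely ... */
    netmap_set_txring(na, ring_id, 0 /* restarted */);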
/* stop or enable all the rings of na */
-static void
-netmap_set_all_rings(struct ifnet *ifp, int stopped)
+void
+netmap_set_all_rings(struct netmap_adapter *na, int stopped)
{
- struct netmap_adapter *na;
int i;
u_int ntx, nrx;
- if (!(ifp->if_capenable & IFCAP_NETMAP))
+ if (!nm_netmap_on(na))
return;
- na = NA(ifp);
-
ntx = netmap_real_tx_rings(na);
nrx = netmap_real_rx_rings(na);
for (i = 0; i < ntx; i++) {
- if (stopped)
- netmap_disable_ring(na->tx_rings + i);
- else
- na->tx_rings[i].nkr_stopped = 0;
- na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
+ netmap_set_txring(na, i, stopped);
}
for (i = 0; i < nrx; i++) {
- if (stopped)
- netmap_disable_ring(na->rx_rings + i);
- else
- na->rx_rings[i].nkr_stopped = 0;
- na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
+ netmap_set_rxring(na, i, stopped);
}
}
-
/*
* Convenience function used in drivers. Waits for current txsync()s/rxsync()s
* to finish and prevents any new one from starting. Call this before turning
@@ -314,10 +553,9 @@ netmap_set_all_rings(struct ifnet *ifp, int stopped)
void
netmap_disable_all_rings(struct ifnet *ifp)
{
- netmap_set_all_rings(ifp, 1 /* stopped */);
+ netmap_set_all_rings(NA(ifp), 1 /* stopped */);
}
-
/*
* Convenience function used in drivers. Re-enables rxsync and txsync on the
* adapter's rings In linux drivers, this should be placed near each
@@ -326,7 +564,7 @@ netmap_disable_all_rings(struct ifnet *ifp)
void
netmap_enable_all_rings(struct ifnet *ifp)
{
- netmap_set_all_rings(ifp, 0 /* enabled */);
+ netmap_set_all_rings(NA(ifp), 0 /* enabled */);
}
@@ -410,7 +648,6 @@ nm_dump_buf(char *p, int len, int lim, char *dst)
int
netmap_update_config(struct netmap_adapter *na)
{
- struct ifnet *ifp = na->ifp;
u_int txr, txd, rxr, rxd;
txr = txd = rxr = rxd = 0;
@@ -429,11 +666,11 @@ netmap_update_config(struct netmap_adapter *na)
return 0; /* nothing changed */
if (netmap_verbose || na->active_fds > 0) {
D("stored config %s: txring %d x %d, rxring %d x %d",
- NM_IFPNAME(ifp),
+ na->name,
na->num_tx_rings, na->num_tx_desc,
na->num_rx_rings, na->num_rx_desc);
D("new config %s: txring %d x %d, rxring %d x %d",
- NM_IFPNAME(ifp), txr, txd, rxr, rxd);
+ na->name, txr, txd, rxr, rxd);
}
if (na->active_fds == 0) {
D("configuration changed (but fine)");
@@ -447,20 +684,6 @@ netmap_update_config(struct netmap_adapter *na)
return 1;
}
-static int
-netmap_txsync_compat(struct netmap_kring *kring, int flags)
-{
- struct netmap_adapter *na = kring->na;
- return na->nm_txsync(na, kring->ring_id, flags);
-}
-
-static int
-netmap_rxsync_compat(struct netmap_kring *kring, int flags)
-{
- struct netmap_adapter *na = kring->na;
- return na->nm_rxsync(na, kring->ring_id, flags);
-}
-
/* kring->nm_sync callback for the host tx ring */
static int
netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
@@ -538,7 +761,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
kring->ring_id = i;
kring->nkr_num_slots = ndesc;
if (i < na->num_tx_rings) {
- kring->nm_sync = netmap_txsync_compat; // XXX
+ kring->nm_sync = na->nm_txsync;
} else if (i == na->num_tx_rings) {
kring->nm_sync = netmap_txsync_to_host_compat;
}
@@ -547,7 +770,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
*/
kring->rhead = kring->rcur = kring->nr_hwcur = 0;
kring->rtail = kring->nr_hwtail = ndesc - 1;
- snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
+ snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
ND("ktx %s h %d c %d t %d",
kring->name, kring->rhead, kring->rcur, kring->rtail);
mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
@@ -562,13 +785,13 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
kring->ring_id = i;
kring->nkr_num_slots = ndesc;
if (i < na->num_rx_rings) {
- kring->nm_sync = netmap_rxsync_compat; // XXX
+ kring->nm_sync = na->nm_rxsync;
} else if (i == na->num_rx_rings) {
kring->nm_sync = netmap_rxsync_from_host_compat;
}
kring->rhead = kring->rcur = kring->nr_hwcur = 0;
kring->rtail = kring->nr_hwtail = 0;
- snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
+ snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
ND("krx %s h %d c %d t %d",
kring->name, kring->rhead, kring->rcur, kring->rtail);
mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
@@ -624,7 +847,7 @@ netmap_hw_krings_delete(struct netmap_adapter *na)
*/
/* call with NMG_LOCK held */
static struct netmap_if*
-netmap_if_new(const char *ifname, struct netmap_adapter *na)
+netmap_if_new(struct netmap_adapter *na)
{
struct netmap_if *nifp;
@@ -641,7 +864,7 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na)
* the netmap rings themselves
*/
if (na->nm_krings_create(na))
- goto cleanup;
+ return NULL;
/* create all missing netmap rings */
if (netmap_mem_rings_create(na))
@@ -650,7 +873,7 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na)
final:
/* in all cases, create a new netmap if */
- nifp = netmap_mem_if_new(ifname, na);
+ nifp = netmap_mem_if_new(na);
if (nifp == NULL)
goto cleanup;
@@ -689,7 +912,7 @@ netmap_get_memory_locked(struct netmap_priv_d* p)
nmd = p->np_na->nm_mem;
}
if (p->np_mref == NULL) {
- error = netmap_mem_finalize(nmd);
+ error = netmap_mem_finalize(nmd, p->np_na);
if (!error)
p->np_mref = nmd;
} else if (p->np_mref != nmd) {
@@ -728,17 +951,15 @@ static void
netmap_drop_memory_locked(struct netmap_priv_d* p)
{
if (p->np_mref) {
- netmap_mem_deref(p->np_mref);
+ netmap_mem_deref(p->np_mref, p->np_na);
p->np_mref = NULL;
}
}
/*
- * File descriptor's private data destructor.
- *
* Call nm_register(ifp,0) to stop netmap mode on the interface and
- * revert to normal operation. We expect that np_na->ifp has not gone.
+ * revert to normal operation.
* The second argument is the nifp to work on. In some cases it is
* not attached yet to the netmap_priv_d so we need to pass it as
* a separate argument.
@@ -748,14 +969,13 @@ static void
netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
{
struct netmap_adapter *na = priv->np_na;
- struct ifnet *ifp = na->ifp;
NMG_LOCK_ASSERT();
na->active_fds--;
if (na->active_fds <= 0) { /* last instance */
if (netmap_verbose)
- D("deleting last instance for %s", NM_IFPNAME(ifp));
+ D("deleting last instance for %s", na->name);
/*
* (TO CHECK) This function is only called
* when the last reference to this file descriptor goes
@@ -770,8 +990,7 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
* happens if the close() occurs while a concurrent
* syscall is running.
*/
- if (ifp)
- na->nm_register(na, 0); /* off, clear flags */
+ na->nm_register(na, 0); /* off, clear flags */
/* Wake up any sleeping threads. netmap_poll will
* then return POLLERR
* XXX The wake up now must happen during *_down(), when
@@ -922,13 +1141,13 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
if ((slot->flags & NS_FORWARD) == 0 && !force)
continue;
- if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
+ if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
RD(5, "bad pkt at %d len %d", n, slot->len);
continue;
}
slot->flags &= ~NS_FORWARD; // XXX needed ?
/* XXX TODO: adapt to the case of a multisegment packet */
- m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
+ m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
if (m == NULL)
break;
@@ -981,7 +1200,7 @@ netmap_sw_to_nic(struct netmap_adapter *na)
dst->len = tmp.len;
dst->flags = NS_BUF_CHANGED;
- rdst->head = rdst->cur = nm_next(dst_cur, dst_lim);
+ rdst->cur = nm_next(dst_cur, dst_lim);
}
/* if (sent) XXX txsync ? */
}
@@ -1028,6 +1247,11 @@ netmap_txsync_to_host(struct netmap_adapter *na)
* They have been put in kring->rx_queue by netmap_transmit().
* We protect access to the kring using kring->rx_queue.lock
*
+ * This routine also does the selrecord if called from the poll handler
+ * (we know because td != NULL).
+ *
+ * NOTE: on linux, selrecord() is defined as a macro and uses pwait
+ * as an additional hidden argument.
* returns the number of packets delivered to tx queues in
* transparent mode, or a negative value if error
*/
@@ -1059,14 +1283,15 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
int len = MBUF_LEN(m);
struct netmap_slot *slot = &ring->slot[nm_i];
- m_copydata(m, 0, len, BDG_NMB(na, slot));
+ m_copydata(m, 0, len, NMB(na, slot));
ND("nm %d len %d", nm_i, len);
if (netmap_verbose)
- D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
+ D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
slot->len = len;
slot->flags = kring->nkr_slot_flags;
nm_i = nm_next(nm_i, lim);
+ m_freem(m);
}
kring->nr_hwtail = nm_i;
}
@@ -1083,6 +1308,10 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
nm_rxsync_finalize(kring);
+ /* access copies of cur,tail in the kring */
+ if (kring->rcur == kring->rtail && td) /* no bufs available */
+ selrecord(td, &kring->si);
+
mbq_unlock(q);
return ret;
}
@@ -1128,21 +1357,23 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
i = netmap_admode = NETMAP_ADMODE_BEST;
if (NETMAP_CAPABLE(ifp)) {
- /* If an adapter already exists, but is
- * attached to a vale port, we report that the
- * port is busy.
- */
- if (NETMAP_OWNED_BY_KERN(NA(ifp)))
- return EBUSY;
-
+ prev_na = NA(ifp);
/* If an adapter already exists, return it if
* there are active file descriptors or if
* netmap is not forced to use generic
* adapters.
*/
- if (NA(ifp)->active_fds > 0 ||
- i != NETMAP_ADMODE_GENERIC) {
- *na = NA(ifp);
+ if (NETMAP_OWNED_BY_ANY(prev_na)
+ || i != NETMAP_ADMODE_GENERIC
+ || prev_na->na_flags & NAF_FORCE_NATIVE
+#ifdef WITH_PIPES
+ /* ugly, but we cannot allow an adapter switch
+ * if some pipe is referring to this one
+ */
+ || prev_na->na_next_pipe > 0
+#endif
+ ) {
+ *na = prev_na;
return 0;
}
}
@@ -1212,13 +1443,30 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
*na = NULL; /* default return value */
- /* first try to see if this is a bridge port. */
NMG_LOCK_ASSERT();
+ /* we cascade through all possible types of netmap adapter.
+ * All netmap_get_*_na() functions return an error and an na,
+ * with the following combinations:
+ *
+ * error na
+ * 0 NULL type doesn't match
+ * !0 NULL type matches, but na creation/lookup failed
+ * 0 !NULL type matches and na created/found
+ * !0 !NULL impossible
+ */
+
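For reference, a hypothetical getter that obeys this convention; every name below is illustrative, not part of the source:

    static int
    netmap_get_foo_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
    {
            if (!nm_is_foo_port(nmr->nr_name))   /* hypothetical match test */
                    return 0;            /* type doesn't match, *na stays NULL */
            *na = nm_foo_lookup_or_create(nmr, create);  /* hypothetical */
            return (*na == NULL) ? ENXIO : 0;    /* type matches; maybe failed */
    }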
+ /* try to see if this is a monitor port */
+ error = netmap_get_monitor_na(nmr, na, create);
+ if (error || *na != NULL)
+ return error;
+
+ /* try to see if this is a pipe port */
error = netmap_get_pipe_na(nmr, na, create);
if (error || *na != NULL)
return error;
+ /* try to see if this is a bridge port */
error = netmap_get_bdg_na(nmr, na, create);
if (error)
return error;
@@ -1241,11 +1489,6 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
if (error)
goto out;
- /* Users cannot use the NIC attached to a bridge directly */
- if (NETMAP_OWNED_BY_KERN(ret)) {
- error = EBUSY;
- goto out;
- }
*na = ret;
netmap_adapter_get(ret);
@@ -1444,7 +1687,7 @@ netmap_ring_reinit(struct netmap_kring *kring)
int errors = 0;
// XXX KASSERT nm_kr_tryget
- RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
+ RD(10, "called for %s", kring->name);
// XXX probably wrong to trust userspace
kring->rhead = ring->head;
kring->rcur = ring->cur;
@@ -1463,7 +1706,7 @@ netmap_ring_reinit(struct netmap_kring *kring)
RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
ring->slot[i].buf_idx = 0;
ring->slot[i].len = 0;
- } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
+ } else if (len > NETMAP_BUF_SIZE(kring->na)) {
ring->slot[i].len = 0;
RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
}
@@ -1481,13 +1724,15 @@ netmap_ring_reinit(struct netmap_kring *kring)
return (errors ? 1 : 0);
}
-
-/*
- * Set the ring ID. For devices with a single queue, a request
- * for all rings is the same as a single ring.
+/* interpret the ringid and flags fields of an nmreq, by translating them
+ * into a pair of intervals of ring indices:
+ *
+ * [priv->np_txqfirst, priv->np_txqlast) and
+ * [priv->np_rxqfirst, priv->np_rxqlast)
+ *
*/
-static int
-netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
+int
+netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
{
struct netmap_adapter *na = priv->np_na;
u_int j, i = ringid & NETMAP_RING_MASK;
@@ -1551,15 +1796,11 @@ netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
D("invalid regif type %d", reg);
return EINVAL;
}
- priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
priv->np_flags = (flags & ~NR_REG_MASK) | reg;
- if (nm_tx_si_user(priv))
- na->tx_si_users++;
- if (nm_rx_si_user(priv))
- na->rx_si_users++;
+
if (netmap_verbose) {
D("%s: tx [%d,%d) rx [%d,%d) id %d",
- NM_IFPNAME(na->ifp),
+ na->name,
priv->np_txqfirst,
priv->np_txqlast,
priv->np_rxqfirst,
@@ -1569,16 +1810,113 @@ netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
return 0;
}
+
+/*
+ * Set the ring ID. For devices with a single queue, a request
+ * for all rings is the same as a single ring.
+ */
+static int
+netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
+{
+ struct netmap_adapter *na = priv->np_na;
+ int error;
+
+ error = netmap_interp_ringid(priv, ringid, flags);
+ if (error) {
+ return error;
+ }
+
+ priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
+
+ /* optimization: count the users registered for more than
+ * one ring, which are the ones sleeping on the global queue.
+ * The default netmap_notify() callback will then
+ * avoid signaling the global queue if nobody is using it
+ */
+ if (nm_tx_si_user(priv))
+ na->tx_si_users++;
+ if (nm_rx_si_user(priv))
+ na->rx_si_users++;
+ return 0;
+}
+
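Worked examples of the translation, assuming an adapter with 4 hardware tx/rx ring pairs plus host rings (flag names as in net/netmap.h; a sketch of the interval semantics described above, not output of the code):

    NR_REG_ALL_NIC          -> tx [0,4)  rx [0,4)
    NR_REG_ONE_NIC, ring 2  -> tx [2,3)  rx [2,3)
    NR_REG_SW               -> tx [4,5)  rx [4,5)   (host rings only)
    NR_REG_NIC_SW           -> tx [0,5)  rx [0,5)   (hw rings + host rings)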
/*
* possibly move the interface to netmap-mode.
* If success it returns a pointer to netmap_if, otherwise NULL.
* This must be called with NMG_LOCK held.
+ *
+ * The following na callbacks are called in the process:
+ *
+ * na->nm_config() [by netmap_update_config]
+ * (get current number and size of rings)
+ *
+ * We have a generic one for linux (netmap_linux_config).
+ * The bwrap has to override this, since it has to forward
+ * the request to the wrapped adapter (netmap_bwrap_config).
+ *
+ * XXX netmap_if_new calls this again (2014-03-15)
+ *
+ * na->nm_krings_create() [by netmap_if_new]
+ * (create and init the krings array)
+ *
+ * One of the following:
+ *
+ * * netmap_hw_krings_create, (hw ports)
+ * creates the standard layout for the krings
+ * and adds the mbq (used for the host rings).
+ *
+ * * netmap_vp_krings_create (VALE ports)
+ * add leases and scratchpads
+ *
+ * * netmap_pipe_krings_create (pipes)
+ * create the krings and rings of both ends and
+ * cross-link them
+ *
+ * * netmap_monitor_krings_create (monitors)
+ * avoid allocating the mbq
+ *
+ * * netmap_bwrap_krings_create (bwraps)
+ * create both the bwrap krings array,
+ * the krings array of the wrapped adapter, and
+ * (if needed) the fake array for the host adapter
+ *
+ * na->nm_register(, 1)
+ * (put the adapter in netmap mode)
+ *
+ * This may be one of the following:
+ * (XXX these should be either all *_register or all *_reg 2014-03-15)
+ *
+ * * netmap_hw_register (hw ports)
+ * checks that the ifp is still there, then calls
+ * the hardware specific callback;
+ *
+ * * netmap_vp_reg (VALE ports)
+ * If the port is connected to a bridge,
+ * set the NAF_NETMAP_ON flag under the
+ * bridge write lock.
+ *
+ * * netmap_pipe_reg (pipes)
+ * inform the other pipe end that it is no
+ * longer responsible for the lifetime of this
+ * pipe end
+ *
+ * * netmap_monitor_reg (monitors)
+ * intercept the sync callbacks of the monitored
+ * rings
+ *
+ * * netmap_bwrap_register (bwraps)
+ * cross-link the bwrap and hwna rings,
+ * forward the request to the hwna, override
+ * the hwna notify callback (to make frames
+ * coming from the outside go through the bridge).
+ *
+ * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
+ *
*/
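Condensed into straight-line code, the sequence described above is roughly the following (a sketch; locking, memory-allocator setup and error handling omitted):

    nifp = netmap_if_new(na);        /* nm_config(), nm_krings_create(),
                                      * then the netmap rings and the nifp */
    na->active_fds++;
    if (!nm_netmap_on(na)) {
            na->na_lut = netmap_mem_get_lut(na->nm_mem); /* cache allocator info */
            error = na->nm_register(na, 1);              /* mode on */
    }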
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
uint16_t ringid, uint32_t flags, int *err)
{
- struct ifnet *ifp = na->ifp;
struct netmap_if *nifp = NULL;
int error, need_mem = 0;
@@ -1597,24 +1935,22 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
if (error)
goto out;
}
- nifp = netmap_if_new(NM_IFPNAME(ifp), na);
-
/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
+ nifp = netmap_if_new(na);
if (nifp == NULL) { /* allocation failed */
error = ENOMEM;
goto out;
}
na->active_fds++;
- if (ifp->if_capenable & IFCAP_NETMAP) {
- /* was already set */
- } else {
- /* Otherwise set the card in netmap mode
+ if (!nm_netmap_on(na)) {
+ /* Netmap not active, set the card in netmap mode
* and make it use the shared buffers.
*/
/* cache the allocator info in the na */
- na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
+ na->na_lut = netmap_mem_get_lut(na->nm_mem);
ND("%p->na_lut == %p", na, na->na_lut);
- na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
+ na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
+ na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
error = na->nm_register(na, 1); /* mode on */
if (error) {
netmap_do_unregif(priv, nifp);
@@ -1624,12 +1960,12 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
out:
*err = error;
if (error) {
- priv->np_na = NULL;
/* we should drop the allocator, but only
* if we were the ones who grabbed it
*/
if (need_mem)
netmap_drop_memory_locked(priv);
+ priv->np_na = NULL;
}
if (nifp != NULL) {
/*
@@ -1662,7 +1998,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
int fflag, struct thread *td)
{
struct netmap_priv_d *priv = NULL;
- struct ifnet *ifp = NULL;
struct nmreq *nmr = (struct nmreq *) data;
struct netmap_adapter *na = NULL;
int error;
@@ -1740,7 +2075,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
/* possibly attach/detach NIC and VALE switch */
i = nmr->nr_cmd;
if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
- || i == NETMAP_BDG_VNET_HDR) {
+ || i == NETMAP_BDG_VNET_HDR
+ || i == NETMAP_BDG_NEWIF
+ || i == NETMAP_BDG_DELIF) {
error = netmap_bdg_ctl(nmr, NULL);
break;
} else if (i != 0) {
@@ -1762,7 +2099,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
if (error)
break;
- ifp = na->ifp;
if (NETMAP_OWNED_BY_KERN(na)) {
netmap_adapter_put(na);
error = EBUSY;
@@ -1824,9 +2160,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
break;
}
- ifp = na->ifp;
- if (ifp == NULL) {
- RD(1, "the ifp is gone");
+ if (!nm_netmap_on(na)) {
error = ENXIO;
break;
}
@@ -1870,6 +2204,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
break;
+ case NIOCCONFIG:
+ error = netmap_bdg_config(nmr);
+ break;
#ifdef __FreeBSD__
case FIONBIO:
case FIOASYNC:
@@ -1886,6 +2223,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
default: /* allow device-specific ioctls */
{
struct socket so;
+ struct ifnet *ifp;
bzero(&so, sizeof(so));
NMG_LOCK();
@@ -1935,7 +2273,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
{
struct netmap_priv_d *priv = NULL;
struct netmap_adapter *na;
- struct ifnet *ifp;
struct netmap_kring *kring;
u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
struct mbq q; /* packets from hw queues to host stack */
@@ -1974,18 +2311,12 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
rmb(); /* make sure following reads are not from cache */
na = priv->np_na;
- ifp = na->ifp;
- // check for deleted
- if (ifp == NULL) {
- RD(1, "the ifp is gone");
- return POLLERR;
- }
- if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
+ if (!nm_netmap_on(na))
return POLLERR;
if (netmap_verbose & 0x8000)
- D("device %s events 0x%x", NM_IFPNAME(ifp), events);
+ D("device %s events 0x%x", na->name, events);
want_tx = events & (POLLOUT | POLLWRNORM);
want_rx = events & (POLLIN | POLLRDNORM);
@@ -2056,7 +2387,6 @@ flush_tx:
* be better. In current code, however, we only
* stop the rings for brief intervals (2014-03-14)
*/
-
if (netmap_verbose)
RD(2, "%p lost race on txring %d, ok",
priv, i);
@@ -2115,6 +2445,8 @@ do_retry_rx:
/*
* transparent mode support: collect packets
* from the rxring(s).
+ * XXX NR_FORWARD should only be read on
+ * physical or NIC ports
*/
if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
ND(10, "forwarding some buffers up %d to %d",
@@ -2141,12 +2473,13 @@ do_retry_rx:
/* transparent mode XXX only during first pass ? */
if (na->na_flags & NAF_HOST_RINGS) {
kring = &na->rx_rings[na->num_rx_rings];
- if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
- send_down = netmap_rxsync_from_host(na, td, dev);
- if (send_down && (netmap_no_timestamp == 0 ||
- kring->ring->flags & NR_TIMESTAMP)) {
- microtime(&kring->ring->ts);
- }
+ if (check_all_rx
+ && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
+ /* XXX fix to use kring fields */
+ if (nm_ring_empty(kring->ring))
+ send_down = netmap_rxsync_from_host(na, td, dev);
+ if (!nm_ring_empty(kring->ring))
+ revents |= want_rx;
}
}
@@ -2174,7 +2507,7 @@ do_retry_rx:
* rings to a single file descriptor.
*/
- if (q.head)
+ if (q.head && na->ifp != NULL)
netmap_send_up(na->ifp, &q);
return (revents);
@@ -2224,19 +2557,27 @@ netmap_attach_common(struct netmap_adapter *na)
if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
D("%s: invalid rings tx %d rx %d",
- ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
+ na->name, na->num_tx_rings, na->num_rx_rings);
return EINVAL;
}
- WNA(ifp) = na;
+ /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
+ * pipes, monitors). For bwrap we actually have a non-null ifp for
+ * use by the external modules, but that is set after this
+ * function has been called.
+ * XXX this is ugly, maybe split this function in two (2014-03-14)
+ */
+ if (ifp != NULL) {
+ WNA(ifp) = na;
/* the following is only needed for na that use the host port.
* XXX do we have something similar for linux ?
*/
#ifdef __FreeBSD__
- na->if_input = ifp->if_input; /* for netmap_send_up */
+ na->if_input = ifp->if_input; /* for netmap_send_up */
#endif /* __FreeBSD__ */
- NETMAP_SET_CAPABLE(ifp);
+ NETMAP_SET_CAPABLE(ifp);
+ }
if (na->nm_krings_create == NULL) {
/* we assume that we have been called by a driver,
* since other port types all provide their own
@@ -2250,7 +2591,13 @@ netmap_attach_common(struct netmap_adapter *na)
na->active_fds = 0;
if (na->nm_mem == NULL)
+ /* use the global allocator */
na->nm_mem = &nm_mem;
+ if (na->nm_bdg_attach == NULL)
+ /* no special nm_bdg_attach callback. On VALE
+ * attach, we need to interpose a bwrap
+ */
+ na->nm_bdg_attach = netmap_bwrap_attach;
return 0;
}
@@ -2273,6 +2620,28 @@ netmap_detach_common(struct netmap_adapter *na)
free(na, M_DEVBUF);
}
+/* Wrapper for the register callback provided by hardware drivers.
+ * na->ifp == NULL means that the driver module has been
+ * unloaded, so we cannot call into it.
+ * Note that module unloading, in our patched linux drivers,
+ * happens under NMG_LOCK and after having stopped all the
+ * nic rings (see netmap_detach). This provides sufficient
+ * protection for the other driver-provided callbacks
+ * (i.e., nm_config and nm_*xsync), which therefore don't need
+ * to be wrapped.
+ */
+static int
+netmap_hw_register(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_hw_adapter *hwna =
+ (struct netmap_hw_adapter*)na;
+
+ if (na->ifp == NULL)
+ return onoff ? ENXIO : 0;
+
+ return hwna->nm_hw_register(na, onoff);
+}
+
/*
* Initialize a ``netmap_adapter`` object created by driver on attach.
@@ -2298,6 +2667,9 @@ netmap_attach(struct netmap_adapter *arg)
goto fail;
hwna->up = *arg;
hwna->up.na_flags |= NAF_HOST_RINGS;
+ strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
+ hwna->nm_hw_register = hwna->up.nm_register;
+ hwna->up.nm_register = netmap_hw_register;
if (netmap_attach_common(&hwna->up)) {
free(hwna, M_DEVBUF);
goto fail;
@@ -2314,10 +2686,20 @@ netmap_attach(struct netmap_adapter *arg)
#endif
}
hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
+ if (ifp->ethtool_ops) {
+ hwna->nm_eto = *ifp->ethtool_ops;
+ }
+ hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
+#ifdef ETHTOOL_SCHANNELS
+ hwna->nm_eto.set_channels = linux_netmap_set_channels;
+#endif
+ if (arg->nm_config == NULL) {
+ hwna->up.nm_config = netmap_linux_config;
+ }
#endif /* linux */
D("success for %s tx %d/%d rx %d/%d queues/slots",
- NM_IFPNAME(ifp),
+ hwna->up.name,
hwna->up.num_tx_rings, hwna->up.num_tx_desc,
hwna->up.num_rx_rings, hwna->up.num_rx_desc
);
@@ -2393,6 +2775,8 @@ netmap_detach(struct ifnet *ifp)
* tell them that the interface is gone
*/
na->ifp = NULL;
+ // XXX also clear NAF_NATIVE_ON ?
+ na->na_flags &= ~NAF_NETMAP_ON;
/* give them a chance to notice */
netmap_enable_all_rings(ifp);
}
@@ -2426,8 +2810,8 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m)
// if we follow the down/configure/up protocol -gl
// mtx_lock(&na->core_lock);
- if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
- D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
+ if (!nm_netmap_on(na)) {
+ D("%s not in netmap mode anymore", na->name);
error = ENXIO;
goto done;
}
@@ -2436,9 +2820,9 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m)
q = &kring->rx_queue;
// XXX reconsider long packets if we handle fragments
- if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
- D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
- len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
+ if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
+ D("%s from_host, drop packet size %d > %d", na->name,
+ len, NETMAP_BUF_SIZE(na));
goto done;
}
@@ -2454,12 +2838,12 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m)
space += kring->nkr_num_slots;
if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
- NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
+ na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
len, m);
} else {
mbq_enqueue(q, m);
ND(10, "%s %d bufs in queue len %d m %p",
- NM_IFPNAME(ifp), mbq_len(q), len, m);
+ na->name, mbq_len(q), len, m);
/* notify outside the lock */
m = NULL;
error = 0;
@@ -2492,12 +2876,8 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
struct netmap_kring *kring;
int new_hwofs, lim;
- if (na == NULL) {
- D("NULL na, should not happen");
- return NULL; /* no netmap support here */
- }
- if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
- ND("interface not in netmap mode");
+ if (!nm_native_on(na)) {
+ ND("interface not in native netmap mode");
return NULL; /* nothing to reinitialize */
}
@@ -2528,7 +2908,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
/* Always set the new offset value and realign the ring. */
if (netmap_verbose)
D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
- NM_IFPNAME(na->ifp),
+ na->name,
tx == NR_TX ? "TX" : "RX", n,
kring->nkr_hwofs, new_hwofs,
kring->nr_hwtail,
@@ -2570,8 +2950,9 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
* The 'notify' routine depends on what the ring is attached to.
* - for a netmap file descriptor, do a selwakeup on the individual
* waitqueue, plus one on the global one if needed
- * - for a switch, call the proper forwarding routine
- * - XXX more ?
+ * (see netmap_notify)
+ * - for a nic connected to a switch, call the proper forwarding routine
+ * (see netmap_bwrap_intr_notify)
*/
void
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
@@ -2620,11 +3001,18 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
- // XXX could we check NAF_NATIVE_ON ?
- if (!(ifp->if_capenable & IFCAP_NETMAP))
+ struct netmap_adapter *na = NA(ifp);
+
+ /*
+ * XXX emulated netmap mode sets NAF_SKIP_INTR so
+ * we still use the regular driver even though the previous
+ * check fails. It is unclear whether we should use
+ * nm_native_on() here.
+ */
+ if (!nm_netmap_on(na))
return 0;
- if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
+ if (na->na_flags & NAF_SKIP_INTR) {
ND("use regular interrupt");
return 0;
}
@@ -2677,6 +3065,9 @@ netmap_init(void)
goto fail;
netmap_init_bridges();
+#ifdef __FreeBSD__
+ nm_vi_init_index();
+#endif
printf("netmap: loaded module\n");
return (0);
fail:
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index e43d669..160b7c0 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -50,6 +50,9 @@
#include <sys/selinfo.h>
#include <net/if.h>
#include <net/if_var.h>
+#include <net/if_types.h> /* IFT_ETHER */
+#include <net/ethernet.h> /* ether_ifdetach */
+#include <net/if_dl.h> /* LLADDR */
#include <machine/bus.h> /* bus_dmamap_* */
#include <netinet/in.h> /* in6_cksum_pseudo() */
#include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */
@@ -91,8 +94,7 @@ nm_csum_fold(rawsum_t cur_sum)
return htobe16((~cur_sum) & 0xFFFF);
}
-uint16_t
-nm_csum_ipv4(struct nm_iphdr *iph)
+uint16_t nm_csum_ipv4(struct nm_iphdr *iph)
{
#if 0
return in_cksum_hdr((void *)iph);
@@ -148,8 +150,7 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
int
netmap_catch_rx(struct netmap_adapter *na, int intercept)
{
- struct netmap_generic_adapter *gna =
- (struct netmap_generic_adapter *)na;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
struct ifnet *ifp = na->ifp;
if (intercept) {
@@ -221,9 +222,9 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
* (and eventually, just reference the netmap buffer)
*/
- if (*m->m_ext.ref_cnt != 1) {
+ if (GET_MBUF_REFCNT(m) != 1) {
D("invalid refcnt %d for %p",
- *m->m_ext.ref_cnt, m);
+ GET_MBUF_REFCNT(m), m);
panic("in generic_xmit_frame");
}
// XXX the ext_size check is unnecessary if we link the netmap buf
@@ -231,14 +232,14 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
RD(5, "size %d < len %d", m->m_ext.ext_size, len);
len = m->m_ext.ext_size;
}
- if (1) { /* XXX seems to have negligible benefits */
+ if (0) { /* XXX seems to have negligible benefits */
m->m_ext.ext_buf = m->m_data = addr;
} else {
bcopy(addr, m->m_data, len);
}
m->m_len = m->m_pkthdr.len = len;
// inc refcount. All ours, we could skip the atomic
- atomic_fetchadd_int(m->m_ext.ref_cnt, 1);
+ atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1);
m->m_flags |= M_FLOWID;
m->m_pkthdr.flowid = ring_nr;
m->m_pkthdr.rcvif = ifp; /* used for tx notification */
@@ -277,10 +278,11 @@ generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
void
-netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na)
+netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na)
{
ND("called");
mit->mit_pending = 0;
+ mit->mit_ring_idx = idx;
mit->mit_na = na;
}
@@ -313,6 +315,135 @@ netmap_mitigation_cleanup(struct nm_generic_mit *mit)
ND("called");
}
+static int
+nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+ return EINVAL;
+}
+
+static void
+nm_vi_start(struct ifnet *ifp)
+{
+ panic("nm_vi_start() must not be called");
+}
+
+/*
+ * Index manager of persistent virtual interfaces.
+ * It is used to decide the lowest byte of the MAC address.
+ * We use the same algorithm with management of bridge port index.
+ */
+#define NM_VI_MAX 255
+static struct {
+ uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */
+ uint8_t active;
+ struct mtx lock;
+} nm_vi_indices;
+
+void
+nm_vi_init_index(void)
+{
+ int i;
+ for (i = 0; i < NM_VI_MAX; i++)
+ nm_vi_indices.index[i] = i;
+ nm_vi_indices.active = 0;
+ mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF);
+}
+
+/* return -1 if no index available */
+static int
+nm_vi_get_index(void)
+{
+ int ret;
+
+ mtx_lock(&nm_vi_indices.lock);
+ ret = nm_vi_indices.active == NM_VI_MAX ? -1 :
+ nm_vi_indices.index[nm_vi_indices.active++];
+ mtx_unlock(&nm_vi_indices.lock);
+ return ret;
+}
+
+static void
+nm_vi_free_index(uint8_t val)
+{
+ int i, lim;
+
+ mtx_lock(&nm_vi_indices.lock);
+ lim = nm_vi_indices.active;
+ for (i = 0; i < lim; i++) {
+ if (nm_vi_indices.index[i] == val) {
+ /* swap index[lim-1] and index[i] */
+ int tmp = nm_vi_indices.index[lim-1];
+ nm_vi_indices.index[lim-1] = val;
+ nm_vi_indices.index[i] = tmp;
+ nm_vi_indices.active--;
+ break;
+ }
+ }
+ if (lim == nm_vi_indices.active)
+ D("funny, index %u not found", val);
+ mtx_unlock(&nm_vi_indices.lock);
+}
+#undef NM_VI_MAX
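A worked example of the recycling scheme above, pretending NM_VI_MAX were 4:

    init:      index = {0,1,2,3}, active = 0
    get x3:    returns 0, 1, 2;   active = 3
    free(1):   index = {0,2,1,3}, active = 2
               (the freed value is swapped just past the active region,
                so the next get hands it out again)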
+
+/*
+ * Implementation of a netmap-capable virtual interface that
+ * registered to the system.
+ * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9.
+ *
+ * Note: Linux sets refcount to 0 on allocation of net_device,
+ * then increments it on registration to the system.
+ * FreeBSD sets refcount to 1 on if_alloc(), and does not
+ * increment this refcount on if_attach().
+ */
+int
+nm_vi_persist(const char *name, struct ifnet **ret)
+{
+ struct ifnet *ifp;
+ u_short macaddr_hi;
+ uint32_t macaddr_mid;
+ u_char eaddr[6];
+ int unit = nm_vi_get_index(); /* just to decide MAC address */
+
+ if (unit < 0)
+ return EBUSY;
+ /*
+ * We use the same MAC address generation method as tap,
+ * except that the highest octet is 00:be instead of 00:bd.
+ */
+ macaddr_hi = htons(0x00be); /* XXX tap + 1 */
+ macaddr_mid = (uint32_t) ticks;
+ bcopy(&macaddr_hi, eaddr, sizeof(short));
+ bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t));
+ eaddr[5] = (uint8_t)unit;
+
+ ifp = if_alloc(IFT_ETHER);
+ if (ifp == NULL) {
+ D("if_alloc failed");
+ return ENOMEM;
+ }
+ if_initname(ifp, name, IF_DUNIT_NONE);
+ ifp->if_mtu = 65536;
+ ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_init = (void *)nm_vi_dummy;
+ ifp->if_ioctl = nm_vi_dummy;
+ ifp->if_start = nm_vi_start;
+ ifp->if_mtu = ETHERMTU;
+ IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
+ ifp->if_capabilities |= IFCAP_LINKSTATE;
+ ifp->if_capenable |= IFCAP_LINKSTATE;
+
+ ether_ifattach(ifp, eaddr);
+ *ret = ifp;
+ return 0;
+}
+/* unregister from the system and drop the final refcount */
+void
+nm_vi_detach(struct ifnet *ifp)
+{
+ nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]);
+ ether_ifdetach(ifp);
+ if_free(ifp);
+}
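Pairing the two helpers, a caller in the VALE control path might proceed along these lines (a sketch; the port name and intermediate steps are illustrative):

    struct ifnet *ifp;
    int error = nm_vi_persist("vi0", &ifp);
    if (error)
            return error;
    /* ... attach a netmap adapter to ifp and expose the port ... */
    nm_vi_detach(ifp);   /* on destroy; also releases the MAC index */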
/*
* In order to track whether pages are still mapped, we hook into
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index a14dbc2..7740382 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -102,51 +102,42 @@ __FBSDID("$FreeBSD$");
* mbuf wrappers
*/
-/*
- * mbuf destructor, also need to change the type to EXT_EXTREF,
+/* mbuf destructor, also need to change the type to EXT_EXTREF,
* add an M_NOFREE flag, and then clear the flag and
* chain into uma_zfree(zone_pack, mf)
* (or reinstall the buffer ?)
- *
- * On FreeBSD 9 the destructor is called as ext_free(ext_arg1, ext_arg2)
- * whereas newer version have ext_free(m, ext_arg1, ext_arg2)
- * For compatibility we set ext_arg1 = m on allocation so we have
- * the same code on both.
*/
#define SET_MBUF_DESTRUCTOR(m, fn) do { \
- (m)->m_ext.ext_free = (void *)fn; \
- (m)->m_ext.ext_type = EXT_EXTREF; \
- } while (0)
+ (m)->m_ext.ext_free = (void *)fn; \
+ (m)->m_ext.ext_type = EXT_EXTREF; \
+} while (0)
-static void
+static void
netmap_default_mbuf_destructor(struct mbuf *m)
-{
- /* restore original data pointer and type */
- m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg2;
+{
+ /* restore original mbuf */
+ m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1;
+ m->m_ext.ext_arg1 = NULL;
m->m_ext.ext_type = EXT_PACKET;
m->m_ext.ext_free = NULL;
- m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL;
- if (*(m->m_ext.ref_cnt) == 0)
- *(m->m_ext.ref_cnt) = 1;
+ if (GET_MBUF_REFCNT(m) == 0)
+ SET_MBUF_REFCNT(m, 1);
uma_zfree(zone_pack, m);
-}
+}
-static inline struct mbuf *
-netmap_get_mbuf(int len)
-{
+static inline struct mbuf *
+netmap_get_mbuf(int len)
+{
struct mbuf *m;
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE);
if (m) {
- m->m_ext.ext_arg1 = m; /* FreeBSD 9 compat */
- m->m_ext.ext_arg2 = m->m_ext.ext_buf; /* save original */
+ m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save
m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor;
m->m_ext.ext_type = EXT_EXTREF;
- ND(5, "create m %p refcnt %d", m, *m->m_ext.ref_cnt);
+ ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m));
}
return m;
-}
-
-#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1)
+}
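The resulting lifecycle of a tx-pool mbuf, sketched below: the point of the EXT_EXTREF/M_NOFREE dance is that a driver-side free recycles the mbuf instead of destroying it:

    m = netmap_get_mbuf(len); /* EXT_EXTREF; original cluster saved in ext_arg1 */
    /* ... txsync hands m to the driver, which later frees it ... */
    m_freem(m);               /* invokes netmap_default_mbuf_destructor(),
                               * which restores EXT_PACKET and a refcount
                               * of 1, then uma_zfree(zone_pack, m) */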
@@ -158,8 +149,6 @@ netmap_get_mbuf(int len)
#include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */
#include <linux/hrtimer.h>
-//#define RATE /* Enables communication statistics. */
-
//#define REG_RESET
#endif /* linux */
@@ -174,7 +163,7 @@ netmap_get_mbuf(int len)
/* ======================== usage stats =========================== */
-#ifdef RATE
+#ifdef RATE_GENERIC
#define IFRATE(x) x
struct rate_stats {
unsigned long txpkt;
@@ -218,23 +207,33 @@ static void rate_callback(unsigned long arg)
static struct rate_context rate_ctx;
+void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi)
+{
+ if (txp) rate_ctx.new.txpkt++;
+ if (txs) rate_ctx.new.txsync++;
+ if (txi) rate_ctx.new.txirq++;
+ if (rxp) rate_ctx.new.rxpkt++;
+ if (rxs) rate_ctx.new.rxsync++;
+ if (rxi) rate_ctx.new.rxirq++;
+}
+
#else /* !RATE */
#define IFRATE(x)
#endif /* !RATE */
/* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */
-#define GENERIC_BUF_SIZE netmap_buf_size /* Size of the mbufs in the Tx pool. */
/*
* Wrapper used by the generic adapter layer to notify
* the poller threads. Differently from netmap_rx_irq(), we check
- * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq.
+ * only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq.
*/
static void
netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
- if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP)))
+ struct netmap_adapter *na = NA(ifp);
+ if (unlikely(!nm_netmap_on(na)))
return;
netmap_common_irq(ifp, q, work_done);
@@ -245,7 +244,6 @@ netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
static int
generic_netmap_register(struct netmap_adapter *na, int enable)
{
- struct ifnet *ifp = na->ifp;
struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
struct mbuf *m;
int error;
@@ -271,7 +269,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
goto out;
}
for (r=0; r<na->num_rx_rings; r++)
- netmap_mitigation_init(&gna->mit[r], na);
+ netmap_mitigation_init(&gna->mit[r], r, na);
/* Initialize the rx queue, as generic_rx_handler() can
* be called as soon as netmap_catch_rx() returns.
@@ -296,7 +294,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
for (i=0; i<na->num_tx_desc; i++)
na->tx_rings[r].tx_pool[i] = NULL;
for (i=0; i<na->num_tx_desc; i++) {
- m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
if (!m) {
D("tx_pool[%d] allocation failed", i);
error = ENOMEM;
@@ -312,14 +310,14 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
D("netdev_rx_handler_register() failed (%d)", error);
goto register_handler;
}
- ifp->if_capenable |= IFCAP_NETMAP;
+ na->na_flags |= NAF_NETMAP_ON;
/* Make netmap control the packet steering. */
netmap_catch_tx(gna, 1);
rtnl_unlock();
-#ifdef RATE
+#ifdef RATE_GENERIC
if (rate_ctx.refcount == 0) {
D("setup_timer()");
memset(&rate_ctx, 0, sizeof(rate_ctx));
@@ -338,7 +336,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
error handling code below. */
rtnl_lock();
- ifp->if_capenable &= ~IFCAP_NETMAP;
+ na->na_flags &= ~NAF_NETMAP_ON;
/* Release packet steering control. */
netmap_catch_tx(gna, 0);
@@ -365,7 +363,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
free(na->tx_rings[r].tx_pool, M_DEVBUF);
}
-#ifdef RATE
+#ifdef RATE_GENERIC
if (--rate_ctx.refcount == 0) {
D("del_timer()");
del_timer(&rate_ctx.timer);
@@ -421,6 +419,8 @@ generic_mbuf_destructor(struct mbuf *m)
IFRATE(rate_ctx.new.txirq++);
}
+extern int netmap_adaptive_io;
+
/* Record completed transmissions and update hwtail.
*
* The oldest tx buffer not yet completed is at nr_hwtail + 1,
@@ -440,7 +440,7 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
if (unlikely(m == NULL)) {
/* this is done, try to replenish the entry */
- tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na));
if (unlikely(m == NULL)) {
D("mbuf allocation failed, XXX error");
// XXX how do we proceed ? break ?
@@ -451,6 +451,23 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
}
n++;
nm_i = nm_next(nm_i, lim);
+#if 0 /* rate adaptation */
+ if (netmap_adaptive_io > 1) {
+ if (n >= netmap_adaptive_io)
+ break;
+ } else if (netmap_adaptive_io) {
+ /* if hwcur - nm_i < lim/2 do an early break
+ * so we prevent the sender from stalling. See CVT.
+ */
+ if (hwcur >= nm_i) {
+ if (hwcur - nm_i < lim/2)
+ break;
+ } else {
+ if (hwcur + lim + 1 - nm_i < lim/2)
+ break;
+ }
+ }
+#endif
}
kring->nr_hwtail = nm_prev(nm_i, lim);
ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
@@ -530,14 +547,15 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
* since it implements the TX flow control (and takes some locks).
*/
static int
-generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+generic_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */ // j
u_int const lim = kring->nkr_num_slots - 1;
u_int const head = kring->rhead;
+ u_int ring_nr = kring->ring_id;
IFRATE(rate_ctx.new.txsync++);
@@ -553,19 +571,19 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
- void *addr = NMB(slot);
+ void *addr = NMB(na, slot);
/* device-specific */
struct mbuf *m;
int tx_ret;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
/* Take an mbuf from the tx pool and copy in the user packet. */
m = kring->tx_pool[nm_i];
if (unlikely(!m)) {
RD(5, "This should never happen");
- kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
if (unlikely(m == NULL)) {
D("mbuf allocation failed");
break;
@@ -580,7 +598,7 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
*/
tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
if (unlikely(tx_ret)) {
- RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
+ ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
tx_ret, nm_i, head, kring->nr_hwtail);
/*
* No room for this mbuf in the device driver.
@@ -686,10 +704,10 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
* Access must be protected because the rx handler is asynchronous,
*/
static int
-generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+generic_netmap_rxsync(struct netmap_kring *kring, int flags)
{
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
+ struct netmap_adapter *na = kring->na;
u_int nm_i; /* index into the netmap ring */ //j,
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
@@ -712,11 +730,11 @@ generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */
for (n = 0; nm_i != stop_i; n++) {
int len;
- void *addr = NMB(&ring->slot[nm_i]);
+ void *addr = NMB(na, &ring->slot[nm_i]);
struct mbuf *m;
/* we only check the address here on generic rx rings */
- if (addr == netmap_buffer_base) { /* Bad buffer */
+ if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */
return netmap_ring_reinit(kring);
}
/*
@@ -823,7 +841,7 @@ generic_netmap_attach(struct ifnet *ifp)
na->nm_txsync = &generic_netmap_txsync;
na->nm_rxsync = &generic_netmap_rxsync;
na->nm_dtor = &generic_netmap_dtor;
- /* when using generic, IFCAP_NETMAP is set so we force
+ /* when using generic, NAF_NETMAP_ON is set so we force
* NAF_SKIP_INTR to use the regular interrupt handler
*/
na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS;
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 087564c..dc6afd8 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -36,6 +36,7 @@
#define WITH_VALE // comment out to disable VALE support
#define WITH_PIPES
+#define WITH_MONITOR
#if defined(__FreeBSD__)
@@ -66,11 +67,23 @@
struct netmap_adapter *netmap_getna(if_t ifp);
#endif
+#if __FreeBSD_version >= 1100027
+#define GET_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt ? *((m)->m_ext.ext_cnt) : -1)
+#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ext_cnt) = x
+#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt)
+#else
+#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1)
+#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x
+#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt)
+#endif
+
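These compatibility macros paper over the FreeBSD 11 rename of m_ext.ref_cnt to m_ext.ext_cnt, so refcount handling can be written once for both worlds, e.g. (mirroring generic_xmit_frame()):

    if (GET_MBUF_REFCNT(m) != 1)
            D("unexpected refcnt %d for %p", GET_MBUF_REFCNT(m), m);
    atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1);  /* take an extra reference */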
MALLOC_DECLARE(M_NETMAP);
// XXX linux struct, not used in FreeBSD
struct net_device_ops {
};
+struct ethtool_ops {
+};
struct hrtimer {
};
@@ -82,7 +95,7 @@ struct hrtimer {
#define MBUF_IFP(m) ((m)->dev)
#define NM_SEND_UP(ifp, m) \
do { \
- m->priority = NM_MAGIC_PRIORITY; \
+ m->priority = NM_MAGIC_PRIORITY_RX; \
netif_rx(m); \
} while (0)
@@ -100,18 +113,6 @@ struct hrtimer {
#define DEV_NETMAP
#endif /* DEV_NETMAP */
-/*
- * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable).
- * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older
- * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT.
- * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1
- */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
-#define IFCAP_NETMAP 0x8000
-#else
-#define IFCAP_NETMAP 0x200000
-#endif
-
#elif defined (__APPLE__)
#warning apple support is incomplete.
@@ -215,7 +216,7 @@ extern NMG_LOCK_T netmap_global_lock;
* rxsync_from_host() and netmap_transmit(). The mbq is protected
* by its internal lock.
*
- * RX rings attached to the VALE switch are accessed by both sender
+ * RX rings attached to the VALE switch are accessed by both senders
* and receiver. They are protected through the q_lock on the RX ring.
*/
struct netmap_kring {
@@ -266,7 +267,13 @@ struct netmap_kring {
uint32_t nkr_hwlease;
uint32_t nkr_lease_idx;
- volatile int nkr_stopped; // XXX what for ?
+ /* while nkr_stopped is set, no new [tr]xsync operations can
+ * be started on this kring.
+ * This is used by netmap_disable_all_rings()
+ * to find a synchronization point where critical data
+ * structures pointed to by the kring can be added or removed
+ */
+ volatile int nkr_stopped;
/* Support for adapters without native netmap support.
* On tx rings we preallocate an array of tx buffers
@@ -281,13 +288,40 @@ struct netmap_kring {
uint32_t ring_id; /* debugging */
char name[64]; /* diagnostic */
+ /* [tx]sync callback for this kring.
+ * The default nm_kring_create callback (netmap_krings_create)
+ * sets the nm_sync callback of each hardware tx(rx) kring to
+ * the corresponding nm_txsync(nm_rxsync) taken from the
+ * netmap_adapter; moreover, it sets the sync callback
+ * of the host tx(rx) ring to netmap_txsync_to_host
+ * (netmap_rxsync_from_host).
+ *
+ * Overrides: the above configuration is not changed by
+ * any of the nm_krings_create callbacks.
+ */
int (*nm_sync)(struct netmap_kring *kring, int flags);
#ifdef WITH_PIPES
- struct netmap_kring *pipe;
- struct netmap_ring *save_ring;
+ struct netmap_kring *pipe; /* if this is a pipe ring,
+ * pointer to the other end
+ */
+ struct netmap_ring *save_ring; /* pointer to hidden rings
+ * (see netmap_pipe.c for details)
+ */
#endif /* WITH_PIPES */
+#ifdef WITH_MONITOR
+ /* pointer to the adapter that is monitoring this kring (if any)
+ */
+ struct netmap_monitor_adapter *monitor;
+ /*
+ * Monitors work by intercepting the txsync and/or rxsync of the
+ * monitored krings. This is implemented by replacing
+ * the nm_sync pointer above and saving the previous
+ * one in save_sync below.
+ */
+ int (*save_sync)(struct netmap_kring *kring, int flags);
+#endif
} __attribute__((__aligned__(64)));
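As a sketch of the interception described above (monitor_intercept is a hypothetical helper; the real hook-up is performed by the monitor's nm_register callback in netmap_monitor.c, below):

    static void
    monitor_intercept(struct netmap_kring *kring,
        int (*new_sync)(struct netmap_kring *, int))
    {
        kring->save_sync = kring->nm_sync; /* remember the original callback */
        kring->nm_sync = new_sync;         /* divert future [tr]xsync calls */
    }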
@@ -360,6 +394,8 @@ tail->| |<-hwtail | |<-hwlease
enum txrx { NR_RX = 0, NR_TX = 1 };
+struct netmap_vp_adapter; // forward
+
/*
* The "struct netmap_adapter" extends the "struct adapter"
* (or equivalent) device descriptor.
@@ -390,13 +426,19 @@ struct netmap_adapter {
* deallocation of the memory allocator
*/
#define NAF_NATIVE_ON 16 /* the adapter is native and the attached
- * interface is in netmap mode
+ * interface is in netmap mode.
+ * Virtual ports (vale, pipe, monitor...)
+ * should never use this flag.
*/
#define NAF_NETMAP_ON 32 /* netmap is active (either native or
- * emulated. Where possible (e.g. FreeBSD)
+ * emulated). Where possible (e.g. FreeBSD)
* IFCAP_NETMAP also mirrors this flag.
*/
#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */
+#define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */
+#define NAF_BUSY (1U<<31) /* the adapter is used internally and
+ * cannot be registered from userspace
+ */
int active_fds; /* number of user-space descriptors using this
interface, which is equal to the number of
struct netmap_if objs in the mapped region. */
@@ -423,6 +465,8 @@ struct netmap_adapter {
/* count users of the global wait queues */
int tx_si_users, rx_si_users;
+ void *pdev; /* used to store pci device */
+
/* copy of if_qflush and if_transmit pointers, to intercept
* packets from the network stack when netmap is active.
*/
@@ -444,7 +488,7 @@ struct netmap_adapter {
*
* nm_register() is called on NIOCREGIF and close() to enter
* or exit netmap mode on the NIC
 * Called with NMG_LOCK held.
*
* nm_txsync() pushes packets to the underlying hw/switch
*
@@ -453,14 +497,20 @@ struct netmap_adapter {
* nm_config() returns configuration information from the OS
* Called with NMG_LOCK held.
*
- * nm_krings_create() create and init the krings array
- * (the array layout must conform to the description
- * found above the definition of netmap_krings_create)
+ * nm_krings_create() create and init the tx_rings and
+ * rx_rings arrays of kring structures. In particular,
+ * set the nm_sync callbacks for each ring.
+ * There is no need to also allocate the corresponding
+ * netmap_rings, since netmap_mem_rings_create() will always
+ * be called to provide the missing ones.
+ * Called with NMG_LOCK held.
*
- * nm_krings_delete() cleanup and delete the kring array
+ * nm_krings_delete() cleanup and delete the tx_rings and rx_rings
+ * arrays
+ * Called with NMG_LOCK held.
*
* nm_notify() is used to act after data have become available
- * (or the stopped state of the ring has changed)
+ * (or the stopped state of the ring has changed)
* For hw devices this is typically a selwakeup(),
* but for NIC/host ports attached to a switch (or vice-versa)
* we also need to invoke the 'txsync' code downstream.
@@ -469,8 +519,8 @@ struct netmap_adapter {
int (*nm_register)(struct netmap_adapter *, int onoff);
- int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags);
- int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags);
+ int (*nm_txsync)(struct netmap_kring *kring, int flags);
+ int (*nm_rxsync)(struct netmap_kring *kring, int flags);
#define NAF_FORCE_READ 1
#define NAF_FORCE_RECLAIM 2
/* return configuration information */
@@ -480,7 +530,35 @@ struct netmap_adapter {
void (*nm_krings_delete)(struct netmap_adapter *);
int (*nm_notify)(struct netmap_adapter *,
u_int ring, enum txrx, int flags);
-#define NAF_DISABLE_NOTIFY 8
+#define NAF_DISABLE_NOTIFY 8 /* notify that the stopped state of the
+ * ring has changed (kring->nkr_stopped)
+ */
+
+#ifdef WITH_VALE
+ /*
+ * nm_bdg_attach() initializes the na_vp field to point
+ * to an adapter that can be attached to a VALE switch. If the
+ * current adapter is already a VALE port, na_vp is simply a cast;
+ * otherwise, na_vp points to a netmap_bwrap_adapter.
+ * If applicable, this callback also initializes na_hostvp,
+ * that can be used to connect the adapter host rings to the
+ * switch.
+ * Called with NMG_LOCK held.
+ *
+ * nm_bdg_ctl() is called on the actual attach/detach
+ * to/from the switch, to perform adapter-specific
+ * initializations
+ * Called with NMG_LOCK held.
+ */
+ int (*nm_bdg_attach)(const char *bdg_name, struct netmap_adapter *);
+ int (*nm_bdg_ctl)(struct netmap_adapter *, struct nmreq *, int);
+
+ /* adapter used to attach this adapter to a VALE switch (if any) */
+ struct netmap_vp_adapter *na_vp;
+ /* adapter used to attach the host rings of this adapter
+ * to a VALE switch (if any) */
+ struct netmap_vp_adapter *na_hostvp;
+#endif
/* standard refcount to control the lifetime of the adapter
* (it should be equal to the lifetime of the corresponding ifp)
@@ -494,17 +572,22 @@ struct netmap_adapter {
struct netmap_mem_d *nm_mem;
struct lut_entry *na_lut;
uint32_t na_lut_objtotal; /* max buffer index */
+ uint32_t na_lut_objsize; /* buffer size */
- /* used internally. If non-null, the interface cannot be bound
- * from userspace
+ /* additional information attached to this adapter
+ * by other netmap subsystems. Currently used by
+ * bwrap and LINUX/v1000.
*/
void *na_private;
#ifdef WITH_PIPES
+ /* array of pipes that have this adapter as a parent */
struct netmap_pipe_adapter **na_pipes;
- int na_next_pipe;
- int na_max_pipes;
+ int na_next_pipe; /* next free slot in the array */
+ int na_max_pipes; /* size of the array */
#endif /* WITH_PIPES */
+
+ char name[64];
};
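To illustrate how a native driver fills this structure, a hedged sketch (my_softc, my_txsync, my_rxsync and my_register are hypothetical; only fields shown above are used):

    static void
    my_netmap_attach(struct my_softc *sc)
    {
        struct netmap_adapter na;

        bzero(&na, sizeof(na));
        na.ifp = sc->ifp;
        na.num_tx_desc = sc->num_tx_desc;
        na.num_rx_desc = sc->num_rx_desc;
        na.nm_txsync = my_txsync;   /* note the new kring-based signature */
        na.nm_rxsync = my_rxsync;
        na.nm_register = my_register;
        na.num_tx_rings = na.num_rx_rings = 1;
        netmap_attach(&na);
    }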
@@ -514,9 +597,9 @@ struct netmap_adapter {
* if the NIC is owned by a user, only users can share it.
* Evaluation must be done under NMG_LOCK().
*/
-#define NETMAP_OWNED_BY_KERN(na) (na->na_private)
+#define NETMAP_OWNED_BY_KERN(na) ((na)->na_flags & NAF_BUSY)
#define NETMAP_OWNED_BY_ANY(na) \
- (NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0))
+ (NETMAP_OWNED_BY_KERN(na) || ((na)->active_fds > 0))
/*
@@ -546,12 +629,17 @@ struct netmap_hw_adapter { /* physical device */
struct netmap_adapter up;
struct net_device_ops nm_ndo; // XXX linux only
+ struct ethtool_ops nm_eto; // XXX linux only
+ const struct ethtool_ops* save_ethtool;
+
+ int (*nm_hw_register)(struct netmap_adapter *, int onoff);
};
/* Mitigation support. */
struct nm_generic_mit {
struct hrtimer mit_timer;
int mit_pending;
+ int mit_ring_idx; /* index of the ring being mitigated */
struct netmap_adapter *mit_na; /* backpointer */
};
@@ -641,16 +729,19 @@ struct netmap_bwrap_adapter {
/* backup of the hwna notify callback */
int (*save_notify)(struct netmap_adapter *,
u_int ring, enum txrx, int flags);
+ /* backup of the hwna memory allocator */
+ struct netmap_mem_d *save_nmd;
/*
* When we attach a physical interface to the bridge, we
* allow the controlling process to terminate, so we need
 * a place to store the netmap_priv_d data structure.
* This is only done when physical interfaces
* are attached to a bridge.
*/
struct netmap_priv_d *na_kpriv;
};
+int netmap_bwrap_attach(const char *name, struct netmap_adapter *);
#endif /* WITH_VALE */
@@ -747,12 +838,11 @@ static __inline int nm_kr_tryget(struct netmap_kring *kr)
* netmap_load_map/netmap_reload_map are helper routines to set/reset
* the dmamap for a packet buffer
*
- * netmap_reset() is a helper routine to be called in the driver
- * when reinitializing a ring.
+ * netmap_reset() is a helper routine to be called in the hw driver
+ * when reinitializing a ring. It should not be called by
+ * virtual ports (vale, pipes, monitor)
*/
int netmap_attach(struct netmap_adapter *);
-int netmap_attach_common(struct netmap_adapter *);
-void netmap_detach_common(struct netmap_adapter *na);
void netmap_detach(struct ifnet *);
int netmap_transmit(struct ifnet *, struct mbuf *);
struct netmap_slot *netmap_reset(struct netmap_adapter *na,
@@ -764,10 +854,33 @@ int netmap_rx_irq(struct ifnet *, u_int, u_int *);
#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
-void netmap_disable_all_rings(struct ifnet *);
-void netmap_enable_all_rings(struct ifnet *);
-void netmap_disable_ring(struct netmap_kring *kr);
+#ifdef WITH_VALE
+/* functions used by external modules to interface with VALE */
+#define netmap_vp_to_ifp(_vp) ((_vp)->up.ifp)
+#define netmap_ifp_to_vp(_ifp) (NA(_ifp)->na_vp)
+#define netmap_ifp_to_host_vp(_ifp) (NA(_ifp)->na_hostvp)
+#define netmap_bdg_idx(_vp) ((_vp)->bdg_port)
+const char *netmap_bdg_name(struct netmap_vp_adapter *);
+#else /* !WITH_VALE */
+#define netmap_vp_to_ifp(_vp) NULL
+#define netmap_ifp_to_vp(_ifp) NULL
+#define netmap_ifp_to_host_vp(_ifp) NULL
+#define netmap_bdg_idx(_vp) -1
+#define netmap_bdg_name(_vp) NULL
+#endif /* WITH_VALE */
+
+static inline int
+nm_native_on(struct netmap_adapter *na)
+{
+ return na && na->na_flags & NAF_NATIVE_ON;
+}
+
+static inline int
+nm_netmap_on(struct netmap_adapter *na)
+{
+ return na && na->na_flags & NAF_NETMAP_ON;
+}
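A typical driver-side guard then reads, as a minimal sketch:

    struct netmap_adapter *na = NA(ifp); /* NA() maps an ifnet to its adapter */

    if (!nm_netmap_on(na))
        return; /* netmap not active, use the normal datapath */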
/* set/clear native flags and if_transmit/netdev_ops */
static inline void
@@ -785,6 +898,8 @@ nm_set_native_flags(struct netmap_adapter *na)
#else
na->if_transmit = (void *)ifp->netdev_ops;
ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo;
+ ((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops;
+ ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto;
#endif
}
@@ -798,6 +913,7 @@ nm_clear_native_flags(struct netmap_adapter *na)
ifp->if_transmit = na->if_transmit;
#else
ifp->netdev_ops = (void *)na->if_transmit;
+ ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool;
#endif
na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON);
#ifdef IFCAP_NETMAP /* or FreeBSD ? */
@@ -858,30 +974,72 @@ nm_rxsync_finalize(struct netmap_kring *kring)
/* check/fix address and len in tx rings */
#if 1 /* debug version */
-#define NM_CHECK_ADDR_LEN(_a, _l) do { \
- if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) { \
+#define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \
+ if (_a == NETMAP_BUF_BASE(_na) || _l > NETMAP_BUF_SIZE(_na)) { \
RD(5, "bad addr/len ring %d slot %d idx %d len %d", \
- ring_nr, nm_i, slot->buf_idx, len); \
- if (_l > NETMAP_BUF_SIZE) \
- _l = NETMAP_BUF_SIZE; \
+ kring->ring_id, nm_i, slot->buf_idx, len); \
+ if (_l > NETMAP_BUF_SIZE(_na)) \
+ _l = NETMAP_BUF_SIZE(_na); \
} } while (0)
#else /* no debug version */
-#define NM_CHECK_ADDR_LEN(_a, _l) do { \
- if (_l > NETMAP_BUF_SIZE) \
- _l = NETMAP_BUF_SIZE; \
+#define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \
+ if (_l > NETMAP_BUF_SIZE(_na)) \
+ _l = NETMAP_BUF_SIZE(_na); \
} while (0)
#endif
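A hedged fragment of how a driver txsync loop uses the macro (kring, nm_i and slot are assumed to be in scope, as in the debug branch above):

    u_int len = slot->len;
    uint64_t paddr;
    void *addr = PNMB(na, slot, &paddr);

    NM_CHECK_ADDR_LEN(na, addr, len); /* clamp len and log bad buffers */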
/*---------------------------------------------------------------*/
/*
- * Support routines to be used with the VALE switch
+ * Support routines used by netmap subsystems
+ * (native drivers, VALE, generic, pipes, monitors, ...)
+ */
+
+
+/* common routine for all functions that create a netmap adapter. It performs
+ * two main tasks:
+ * - if the na points to an ifp, mark the ifp as netmap capable
+ * using na as its native adapter;
+ * - provide defaults for the setup callbacks and the memory allocator
+ */
+int netmap_attach_common(struct netmap_adapter *);
+/* common actions to be performed on netmap adapter destruction */
+void netmap_detach_common(struct netmap_adapter *);
+/* fill priv->np_[tr]xq{first,last} using the ringid and flags information
+ * coming from a struct nmreq
+ */
+int netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags);
+/* update the ring parameters (number and size of tx and rx rings).
+ * It calls the nm_config callback, if available.
*/
int netmap_update_config(struct netmap_adapter *na);
+/* create and initialize the common fields of the krings array,
+ * using the information that must already be available in the na.
+ * tailroom can be used to request the allocation of additional
+ * tailroom bytes after the krings array. This is used by
+ * netmap_vp_adapters (i.e., VALE ports) to make room for
+ * leasing-related data structures.
+ */
int netmap_krings_create(struct netmap_adapter *na, u_int tailroom);
+/* deletes the kring array of the adapter. The array must have
+ * been created using netmap_krings_create
+ */
void netmap_krings_delete(struct netmap_adapter *na);
-int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
+/* set the stopped/enabled status of a ring.
+ * When stopping, these functions also wait for all current activity
+ * on the ring to terminate. The status change is then notified using
+ * the na nm_notify callback.
+ */
+void netmap_set_txring(struct netmap_adapter *, u_int ring_id, int stopped);
+void netmap_set_rxring(struct netmap_adapter *, u_int ring_id, int stopped);
+/* set the stopped/enabled status of all rings of the adapter. */
+void netmap_set_all_rings(struct netmap_adapter *, int stopped);
+/* convenience wrappers for netmap_set_all_rings, used in drivers */
+void netmap_disable_all_rings(struct ifnet *);
+void netmap_enable_all_rings(struct ifnet *);
+
+int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
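For example, a driver reset path can bracket the hardware reinitialization with the convenience wrappers, as in this sketch (my_hw_reset is hypothetical):

    netmap_disable_all_rings(ifp); /* stop krings, wait for pending syncs */
    my_hw_reset(sc);               /* the rings can now be reprogrammed safely */
    netmap_enable_all_rings(ifp);  /* restart and notify waiters */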
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
@@ -904,10 +1062,18 @@ int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na);
* NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown.
* XXX in practice "unknown" might be handled same as broadcast.
*/
-typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len,
- uint8_t *ring_nr, struct netmap_vp_adapter *);
-u_int netmap_bdg_learning(char *, u_int, uint8_t *,
- struct netmap_vp_adapter *);
+typedef u_int (*bdg_lookup_fn_t)(struct nm_bdg_fwd *ft, uint8_t *ring_nr,
+ const struct netmap_vp_adapter *);
+typedef int (*bdg_config_fn_t)(struct nm_ifreq *);
+typedef void (*bdg_dtor_fn_t)(const struct netmap_vp_adapter *);
+struct netmap_bdg_ops {
+ bdg_lookup_fn_t lookup;
+ bdg_config_fn_t config;
+ bdg_dtor_fn_t dtor;
+};
+
+u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
+ const struct netmap_vp_adapter *);
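A hedged sketch of how a module could install its callbacks on a switch (only lookup is set here; netmap_bdg_learning is the stock learning function declared above, and nmr is assumed to name the target bridge):

    static struct netmap_bdg_ops my_bdg_ops = {
        .lookup = netmap_bdg_learning, /* or a custom bdg_lookup_fn_t */
        .config = NULL,
        .dtor   = NULL,
    };

    error = netmap_bdg_ctl(nmr, &my_bdg_ops);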
#define NM_BDG_MAXPORTS 254 /* up to 254 */
#define NM_BDG_BROADCAST NM_BDG_MAXPORTS
@@ -915,11 +1081,11 @@ u_int netmap_bdg_learning(char *, u_int, uint8_t *,
#define NM_NAME "vale" /* prefix for bridge port name */
-
/* these are redefined in case of no VALE support */
int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
void netmap_init_bridges(void);
-int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
+int netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops);
+int netmap_bdg_config(struct nmreq *nmr);
#else /* !WITH_VALE */
#define netmap_get_bdg_na(_1, _2, _3) 0
@@ -941,6 +1107,12 @@ int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create
#define netmap_get_pipe_na(_1, _2, _3) 0
#endif
+#ifdef WITH_MONITOR
+int netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+#else
+#define netmap_get_monitor_na(_1, _2, _3) 0
+#endif
+
/* Various prototypes */
int netmap_poll(struct cdev *dev, int events, struct thread *td);
int netmap_init(void);
@@ -952,7 +1124,6 @@ int netmap_dtor_locked(struct netmap_priv_d *priv);
int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td);
/* netmap_adapter creation/destruction */
-#define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie")
// #define NM_DEBUG_PUTGET 1
@@ -965,7 +1136,7 @@ void __netmap_adapter_get(struct netmap_adapter *na);
#define netmap_adapter_get(na) \
do { \
struct netmap_adapter *__na = na; \
- D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \
+ D("getting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \
__netmap_adapter_get(__na); \
} while (0)
@@ -974,7 +1145,7 @@ int __netmap_adapter_put(struct netmap_adapter *na);
#define netmap_adapter_put(na) \
({ \
struct netmap_adapter *__na = na; \
- D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \
+ D("putting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \
__netmap_adapter_put(__na); \
})
@@ -990,12 +1161,10 @@ int netmap_adapter_put(struct netmap_adapter *na);
/*
* module variables
*/
-extern u_int netmap_buf_size;
-#define NETMAP_BUF_SIZE netmap_buf_size // XXX remove
+#define NETMAP_BUF_BASE(na) ((na)->na_lut[0].vaddr)
+#define NETMAP_BUF_SIZE(na) ((na)->na_lut_objsize)
extern int netmap_mitigate; // XXX not really used
extern int netmap_no_pendintr;
-extern u_int netmap_total_buffers; // global allocator
-extern char *netmap_buffer_base; // global allocator
extern int netmap_verbose; // XXX debugging
enum { /* verbose flags */
NM_VERB_ON = 1, /* generic verbose */
@@ -1055,6 +1224,10 @@ extern int netmap_generic_rings;
#ifdef __FreeBSD__
+/* Assigns the device IOMMU domain to an allocator.
+ * Returns -ENOMEM in case the domain is different */
+#define nm_iommu_group_id(dev) (0)
+
/* Callback invoked by the dma machinery after a successful dmamap_load */
static void netmap_dmamap_cb(__unused void *arg,
__unused bus_dma_segment_t * segs, __unused int nseg, __unused int error)
@@ -1065,26 +1238,77 @@ static void netmap_dmamap_cb(__unused void *arg,
* XXX can we do it without a callback ?
*/
static inline void
-netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
+netmap_load_map(struct netmap_adapter *na,
+ bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
if (map)
- bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE,
+ bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na),
netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
}
+static inline void
+netmap_unload_map(struct netmap_adapter *na,
+ bus_dma_tag_t tag, bus_dmamap_t map)
+{
+ if (map)
+ bus_dmamap_unload(tag, map);
+}
+
/* update the map when a buffer changes. */
static inline void
-netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
+netmap_reload_map(struct netmap_adapter *na,
+ bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
if (map) {
bus_dmamap_unload(tag, map);
- bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE,
+ bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na),
netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
}
}
#else /* linux */
+int nm_iommu_group_id(bus_dma_tag_t dev);
+extern size_t netmap_mem_get_bufsize(struct netmap_mem_d *);
+#include <linux/dma-mapping.h>
+
+static inline void
+netmap_load_map(struct netmap_adapter *na,
+ bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
+{
+ if (map) {
+ *map = dma_map_single(na->pdev, buf, netmap_mem_get_bufsize(na->nm_mem),
+ DMA_BIDIRECTIONAL);
+ }
+}
+
+static inline void
+netmap_unload_map(struct netmap_adapter *na,
+ bus_dma_tag_t tag, bus_dmamap_t map)
+{
+ u_int sz = netmap_mem_get_bufsize(na->nm_mem);
+
+ if (*map) {
+ dma_unmap_single(na->pdev, *map, sz,
+ DMA_BIDIRECTIONAL);
+ }
+}
+
+static inline void
+netmap_reload_map(struct netmap_adapter *na,
+ bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
+{
+ u_int sz = netmap_mem_get_bufsize(na->nm_mem);
+
+ if (*map) {
+ dma_unmap_single(na->pdev, *map, sz,
+ DMA_BIDIRECTIONAL);
+ }
+
+ *map = dma_map_single(na->pdev, buf, sz,
+ DMA_BIDIRECTIONAL);
+}
+
/*
* XXX How do we redefine these functions:
*
@@ -1095,8 +1319,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
* unfortunately the direction is not, so we need to change
* something to have a cross API
*/
-#define netmap_load_map(_t, _m, _b)
-#define netmap_reload_map(_t, _m, _b)
+
#if 0
struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l];
/* set time_stamp *before* dma to help avoid a possible race */
@@ -1165,40 +1388,32 @@ struct lut_entry {
};
struct netmap_obj_pool;
-extern struct lut_entry *netmap_buffer_lut;
-#define NMB_VA(i) (netmap_buffer_lut[i].vaddr)
-#define NMB_PA(i) (netmap_buffer_lut[i].paddr)
/*
 * NMB returns the virtual address of a buffer (buffer 0 on bad index)
* PNMB also fills the physical address
*/
static inline void *
-NMB(struct netmap_slot *slot)
+NMB(struct netmap_adapter *na, struct netmap_slot *slot)
{
+ struct lut_entry *lut = na->na_lut;
uint32_t i = slot->buf_idx;
- return (unlikely(i >= netmap_total_buffers)) ? NMB_VA(0) : NMB_VA(i);
+ return (unlikely(i >= na->na_lut_objtotal)) ?
+ lut[0].vaddr : lut[i].vaddr;
}
static inline void *
-PNMB(struct netmap_slot *slot, uint64_t *pp)
+PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp)
{
uint32_t i = slot->buf_idx;
- void *ret = (i >= netmap_total_buffers) ? NMB_VA(0) : NMB_VA(i);
+ struct lut_entry *lut = na->na_lut;
+ void *ret = (i >= na->na_lut_objtotal) ? lut[0].vaddr : lut[i].vaddr;
- *pp = (i >= netmap_total_buffers) ? NMB_PA(0) : NMB_PA(i);
+ *pp = (i >= na->na_lut_objtotal) ? lut[0].paddr : lut[i].paddr;
return ret;
}
/* Generic version of NMB, which uses device-specific memory. */
-static inline void *
-BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot)
-{
- struct lut_entry *lut = na->na_lut;
- uint32_t i = slot->buf_idx;
- return (unlikely(i >= na->na_lut_objtotal)) ?
- lut[0].vaddr : lut[i].vaddr;
-}
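Former BDG_NMB callers now simply pass the adapter to NMB/PNMB; a minimal fragment (na, ring and nm_i assumed in scope):

    uint64_t paddr;
    void *vaddr = PNMB(na, &ring->slot[nm_i], &paddr); /* virtual and physical */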
@@ -1251,6 +1466,17 @@ struct netmap_priv_d {
struct thread *np_td; /* kqueue, just debugging */
};
+#ifdef WITH_MONITOR
+
+struct netmap_monitor_adapter {
+ struct netmap_adapter up;
+
+ struct netmap_priv_d priv;
+ uint32_t flags;
+};
+
+#endif /* WITH_MONITOR */
+
/*
* generic netmap emulation for devices that do not have
@@ -1265,12 +1491,20 @@ int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len,
int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
+//#define RATE_GENERIC /* Enables communication statistics for generic. */
+#ifdef RATE_GENERIC
+void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi);
+#else
+#define generic_rate(txp, txs, txi, rxp, rxs, rxi)
+#endif
+
/*
* netmap_mitigation API. This is used by the generic adapter
* to reduce the number of interrupt requests/selwakeup
* to clients on incoming packets.
*/
-void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na);
+void netmap_mitigation_init(struct nm_generic_mit *mit, int idx,
+ struct netmap_adapter *na);
void netmap_mitigation_start(struct nm_generic_mit *mit);
void netmap_mitigation_restart(struct nm_generic_mit *mit);
int netmap_mitigation_active(struct nm_generic_mit *mit);
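With the new index argument, the generic adapter can keep one mitigation state per rx ring; a hedged sketch of the setup loop (gna->mit as a per-ring array is an assumption consistent with mit_ring_idx above):

    for (r = 0; r < na->num_rx_rings; r++)
        netmap_mitigation_init(&gna->mit[r], r, na);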
@@ -1378,4 +1612,10 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
struct netmap_vp_adapter *dst_na,
struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
u_int *j, u_int lim, u_int *howmany);
+
+/* persistent virtual port routines */
+int nm_vi_persist(const char *, struct ifnet **);
+void nm_vi_detach(struct ifnet *);
+void nm_vi_init_index(void);
+
#endif /* _NET_NETMAP_KERN_H_ */
diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h
index a011c4c..455ca8a 100644
--- a/sys/dev/netmap/netmap_mbq.h
+++ b/sys/dev/netmap/netmap_mbq.h
@@ -74,6 +74,7 @@ mbq_unlock(struct mbq *q)
mtx_unlock_spin(&q->lock);
}
+
void mbq_safe_init(struct mbq *q);
void mbq_safe_destroy(struct mbq *q);
void mbq_safe_enqueue(struct mbq *q, struct mbuf *m);
diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c
index d237794..fa891ec 100644
--- a/sys/dev/netmap/netmap_mem2.c
+++ b/sys/dev/netmap/netmap_mem2.c
@@ -54,6 +54,112 @@ __FBSDID("$FreeBSD$");
#include <dev/netmap/netmap_kern.h>
#include "netmap_mem2.h"
+#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */
+
+#define NETMAP_POOL_MAX_NAMSZ 32
+
+
+enum {
+ NETMAP_IF_POOL = 0,
+ NETMAP_RING_POOL,
+ NETMAP_BUF_POOL,
+ NETMAP_POOLS_NR
+};
+
+
+struct netmap_obj_params {
+ u_int size;
+ u_int num;
+};
+struct netmap_obj_pool {
+ char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */
+
+ /* ---------------------------------------------------*/
+ /* these are only meaningful if the pool is finalized */
+ /* (see 'finalized' field in netmap_mem_d) */
+ u_int objtotal; /* actual total number of objects. */
+ u_int memtotal; /* actual total memory space */
+ u_int numclusters; /* actual number of clusters */
+
+ u_int objfree; /* number of free objects. */
+
+ struct lut_entry *lut; /* virt,phys addresses, objtotal entries */
+ uint32_t *bitmap; /* one bit per buffer, 1 means free */
+ uint32_t bitmap_slots; /* number of uint32 entries in bitmap */
+ /* ---------------------------------------------------*/
+
+ /* limits */
+ u_int objminsize; /* minimum object size */
+ u_int objmaxsize; /* maximum object size */
+ u_int nummin; /* minimum number of objects */
+ u_int nummax; /* maximum number of objects */
+
+ /* these are changed only by config */
+ u_int _objtotal; /* total number of objects */
+ u_int _objsize; /* object size */
+ u_int _clustsize; /* cluster size */
+ u_int _clustentries; /* objects per cluster */
+ u_int _numclusters; /* number of clusters */
+
+ /* requested values */
+ u_int r_objtotal;
+ u_int r_objsize;
+};
+
+#ifdef linux
+// XXX a mtx would suffice here 20130415 lr
+#define NMA_LOCK_T struct semaphore
+#else /* !linux */
+#define NMA_LOCK_T struct mtx
+#endif /* linux */
+
+typedef int (*netmap_mem_config_t)(struct netmap_mem_d*);
+typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*);
+typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*);
+
+typedef uint16_t nm_memid_t;
+
+struct netmap_mem_d {
+ NMA_LOCK_T nm_mtx; /* protect the allocator */
+ u_int nm_totalsize; /* shorthand */
+
+ u_int flags;
+#define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */
+ int lasterr; /* last error for curr config */
+ int refcount; /* existing priv structures */
+ /* the three allocators */
+ struct netmap_obj_pool pools[NETMAP_POOLS_NR];
+
+ netmap_mem_config_t config;
+ netmap_mem_finalize_t finalize;
+ netmap_mem_deref_t deref;
+
+ nm_memid_t nm_id; /* allocator identifier */
+ int nm_grp; /* iommu group id */
+
+ /* list of all existing allocators, sorted by nm_id */
+ struct netmap_mem_d *prev, *next;
+};
+
+/* accessor functions */
+struct lut_entry*
+netmap_mem_get_lut(struct netmap_mem_d *nmd)
+{
+ return nmd->pools[NETMAP_BUF_POOL].lut;
+}
+
+u_int
+netmap_mem_get_buftotal(struct netmap_mem_d *nmd)
+{
+ return nmd->pools[NETMAP_BUF_POOL].objtotal;
+}
+
+size_t
+netmap_mem_get_bufsize(struct netmap_mem_d *nmd)
+{
+ return nmd->pools[NETMAP_BUF_POOL]._objsize;
+}
+
#ifdef linux
#define NMA_LOCK_INIT(n) sema_init(&(n)->nm_mtx, 1)
#define NMA_LOCK_DESTROY(n)
@@ -135,6 +241,7 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. */
.deref = netmap_mem_global_deref,
.nm_id = 1,
+ .nm_grp = -1,
.prev = &nm_mem,
.next = &nm_mem,
@@ -143,9 +250,6 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. */
struct netmap_mem_d *netmap_last_mem_d = &nm_mem;
-// XXX logically belongs to nm_mem
-struct lut_entry *netmap_buffer_lut; /* exported */
-
/* blueprint for the private memory allocators */
static int netmap_mem_private_config(struct netmap_mem_d *nmd);
static int netmap_mem_private_finalize(struct netmap_mem_d *nmd);
@@ -254,6 +358,25 @@ nm_mem_release_id(struct netmap_mem_d *nmd)
NMA_UNLOCK(&nm_mem);
}
+static int
+nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev)
+{
+ int err = 0, id;
+ id = nm_iommu_group_id(dev);
+ if (netmap_verbose)
+ D("iommu_group %d", id);
+
+ NMA_LOCK(nmd);
+
+ if (nmd->nm_grp < 0)
+ nmd->nm_grp = id;
+
+ if (nmd->nm_grp != id)
+ nmd->lasterr = err = ENOMEM;
+
+ NMA_UNLOCK(nmd);
+ return err;
+}
/*
* First, find the allocator that contains the requested offset,
@@ -274,7 +397,7 @@ netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset)
if (offset >= p[i].memtotal)
continue;
// now lookup the cluster's address
- pa = p[i].lut[offset / p[i]._objsize].paddr +
+ pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) +
offset % p[i]._objsize;
NMA_UNLOCK(nmd);
return pa;
@@ -300,18 +423,22 @@ netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags,
error = nmd->config(nmd);
if (error)
goto out;
- if (nmd->flags & NETMAP_MEM_FINALIZED) {
- *size = nmd->nm_totalsize;
- } else {
- int i;
- *size = 0;
- for (i = 0; i < NETMAP_POOLS_NR; i++) {
- struct netmap_obj_pool *p = nmd->pools + i;
- *size += (p->_numclusters * p->_clustsize);
+ if (size) {
+ if (nmd->flags & NETMAP_MEM_FINALIZED) {
+ *size = nmd->nm_totalsize;
+ } else {
+ int i;
+ *size = 0;
+ for (i = 0; i < NETMAP_POOLS_NR; i++) {
+ struct netmap_obj_pool *p = nmd->pools + i;
+ *size += (p->_numclusters * p->_clustsize);
+ }
}
}
- *memflags = nmd->flags;
- *id = nmd->nm_id;
+ if (memflags)
+ *memflags = nmd->flags;
+ if (id)
+ *id = nmd->nm_id;
out:
NMA_UNLOCK(nmd);
return error;
@@ -471,12 +598,15 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr)
vaddr, p->name);
}
+#define netmap_mem_bufsize(n) \
+ ((n)->pools[NETMAP_BUF_POOL]._objsize)
+
#define netmap_if_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_IF_POOL], len, NULL, NULL)
#define netmap_if_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_IF_POOL], (v))
#define netmap_ring_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_RING_POOL], len, NULL, NULL)
#define netmap_ring_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_RING_POOL], (v))
#define netmap_buf_malloc(n, _pos, _index) \
- netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index)
+ netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], netmap_mem_bufsize(n), _pos, _index)
#if 0 // XXX unused
@@ -675,7 +805,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj
p->r_objtotal = objtotal;
p->r_objsize = objsize;
-#define MAX_CLUSTSIZE (1<<17)
+#define MAX_CLUSTSIZE (1<<22) // 4 MB
#define LINE_ROUND NM_CACHE_ALIGN // 64
if (objsize >= MAX_CLUSTSIZE) {
/* we could do it but there is no point */
@@ -713,15 +843,14 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj
clustentries = i;
break;
}
- if (delta > ( (clustentries*objsize) % PAGE_SIZE) )
- clustentries = i;
}
- // D("XXX --- ouch, delta %d (bad for buffers)", delta);
- /* compute clustsize and round to the next page */
+ /* exact solution not found */
+ if (clustentries == 0) {
+ D("unsupported allocation for %d bytes", objsize);
+ return EINVAL;
+ }
+ /* compute clustsize */
clustsize = clustentries * objsize;
- i = (clustsize & (PAGE_SIZE - 1));
- if (i)
- clustsize += PAGE_SIZE - i;
if (netmap_verbose)
D("objsize %d clustsize %d objects %d",
objsize, clustsize, clustentries);
@@ -857,6 +986,47 @@ netmap_mem_reset_all(struct netmap_mem_d *nmd)
}
static int
+netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na)
+{
+ int i, lim = p->_objtotal;
+
+ if (na->pdev == NULL)
+ return 0;
+
+#ifdef __FreeBSD__
+ (void)i;
+ (void)lim;
+ D("unsupported on FreeBSD");
+#else /* linux */
+ for (i = 2; i < lim; i++) {
+ netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr);
+ }
+#endif /* linux */
+
+ return 0;
+}
+
+static int
+netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na)
+{
+#ifdef __FreeBSD__
+ D("unsupported on FreeBSD");
+#else /* linux */
+ int i, lim = p->_objtotal;
+
+ if (na->pdev == NULL)
+ return 0;
+
+ for (i = 2; i < lim; i++) {
+ netmap_load_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr,
+ p->lut[i].vaddr);
+ }
+#endif /* linux */
+
+ return 0;
+}
+
+static int
netmap_mem_finalize_all(struct netmap_mem_d *nmd)
{
int i;
@@ -1091,13 +1261,6 @@ netmap_mem_global_finalize(struct netmap_mem_d *nmd)
if (netmap_mem_finalize_all(nmd))
goto out;
- /* backward compatibility */
- netmap_buf_size = nmd->pools[NETMAP_BUF_POOL]._objsize;
- netmap_total_buffers = nmd->pools[NETMAP_BUF_POOL].objtotal;
-
- netmap_buffer_lut = nmd->pools[NETMAP_BUF_POOL].lut;
- netmap_buffer_base = nmd->pools[NETMAP_BUF_POOL].lut[0].vaddr;
-
nmd->lasterr = 0;
out:
@@ -1198,7 +1361,7 @@ netmap_mem_rings_create(struct netmap_adapter *na)
ring->cur = kring->rcur;
ring->tail = kring->rtail;
*(uint16_t *)(uintptr_t)&ring->nr_buf_size =
- NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ netmap_mem_bufsize(na->nm_mem);
ND("%s h %d c %d t %d", kring->name,
ring->head, ring->cur, ring->tail);
ND("initializing slots for txring");
@@ -1241,7 +1404,7 @@ netmap_mem_rings_create(struct netmap_adapter *na)
ring->cur = kring->rcur;
ring->tail = kring->rtail;
*(int *)(uintptr_t)&ring->nr_buf_size =
- NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ netmap_mem_bufsize(na->nm_mem);
ND("%s h %d c %d t %d", kring->name,
ring->head, ring->cur, ring->tail);
ND("initializing slots for rxring %p", ring);
@@ -1290,7 +1453,7 @@ netmap_mem_rings_delete(struct netmap_adapter *na)
* the interface is in netmap mode.
*/
struct netmap_if *
-netmap_mem_if_new(const char *ifname, struct netmap_adapter *na)
+netmap_mem_if_new(struct netmap_adapter *na)
{
struct netmap_if *nifp;
ssize_t base; /* handy for relative offsets between rings and nifp */
@@ -1316,7 +1479,7 @@ netmap_mem_if_new(const char *ifname, struct netmap_adapter *na)
/* initialize base fields -- override const */
*(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings;
*(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings;
- strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ);
+ strncpy(nifp->ni_name, na->name, (size_t)IFNAMSIZ);
/*
* fill the slots for the rx and tx rings. They contain the offset
@@ -1358,6 +1521,8 @@ netmap_mem_global_deref(struct netmap_mem_d *nmd)
NMA_LOCK(nmd);
nmd->refcount--;
+ if (!nmd->refcount)
+ nmd->nm_grp = -1;
if (netmap_verbose)
D("refcount = %d", nmd->refcount);
@@ -1365,13 +1530,25 @@ netmap_mem_global_deref(struct netmap_mem_d *nmd)
}
int
-netmap_mem_finalize(struct netmap_mem_d *nmd)
+netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na)
{
- return nmd->finalize(nmd);
+ if (nm_mem_assign_group(nmd, na->pdev)) { /* returns 0 or ENOMEM */
+ return ENOMEM;
+ } else {
+ nmd->finalize(nmd);
+ }
+
+ if (!nmd->lasterr && na->pdev)
+ netmap_mem_map(&nmd->pools[NETMAP_BUF_POOL], na);
+
+ return nmd->lasterr;
}
void
-netmap_mem_deref(struct netmap_mem_d *nmd)
+netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na)
{
+ NMA_LOCK(nmd);
+ netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na);
+ NMA_UNLOCK(nmd);
return nmd->deref(nmd);
}
diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h
index e83616a..4c620bd 100644
--- a/sys/dev/netmap/netmap_mem2.h
+++ b/sys/dev/netmap/netmap_mem2.h
@@ -97,70 +97,6 @@
#define _NET_NETMAP_MEM2_H_
-#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */
-
-#define NETMAP_POOL_MAX_NAMSZ 32
-
-
-enum {
- NETMAP_IF_POOL = 0,
- NETMAP_RING_POOL,
- NETMAP_BUF_POOL,
- NETMAP_POOLS_NR
-};
-
-
-struct netmap_obj_params {
- u_int size;
- u_int num;
-};
-struct netmap_obj_pool {
- char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */
-
- /* ---------------------------------------------------*/
- /* these are only meaningful if the pool is finalized */
- /* (see 'finalized' field in netmap_mem_d) */
- u_int objtotal; /* actual total number of objects. */
- u_int memtotal; /* actual total memory space */
- u_int numclusters; /* actual number of clusters */
-
- u_int objfree; /* number of free objects. */
-
- struct lut_entry *lut; /* virt,phys addresses, objtotal entries */
- uint32_t *bitmap; /* one bit per buffer, 1 means free */
- uint32_t bitmap_slots; /* number of uint32 entries in bitmap */
- /* ---------------------------------------------------*/
-
- /* limits */
- u_int objminsize; /* minimum object size */
- u_int objmaxsize; /* maximum object size */
- u_int nummin; /* minimum number of objects */
- u_int nummax; /* maximum number of objects */
-
- /* these are changed only by config */
- u_int _objtotal; /* total number of objects */
- u_int _objsize; /* object size */
- u_int _clustsize; /* cluster size */
- u_int _clustentries; /* objects per cluster */
- u_int _numclusters; /* number of clusters */
-
- /* requested values */
- u_int r_objtotal;
- u_int r_objsize;
-};
-
-#ifdef linux
-// XXX a mtx would suffice here 20130415 lr
-#define NMA_LOCK_T struct semaphore
-#else /* !linux */
-#define NMA_LOCK_T struct mtx
-#endif /* linux */
-
-typedef int (*netmap_mem_config_t)(struct netmap_mem_d*);
-typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*);
-typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*);
-
-typedef uint16_t nm_memid_t;
/* We implement two kinds of netmap_mem_d structures:
*
@@ -178,40 +114,21 @@ typedef uint16_t nm_memid_t;
* are no active users. By 'active user' we mean an existing netmap_priv
* structure holding a reference to the allocator.
*/
-struct netmap_mem_d {
- NMA_LOCK_T nm_mtx; /* protect the allocator */
- u_int nm_totalsize; /* shorthand */
-
- u_int flags;
-#define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */
-#define NETMAP_MEM_PRIVATE 0x2 /* uses private address space */
- int lasterr; /* last error for curr config */
- int refcount; /* existing priv structures */
- /* the three allocators */
- struct netmap_obj_pool pools[NETMAP_POOLS_NR];
-
- netmap_mem_config_t config;
- netmap_mem_finalize_t finalize;
- netmap_mem_deref_t deref;
-
- nm_memid_t nm_id; /* allocator identifier */
-
- /* list of all existing allocators, sorted by nm_id */
- struct netmap_mem_d *prev, *next;
-};
extern struct netmap_mem_d nm_mem;
+struct lut_entry* netmap_mem_get_lut(struct netmap_mem_d *);
+u_int netmap_mem_get_buftotal(struct netmap_mem_d *);
+size_t netmap_mem_get_bufsize(struct netmap_mem_d *);
vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t);
-int netmap_mem_finalize(struct netmap_mem_d *);
+int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *);
int netmap_mem_init(void);
void netmap_mem_fini(void);
-struct netmap_if *
- netmap_mem_if_new(const char *, struct netmap_adapter *);
+struct netmap_if * netmap_mem_if_new(struct netmap_adapter *);
void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *);
int netmap_mem_rings_create(struct netmap_adapter *);
void netmap_mem_rings_delete(struct netmap_adapter *);
-void netmap_mem_deref(struct netmap_mem_d *);
+void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *);
int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id);
ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr);
struct netmap_mem_d* netmap_mem_private_new(const char *name,
@@ -219,7 +136,8 @@ struct netmap_mem_d* netmap_mem_private_new(const char *name,
int* error);
void netmap_mem_private_delete(struct netmap_mem_d *);
-#define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize)
+#define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */
+#define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */
uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n);
diff --git a/sys/dev/netmap/netmap_monitor.c b/sys/dev/netmap/netmap_monitor.c
new file mode 100644
index 0000000..485c370
--- /dev/null
+++ b/sys/dev/netmap/netmap_monitor.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Monitors
+ *
+ * netmap monitors can be used to do zero-copy monitoring of network traffic
+ * on another adapter, when the latter adapter is working in netmap mode.
+ *
+ * Monitors offer to userspace the same interface as any other netmap port,
+ * with as many pairs of netmap rings as the monitored adapter.
+ * However, only the rx rings are actually used. Each monitor rx ring receives
+ * the traffic transiting on both the tx and rx corresponding rings in the
+ * monitored adapter. During registration, the user can choose if she wants
+ * to intercept tx only, rx only, or both tx and rx traffic.
+ *
+ * The monitor only sees the frames after they have been consumed in the
+ * monitored adapter:
+ *
+ * - For tx traffic, this is after the slots containing the frames have been
+ * marked as free. Note that this may happen a considerable time after
+ * frame transmission, since freeing of slots is often done lazily.
+ *
+ * - For rx traffic, this is after the consumer on the monitored adapter
+ * has released them. In most cases, the consumer is a userspace
+ * application which may have modified the frame contents.
+ *
+ * If the monitor is not able to cope with the stream of frames, excess traffic
+ * will be dropped.
+ *
+ * Each ring can be monitored by at most one monitor. This may change in the
+ * future, if we implement monitor chaining.
+ *
+ */
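A hedged userspace sketch of opening a monitor on both directions of a port (error handling omitted; "em0" is just an example name):

    struct nmreq req;
    int fd = open("/dev/netmap", O_RDWR);

    bzero(&req, sizeof(req));
    strncpy(req.nr_name, "em0", sizeof(req.nr_name));
    req.nr_version = NETMAP_API;
    req.nr_flags = NR_REG_ALL_NIC | NR_MONITOR_TX | NR_MONITOR_RX;
    ioctl(fd, NIOCREGIF, &req);
    /* then mmap() the region and read from the monitor rx rings as usual */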
+
+
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h> /* prerequisite */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h> /* sockaddrs */
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <sys/refcount.h>
+
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+/*
+ * common headers
+ */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+#ifdef WITH_MONITOR
+
+#define NM_MONITOR_MAXSLOTS 4096
+
+/* monitors work by replacing the nm_sync callbacks in the monitored rings.
+ * The actions to be performed are the same on both tx and rx rings, so we
+ * have collected them here
+ */
+static int
+netmap_monitor_parent_sync(struct netmap_kring *kring, int flags, u_int* ringptr)
+{
+ struct netmap_monitor_adapter *mna = kring->monitor;
+ struct netmap_kring *mkring = &mna->up.rx_rings[kring->ring_id];
+ struct netmap_ring *ring = kring->ring, *mring = mkring->ring;
+ int error;
+ int rel_slots, free_slots, busy;
+ u_int beg, end, i;
+ u_int lim = kring->nkr_num_slots - 1,
+ mlim = mkring->nkr_num_slots - 1;
+
+ /* get the released slots (rel_slots) */
+ beg = *ringptr;
+ error = kring->save_sync(kring, flags);
+ if (error)
+ return error;
+ end = *ringptr;
+ rel_slots = end - beg;
+ if (rel_slots < 0)
+ rel_slots += kring->nkr_num_slots;
+
+ if (!rel_slots) {
+ return 0;
+ }
+
+ /* we need to lock the monitor receive ring, since it
+ * is the target of both tx and rx traffic from the monitored
+ * adapter
+ */
+ mtx_lock(&mkring->q_lock);
+ /* get the free slots available on the monitor ring */
+ i = mkring->nr_hwtail;
+ busy = i - mkring->nr_hwcur;
+ if (busy < 0)
+ busy += mkring->nkr_num_slots;
+ free_slots = mlim - busy;
+
+ if (!free_slots) {
+ mtx_unlock(&mkring->q_lock);
+ return 0;
+ }
+
+ /* swap min(free_slots, rel_slots) slots */
+ if (free_slots < rel_slots) {
+ beg += (rel_slots - free_slots);
+ if (beg >= kring->nkr_num_slots) /* wrap around the ring */
+ beg -= kring->nkr_num_slots;
+ rel_slots = free_slots;
+ }
+
+ for ( ; rel_slots; rel_slots--) {
+ struct netmap_slot *s = &ring->slot[beg];
+ struct netmap_slot *ms = &mring->slot[i];
+ uint32_t tmp;
+
+ tmp = ms->buf_idx;
+ ms->buf_idx = s->buf_idx;
+ s->buf_idx = tmp;
+
+ tmp = ms->len;
+ ms->len = s->len;
+ s->len = tmp;
+
+ s->flags |= NS_BUF_CHANGED;
+
+ beg = nm_next(beg, lim);
+ i = nm_next(i, mlim);
+
+ }
+ wmb();
+ mkring->nr_hwtail = i;
+
+ mtx_unlock(&mkring->q_lock);
+ /* notify the new frames to the monitor */
+ mna->up.nm_notify(&mna->up, mkring->ring_id, NR_RX, 0);
+ return 0;
+}
+
+/* callback used to replace the nm_sync callback in the monitored tx rings */
+static int
+netmap_monitor_parent_txsync(struct netmap_kring *kring, int flags)
+{
+ ND("%s %x", kring->name, flags);
+ return netmap_monitor_parent_sync(kring, flags, &kring->nr_hwtail);
+}
+
+/* callback used to replace the nm_sync callback in the monitored rx rings */
+static int
+netmap_monitor_parent_rxsync(struct netmap_kring *kring, int flags)
+{
+ ND("%s %x", kring->name, flags);
+ return netmap_monitor_parent_sync(kring, flags, &kring->rcur);
+}
+
+/* nm_sync callback for the monitor's own tx rings.
+ * This makes no sense and always returns an error.
+ */
+static int
+netmap_monitor_txsync(struct netmap_kring *kring, int flags)
+{
+ D("%s %x", kring->name, flags);
+ return EIO;
+}
+
+/* nm_sync callback for the monitor's own rx rings.
+ * Note that the lock in netmap_monitor_parent_sync only protects
+ * writers among themselves. Synchronization between writers
+ * (i.e., netmap_monitor_parent_txsync and netmap_monitor_parent_rxsync)
+ * and readers (i.e., netmap_monitor_rxsync) relies on memory barriers.
+ */
+static int
+netmap_monitor_rxsync(struct netmap_kring *kring, int flags)
+{
+ ND("%s %x", kring->name, flags);
+ kring->nr_hwcur = kring->rcur;
+ rmb();
+ nm_rxsync_finalize(kring);
+ return 0;
+}
+
+/* nm_krings_create callbacks for monitors.
+ * We could use the default netmap_hw_krings_monitor, but
+ * we don't need the mbq.
+ */
+static int
+netmap_monitor_krings_create(struct netmap_adapter *na)
+{
+ return netmap_krings_create(na, 0);
+}
+
+
+/* nm_register callback for monitors.
+ *
+ * On registration, replace the nm_sync callbacks in the monitored
+ * rings with our own, saving the previous ones in the monitored
+ * rings themselves, where they are used by netmap_monitor_parent_sync.
+ *
+ * On de-registration, restore the original callbacks. We need to
+ * stop traffic while we are doing this, since the monitored adapter may
+ * have already started executing a netmap_monitor_parent_sync
+ * and may not like the kring->save_sync pointer to become NULL.
+ */
+static int
+netmap_monitor_reg(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_monitor_adapter *mna =
+ (struct netmap_monitor_adapter *)na;
+ struct netmap_priv_d *priv = &mna->priv;
+ struct netmap_adapter *pna = priv->np_na;
+ struct netmap_kring *kring;
+ int i;
+
+ ND("%p: onoff %d", na, onoff);
+ if (onoff) {
+ if (!nm_netmap_on(pna)) {
+ /* parent left netmap mode, fatal */
+ return ENXIO;
+ }
+ if (mna->flags & NR_MONITOR_TX) {
+ for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
+ kring = &pna->tx_rings[i];
+ kring->save_sync = kring->nm_sync;
+ kring->nm_sync = netmap_monitor_parent_txsync;
+ }
+ }
+ if (mna->flags & NR_MONITOR_RX) {
+ for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
+ kring = &pna->rx_rings[i];
+ kring->save_sync = kring->nm_sync;
+ kring->nm_sync = netmap_monitor_parent_rxsync;
+ }
+ }
+ na->na_flags |= NAF_NETMAP_ON;
+ } else {
+ if (!nm_netmap_on(pna)) {
+ /* parent left netmap mode, nothing to restore */
+ return 0;
+ }
+ na->na_flags &= ~NAF_NETMAP_ON;
+ if (mna->flags & NR_MONITOR_TX) {
+ for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
+ netmap_set_txring(pna, i, 1 /* stopped */);
+ kring = &pna->tx_rings[i];
+ kring->nm_sync = kring->save_sync;
+ kring->save_sync = NULL;
+ netmap_set_txring(pna, i, 0 /* enabled */);
+ }
+ }
+ if (mna->flags & NR_MONITOR_RX) {
+ for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
+ netmap_set_rxring(pna, i, 1 /* stopped */);
+ kring = &pna->rx_rings[i];
+ kring->nm_sync = kring->save_sync;
+ kring->save_sync = NULL;
+ netmap_set_rxring(pna, i, 0 /* enabled */);
+ }
+ }
+ }
+ return 0;
+}
+/* nm_krings_delete callback for monitors */
+static void
+netmap_monitor_krings_delete(struct netmap_adapter *na)
+{
+ netmap_krings_delete(na);
+}
+
+
+/* nm_dtor callback for monitors */
+static void
+netmap_monitor_dtor(struct netmap_adapter *na)
+{
+ struct netmap_monitor_adapter *mna =
+ (struct netmap_monitor_adapter *)na;
+ struct netmap_priv_d *priv = &mna->priv;
+ struct netmap_adapter *pna = priv->np_na;
+ int i;
+
+ ND("%p", na);
+ if (nm_netmap_on(pna)) {
+ /* parent still in netmap mode, mark its krings as free */
+ if (mna->flags & NR_MONITOR_TX) {
+ for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
+ pna->tx_rings[i].monitor = NULL;
+ }
+ }
+ if (mna->flags & NR_MONITOR_RX) {
+ for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
+ pna->rx_rings[i].monitor = NULL;
+ }
+ }
+ }
+ netmap_adapter_put(pna);
+}
+
+
+/* check if nmr is a request for a monitor adapter that we can satisfy */
+int
+netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+ struct nmreq pnmr;
+ struct netmap_adapter *pna; /* parent adapter */
+ struct netmap_monitor_adapter *mna;
+ int i, error;
+
+ if ((nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)) == 0) {
+ ND("not a monitor");
+ return 0;
+ }
+ /* this is a request for a monitor adapter */
+
+ D("flags %x", nmr->nr_flags);
+
+ mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (mna == NULL) {
+ D("memory error");
+ return ENOMEM;
+ }
+
+ /* first, try to find the adapter that we want to monitor.
+ * We use the same nmr, after we have turned off the monitor flags.
+ * In this way we can potentially monitor everything netmap understands,
+ * except other monitors.
+ */
+ memcpy(&pnmr, nmr, sizeof(pnmr));
+ pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX);
+ error = netmap_get_na(&pnmr, &pna, create);
+ if (error) {
+ D("parent lookup failed: %d", error);
+ return error;
+ }
+ D("found parent: %s", pna->name);
+
+ if (!nm_netmap_on(pna)) {
+ /* parent not in netmap mode */
+ /* XXX we can wait for the parent to enter netmap mode,
+ * by intercepting its nm_register callback (2014-03-16)
+ */
+ D("%s not in netmap mode", pna->name);
+ error = EINVAL;
+ goto put_out;
+ }
+
+ /* grab all the rings we need in the parent */
+ mna->priv.np_na = pna;
+ error = netmap_interp_ringid(&mna->priv, nmr->nr_ringid, nmr->nr_flags);
+ if (error) {
+ D("ringid error");
+ goto put_out;
+ }
+ if (nmr->nr_flags & NR_MONITOR_TX) {
+ for (i = mna->priv.np_txqfirst; i < mna->priv.np_txqlast; i++) {
+ struct netmap_kring *kring = &pna->tx_rings[i];
+ if (kring->monitor) {
+ error = EBUSY;
+ D("ring busy");
+ goto release_out;
+ }
+ kring->monitor = mna;
+ }
+ }
+ if (nmr->nr_flags & NR_MONITOR_RX) {
+ for (i = mna->priv.np_rxqfirst; i < mna->priv.np_rxqlast; i++) {
+ struct netmap_kring *kring = &pna->rx_rings[i];
+ if (kring->monitor) {
+ error = EBUSY;
+ D("ring busy");
+ goto release_out;
+ }
+ kring->monitor = mna;
+ }
+ }
+
+ snprintf(mna->up.name, sizeof(mna->up.name), "mon:%s", pna->name);
+
+ /* the monitor supports the host rings iff the parent does */
+ mna->up.na_flags = (pna->na_flags & NAF_HOST_RINGS);
+ mna->up.nm_txsync = netmap_monitor_txsync;
+ mna->up.nm_rxsync = netmap_monitor_rxsync;
+ mna->up.nm_register = netmap_monitor_reg;
+ mna->up.nm_dtor = netmap_monitor_dtor;
+ mna->up.nm_krings_create = netmap_monitor_krings_create;
+ mna->up.nm_krings_delete = netmap_monitor_krings_delete;
+ mna->up.nm_mem = pna->nm_mem;
+ mna->up.na_lut = pna->na_lut;
+ mna->up.na_lut_objtotal = pna->na_lut_objtotal;
+ mna->up.na_lut_objsize = pna->na_lut_objsize;
+
+ mna->up.num_tx_rings = 1; // XXX we don't need it, but field can't be zero
+ /* we set the number of our rx_rings to be max(num_tx_rings, num_rx_rings)
+ * in the parent
+ */
+ mna->up.num_rx_rings = pna->num_rx_rings;
+ if (pna->num_tx_rings > pna->num_rx_rings)
+ mna->up.num_rx_rings = pna->num_tx_rings;
+ /* by default, the number of slots is the same as in
+ * the parent rings, but the user may ask for a different
+ * number
+ */
+ mna->up.num_tx_desc = nmr->nr_tx_slots;
+ nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc,
+ 1, NM_MONITOR_MAXSLOTS, NULL);
+ mna->up.num_rx_desc = nmr->nr_rx_slots;
+ nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc,
+ 1, NM_MONITOR_MAXSLOTS, NULL);
+ error = netmap_attach_common(&mna->up);
+ if (error) {
+ D("attach_common error");
+ goto release_out;
+ }
+
+ /* remember the traffic directions we have to monitor */
+ mna->flags = (nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX));
+
+ *na = &mna->up;
+ netmap_adapter_get(*na);
+
+ /* write the configuration back */
+ nmr->nr_tx_rings = mna->up.num_tx_rings;
+ nmr->nr_rx_rings = mna->up.num_rx_rings;
+ nmr->nr_tx_slots = mna->up.num_tx_desc;
+ nmr->nr_rx_slots = mna->up.num_rx_desc;
+
+ /* keep the reference to the parent */
+ D("monitor ok");
+
+ return 0;
+
+release_out:
+ D("monitor error");
+ for (i = mna->priv.np_txqfirst; i < mna->priv.np_txqlast; i++) {
+ if (pna->tx_rings[i].monitor == mna)
+ pna->tx_rings[i].monitor = NULL;
+ }
+ for (i = mna->priv.np_rxqfirst; i < mna->priv.np_rxqlast; i++) {
+ if (pna->rx_rings[i].monitor == mna)
+ pna->rx_rings[i].monitor = NULL;
+ }
+put_out:
+ netmap_adapter_put(pna);
+ free(mna, M_DEVBUF);
+ return error;
+}
+
+
+#endif /* WITH_MONITOR */
diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c
index a776a24..34eafab 100644
--- a/sys/dev/netmap/netmap_offloadings.c
+++ b/sys/dev/netmap/netmap_offloadings.c
@@ -159,7 +159,7 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
src = ft_p->ft_buf;
src_len = ft_p->ft_len;
slot = &ring->slot[*j];
- dst = BDG_NMB(&dst_na->up, slot);
+ dst = NMB(&dst_na->up, slot);
dst_len = src_len;
/* We are processing the first input slot and there is a mismatch
@@ -303,7 +303,7 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
/* Next destination slot. */
*j = nm_next(*j, lim);
slot = &ring->slot[*j];
- dst = BDG_NMB(&dst_na->up, slot);
+ dst = NMB(&dst_na->up, slot);
gso_bytes = 0;
gso_idx++;
@@ -365,7 +365,7 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
/* Next destination slot. */
*j = nm_next(*j, lim);
slot = &ring->slot[*j];
- dst = BDG_NMB(&dst_na->up, slot);
+ dst = NMB(&dst_na->up, slot);
/* Next source slot. */
ft_p++;
diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c
index 9fcc4d2..bc998c0 100644
--- a/sys/dev/netmap/netmap_pipe.c
+++ b/sys/dev/netmap/netmap_pipe.c
@@ -126,7 +126,7 @@ void
netmap_pipe_dealloc(struct netmap_adapter *na)
{
if (na->na_pipes) {
- ND("freeing pipes for %s", NM_IFPNAME(na->ifp));
+ ND("freeing pipes for %s", na->name);
free(na->na_pipes, M_DEVBUF);
na->na_pipes = NULL;
na->na_max_pipes = 0;
@@ -155,7 +155,7 @@ static int
netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
{
if (parent->na_next_pipe >= parent->na_max_pipes) {
- D("%s: no space left for pipes", NM_IFPNAME(parent->ifp));
+ D("%s: no space left for pipes", parent->name);
return ENOMEM;
}
@@ -179,10 +179,9 @@ netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na
}
static int
-netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+netmap_pipe_txsync(struct netmap_kring *txkring, int flags)
{
- struct netmap_kring *txkring = na->tx_rings + ring_nr,
- *rxkring = txkring->pipe;
+ struct netmap_kring *rxkring = txkring->pipe;
u_int limit; /* slots to transfer */
u_int j, k, lim_tx = txkring->nkr_num_slots - 1,
lim_rx = rxkring->nkr_num_slots - 1;
@@ -245,10 +244,9 @@ netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
}
static int
-netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags)
{
- struct netmap_kring *rxkring = na->rx_rings + ring_nr,
- *txkring = rxkring->pipe;
+ struct netmap_kring *txkring = rxkring->pipe;
uint32_t oldhwcur = rxkring->nr_hwcur;
ND("%s %x <- %s", rxkring->name, flags, txkring->name);
@@ -425,12 +423,11 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_pipe_adapter *pna =
(struct netmap_pipe_adapter *)na;
- struct ifnet *ifp = na->ifp;
ND("%p: onoff %d", na, onoff);
if (onoff) {
- ifp->if_capenable |= IFCAP_NETMAP;
+ na->na_flags |= NAF_NETMAP_ON;
} else {
- ifp->if_capenable &= ~IFCAP_NETMAP;
+ na->na_flags &= ~NAF_NETMAP_ON;
}
if (pna->peer_ref) {
ND("%p: case 1.a or 2.a, nothing to do", na);
@@ -522,8 +519,6 @@ netmap_pipe_dtor(struct netmap_adapter *na)
if (pna->role == NR_REG_PIPE_MASTER)
netmap_pipe_remove(pna->parent, pna);
netmap_adapter_put(pna->parent);
- free(na->ifp, M_DEVBUF);
- na->ifp = NULL;
pna->parent = NULL;
}
@@ -533,7 +528,6 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
struct nmreq pnmr;
struct netmap_adapter *pna; /* parent adapter */
struct netmap_pipe_adapter *mna, *sna, *req;
- struct ifnet *ifp, *ifp2;
u_int pipe_id;
int role = nmr->nr_flags & NR_REG_MASK;
int error;
@@ -556,7 +550,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
ND("parent lookup failed: %d", error);
return error;
}
- ND("found parent: %s", NM_IFPNAME(pna->ifp));
+	ND("found parent: %s", pna->name);
if (NETMAP_OWNED_BY_KERN(pna)) {
ND("parent busy");
@@ -591,19 +585,12 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
* The endpoint we were asked for holds a reference to
* the other one.
*/
- ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!ifp) {
- error = ENOMEM;
- goto put_out;
- }
- strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp));
-
mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
if (mna == NULL) {
error = ENOMEM;
- goto free_ifp;
+ goto put_out;
}
- mna->up.ifp = ifp;
+ snprintf(mna->up.name, sizeof(mna->up.name), "%s{%d", pna->name, pipe_id);
mna->id = pipe_id;
mna->role = NR_REG_PIPE_MASTER;
@@ -618,6 +605,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
mna->up.nm_mem = pna->nm_mem;
mna->up.na_lut = pna->na_lut;
mna->up.na_lut_objtotal = pna->na_lut_objtotal;
+ mna->up.na_lut_objsize = pna->na_lut_objsize;
mna->up.num_tx_rings = 1;
mna->up.num_rx_rings = 1;
@@ -629,28 +617,21 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1, NM_PIPE_MAXSLOTS, NULL);
error = netmap_attach_common(&mna->up);
if (error)
- goto free_ifp;
+ goto free_mna;
/* register the master with the parent */
error = netmap_pipe_add(pna, mna);
if (error)
goto free_mna;
/* create the slave */
- ifp2 = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!ifp) {
- error = ENOMEM;
- goto free_mna;
- }
- strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp));
-
sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
if (sna == NULL) {
error = ENOMEM;
- goto free_ifp2;
+ goto free_mna;
}
/* most fields are the same, copy from master and then fix */
*sna = *mna;
- sna->up.ifp = ifp2;
+ snprintf(sna->up.name, sizeof(sna->up.name), "%s}%d", pna->name, pipe_id);
sna->role = NR_REG_PIPE_SLAVE;
error = netmap_attach_common(&sna->up);
if (error)
@@ -696,12 +677,8 @@ found:
free_sna:
free(sna, M_DEVBUF);
-free_ifp2:
- free(ifp2, M_DEVBUF);
free_mna:
free(mna, M_DEVBUF);
-free_ifp:
- free(ifp, M_DEVBUF);
put_out:
netmap_adapter_put(pna);
return error;
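
With the fake ifnets gone, pipe endpoints are identified purely by name: the snprintf() calls above build "<parent>{<id>" for the master and "<parent>}<id>" for the slave. A hypothetical user-space sketch using the nm_open() helper from <net/netmap_user.h> (the port name "foo" and pipe id 1 are made up):

#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

/* open both endpoints of pipe 1 hanging off port "foo" */
struct nm_desc *master = nm_open("netmap:foo{1", NULL, 0, NULL);
struct nm_desc *slave  = nm_open("netmap:foo}1", NULL, 0, NULL);
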
diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c
index 8e309e9..6b1fe1f 100644
--- a/sys/dev/netmap/netmap_vale.c
+++ b/sys/dev/netmap/netmap_vale.c
@@ -157,11 +157,9 @@ SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
-static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
-static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
-static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
+static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
+static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
-int kern_netmap_regif(struct nmreq *nmr);
/*
* For each output interface, nm_bdg_q is used to construct a list.
@@ -217,7 +215,7 @@ struct nm_bridge {
* different ring index.
* This function must be set by netmap_bdgctl().
*/
- bdg_lookup_fn_t nm_bdg_lookup;
+ struct netmap_bdg_ops bdg_ops;
/* the forwarding table, MAC+ports.
* XXX should be changed to an argument to be passed to
@@ -226,6 +224,15 @@ struct nm_bridge {
struct nm_hash_ent ht[NM_BDG_HASH];
};
+const char*
+netmap_bdg_name(struct netmap_vp_adapter *vp)
+{
+ struct nm_bridge *b = vp->na_bdg;
+ if (b == NULL)
+ return NULL;
+ return b->bdg_basename;
+}
+
/*
* XXX in principle nm_bridges could be created dynamically
@@ -321,7 +328,7 @@ nm_find_bridge(const char *name, int create)
for (i = 0; i < NM_BDG_MAXPORTS; i++)
b->bdg_port_index[i] = i;
/* set the default function */
- b->nm_bdg_lookup = netmap_bdg_learning;
+ b->bdg_ops.lookup = netmap_bdg_learning;
/* reset the MAC address table */
bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
}
@@ -389,6 +396,9 @@ nm_alloc_bdgfwd(struct netmap_adapter *na)
}
+/* remove from bridge b the ports in slots hw and sw
+ * (sw can be -1 if not needed)
+ */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
@@ -434,6 +444,8 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
}
BDG_WLOCK(b);
+ if (b->bdg_ops.dtor)
+ b->bdg_ops.dtor(b->bdg_ports[s_hw]);
b->bdg_ports[s_hw] = NULL;
if (s_sw >= 0) {
b->bdg_ports[s_sw] = NULL;
@@ -445,29 +457,131 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
ND("now %d active ports", lim);
if (lim == 0) {
ND("marking bridge %s as free", b->bdg_basename);
- b->nm_bdg_lookup = NULL;
+ bzero(&b->bdg_ops, sizeof(b->bdg_ops));
}
}
+/* nm_bdg_ctl callback for VALE ports */
+static int
+netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
+{
+ struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
+ struct nm_bridge *b = vpna->na_bdg;
+
+ if (attach)
+ return 0; /* nothing to do */
+ if (b) {
+ netmap_set_all_rings(na, 0 /* disable */);
+ netmap_bdg_detach_common(b, vpna->bdg_port, -1);
+ vpna->na_bdg = NULL;
+ netmap_set_all_rings(na, 1 /* enable */);
+ }
+	/* we took a reference just for the attach */
+ netmap_adapter_put(na);
+ return 0;
+}
+/* nm_dtor callback for ephemeral VALE ports */
static void
-netmap_adapter_vp_dtor(struct netmap_adapter *na)
+netmap_vp_dtor(struct netmap_adapter *na)
{
struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
struct nm_bridge *b = vpna->na_bdg;
- struct ifnet *ifp = na->ifp;
- ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);
+ ND("%s has %d references", na->name, na->na_refcount);
if (b) {
netmap_bdg_detach_common(b, vpna->bdg_port, -1);
}
+}
- bzero(ifp, sizeof(*ifp));
- free(ifp, M_DEVBUF);
+/* nm_dtor callback for persistent VALE ports */
+static void
+netmap_persist_vp_dtor(struct netmap_adapter *na)
+{
+ struct ifnet *ifp = na->ifp;
+
+ netmap_vp_dtor(na);
na->ifp = NULL;
+ nm_vi_detach(ifp);
+}
+
+/* remove a persistent VALE port from the system */
+static int
+nm_vi_destroy(const char *name)
+{
+ struct ifnet *ifp;
+ int error;
+
+ ifp = ifunit_ref(name);
+ if (!ifp)
+ return ENXIO;
+ NMG_LOCK();
+ /* make sure this is actually a VALE port */
+ if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
+ error = EINVAL;
+ goto err;
+ }
+
+ if (NA(ifp)->na_refcount > 1) {
+ error = EBUSY;
+ goto err;
+ }
+ NMG_UNLOCK();
+
+ D("destroying a persistent vale interface %s", ifp->if_xname);
+	/* Linux requires that all references be released
+	 * before unregistering
+	 */
+ if_rele(ifp);
+ netmap_detach(ifp);
+ return 0;
+
+err:
+ NMG_UNLOCK();
+ if_rele(ifp);
+ return error;
}
+/*
+ * Create a virtual interface registered to the system.
+ * The interface will be attached to a bridge later.
+ */
+static int
+nm_vi_create(struct nmreq *nmr)
+{
+ struct ifnet *ifp;
+ struct netmap_vp_adapter *vpna;
+ int error;
+
+ /* don't include VALE prefix */
+ if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
+ return EINVAL;
+ ifp = ifunit_ref(nmr->nr_name);
+	if (ifp) { /* already exists, cannot create a new one */
+ if_rele(ifp);
+ return EEXIST;
+ }
+ error = nm_vi_persist(nmr->nr_name, &ifp);
+ if (error)
+ return error;
+
+ NMG_LOCK();
+ /* netmap_vp_create creates a struct netmap_vp_adapter */
+ error = netmap_vp_create(nmr, ifp, &vpna);
+ if (error) {
+ D("error %d", error);
+ nm_vi_detach(ifp);
+ return error;
+ }
+ /* persist-specific routines */
+ vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
+ vpna->up.nm_dtor = netmap_persist_vp_dtor;
+ netmap_adapter_get(&vpna->up);
+ NMG_UNLOCK();
+ D("created %s", ifp->if_xname);
+ return 0;
+}
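
nm_vi_create() and nm_vi_destroy() above implement the new NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF commands dispatched by netmap_bdg_ctl() further down. A hedged user-space sketch of creating a persistent port, mirroring what the vale-ctl tool does (the interface name "vi0" is made up; error checking omitted):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap.h>

int
main(void)
{
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vi0", sizeof(nmr.nr_name)); /* no VALE prefix */
	nmr.nr_cmd = NETMAP_BDG_NEWIF;	/* NETMAP_BDG_DELIF destroys it */
	return ioctl(fd, NIOCREGIF, &nmr);
}
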
/* Try to get a reference to a netmap adapter attached to a VALE switch.
* If the adapter is found (or is created), this function returns 0, a
@@ -481,11 +595,11 @@ netmap_adapter_vp_dtor(struct netmap_adapter *na)
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
- const char *name = nmr->nr_name;
+ char *nr_name = nmr->nr_name;
+ const char *ifname;
struct ifnet *ifp;
int error = 0;
- struct netmap_adapter *ret;
- struct netmap_vp_adapter *vpna;
+ struct netmap_vp_adapter *vpna, *hostna = NULL;
struct nm_bridge *b;
int i, j, cand = -1, cand2 = -1;
int needed;
@@ -494,15 +608,17 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
/* first try to see if this is a bridge port. */
NMG_LOCK_ASSERT();
- if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
+ if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
return 0; /* no error, but no VALE prefix */
}
- b = nm_find_bridge(name, create);
+ b = nm_find_bridge(nr_name, create);
if (b == NULL) {
- D("no bridges available for '%s'", name);
+ D("no bridges available for '%s'", nr_name);
return (create ? ENOMEM : ENXIO);
}
+ if (strlen(nr_name) < b->bdg_namelen) /* impossible */
+ panic("x");
/* Now we are sure that name starts with the bridge's name,
* lookup the port in the bridge. We need to scan the entire
@@ -516,13 +632,11 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
i = b->bdg_port_index[j];
vpna = b->bdg_ports[i];
// KASSERT(na != NULL);
- ifp = vpna->up.ifp;
- /* XXX make sure the name only contains one : */
- if (!strcmp(NM_IFPNAME(ifp), name)) {
+ D("checking %s", vpna->up.name);
+ if (!strcmp(vpna->up.name, nr_name)) {
netmap_adapter_get(&vpna->up);
- ND("found existing if %s refs %d", name,
- vpna->na_bdg_refcount);
- *na = (struct netmap_adapter *)vpna;
+			ND("found existing if %s", nr_name);
+ *na = &vpna->up;
return 0;
}
}
@@ -539,68 +653,50 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
cand = b->bdg_port_index[b->bdg_active_ports];
cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
ND("+++ bridge %s port %s used %d avail %d %d",
- b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
+		b->bdg_basename, nr_name, b->bdg_active_ports, cand, cand2);
/*
* try see if there is a matching NIC with this name
* (after the bridge's name)
*/
- ifp = ifunit_ref(name + b->bdg_namelen + 1);
- if (!ifp) { /* this is a virtual port */
+ ifname = nr_name + b->bdg_namelen + 1;
+ ifp = ifunit_ref(ifname);
+ if (!ifp) {
+ /* Create an ephemeral virtual port
+		 * This block contains all the ephemeral-specific logic
+ */
if (nmr->nr_cmd) {
/* nr_cmd must be 0 for a virtual port */
return EINVAL;
}
- /* create a struct ifnet for the new port.
- * need M_NOWAIT as we are under nma_lock
- */
- ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!ifp)
- return ENOMEM;
-
- strcpy(ifp->if_xname, name);
/* bdg_netmap_attach creates a struct netmap_adapter */
- error = bdg_netmap_attach(nmr, ifp);
+ error = netmap_vp_create(nmr, NULL, &vpna);
if (error) {
D("error %d", error);
free(ifp, M_DEVBUF);
return error;
}
- ret = NA(ifp);
- cand2 = -1; /* only need one port */
- } else { /* this is a NIC */
- struct ifnet *fake_ifp;
+ /* shortcut - we can skip get_hw_na(),
+ * ownership check and nm_bdg_attach()
+ */
+ } else {
+ struct netmap_adapter *hw;
- error = netmap_get_hw_na(ifp, &ret);
- if (error || ret == NULL)
+ error = netmap_get_hw_na(ifp, &hw);
+ if (error || hw == NULL)
goto out;
- /* make sure the NIC is not already in use */
- if (NETMAP_OWNED_BY_ANY(ret)) {
- D("NIC %s busy, cannot attach to bridge",
- NM_IFPNAME(ifp));
- error = EBUSY;
- goto out;
- }
- /* create a fake interface */
- fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!fake_ifp) {
- error = ENOMEM;
- goto out;
- }
- strcpy(fake_ifp->if_xname, name);
- error = netmap_bwrap_attach(fake_ifp, ifp);
- if (error) {
- free(fake_ifp, M_DEVBUF);
+ /* host adapter might not be created */
+ error = hw->nm_bdg_attach(nr_name, hw);
+ if (error)
goto out;
- }
- ret = NA(fake_ifp);
- if (nmr->nr_arg1 != NETMAP_BDG_HOST)
- cand2 = -1; /* only need one port */
+ vpna = hw->na_vp;
+ hostna = hw->na_hostvp;
if_rele(ifp);
+ if (nmr->nr_arg1 != NETMAP_BDG_HOST)
+ hostna = NULL;
}
- vpna = (struct netmap_vp_adapter *)ret;
BDG_WLOCK(b);
vpna->bdg_port = cand;
@@ -609,8 +705,7 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
b->bdg_ports[cand] = vpna;
vpna->na_bdg = b;
b->bdg_active_ports++;
- if (cand2 >= 0) {
- struct netmap_vp_adapter *hostna = vpna + 1;
+ if (hostna != NULL) {
/* also bind the host stack to the bridge */
b->bdg_ports[cand2] = hostna;
hostna->bdg_port = cand2;
@@ -618,10 +713,10 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
b->bdg_active_ports++;
ND("host %p to bridge port %d", hostna, cand2);
}
- ND("if %s refs %d", name, vpna->up.na_refcount);
+ ND("if %s refs %d", ifname, vpna->up.na_refcount);
BDG_WUNLOCK(b);
- *na = ret;
- netmap_adapter_get(ret);
+ *na = &vpna->up;
+ netmap_adapter_get(*na);
return 0;
out:
@@ -631,24 +726,17 @@ out:
}
-/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
+/* Process NETMAP_BDG_ATTACH */
static int
-nm_bdg_attach(struct nmreq *nmr)
+nm_bdg_ctl_attach(struct nmreq *nmr)
{
struct netmap_adapter *na;
- struct netmap_if *nifp;
- struct netmap_priv_d *npriv;
- struct netmap_bwrap_adapter *bna;
int error;
- npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
- if (npriv == NULL)
- return ENOMEM;
-
NMG_LOCK();
error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
- if (error) /* no device, or another bridge or user owns the device */
+ if (error) /* no device */
goto unlock_exit;
if (na == NULL) { /* VALE prefix missing */
@@ -656,39 +744,37 @@ nm_bdg_attach(struct nmreq *nmr)
goto unlock_exit;
}
- if (na->active_fds > 0) { /* already registered */
+ if (NETMAP_OWNED_BY_ANY(na)) {
error = EBUSY;
goto unref_exit;
}
- nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
- if (!nifp) {
- goto unref_exit;
+ if (na->nm_bdg_ctl) {
+ /* nop for VALE ports. The bwrap needs to put the hwna
+ * in netmap mode (see netmap_bwrap_bdg_ctl)
+ */
+ error = na->nm_bdg_ctl(na, nmr, 1);
+ if (error)
+ goto unref_exit;
+ ND("registered %s to netmap-mode", na->name);
}
-
- bna = (struct netmap_bwrap_adapter*)na;
- bna->na_kpriv = npriv;
NMG_UNLOCK();
- ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
return 0;
unref_exit:
netmap_adapter_put(na);
unlock_exit:
NMG_UNLOCK();
- bzero(npriv, sizeof(*npriv));
- free(npriv, M_DEVBUF);
return error;
}
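
From user space this path is reached with nr_cmd set to NETMAP_BDG_ATTACH. Continuing the sketch from nm_vi_create() above (fd and nmr as before; "vale0:em0" is a made-up bridge:NIC name):

	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
	nmr.nr_cmd = NETMAP_BDG_ATTACH;
	nmr.nr_arg1 = NETMAP_BDG_HOST;	/* optional: also attach the host rings */
	ioctl(fd, NIOCREGIF, &nmr);
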
+/* process NETMAP_BDG_DETACH */
static int
-nm_bdg_detach(struct nmreq *nmr)
+nm_bdg_ctl_detach(struct nmreq *nmr)
{
struct netmap_adapter *na;
int error;
- struct netmap_bwrap_adapter *bna;
- int last_instance;
NMG_LOCK();
error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
@@ -701,28 +787,13 @@ nm_bdg_detach(struct nmreq *nmr)
goto unlock_exit;
}
- bna = (struct netmap_bwrap_adapter *)na;
-
- if (na->active_fds == 0) { /* not registered */
- error = EINVAL;
- goto unref_exit;
- }
-
- last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
- if (!last_instance) {
- D("--- error, trying to detach an entry with active mmaps");
- error = EINVAL;
- } else {
- struct netmap_priv_d *npriv = bna->na_kpriv;
-
- bna->na_kpriv = NULL;
- D("deleting priv");
-
- bzero(npriv, sizeof(*npriv));
- free(npriv, M_DEVBUF);
+ if (na->nm_bdg_ctl) {
+ /* remove the port from bridge. The bwrap
+ * also needs to put the hwna in normal mode
+ */
+ error = na->nm_bdg_ctl(na, nmr, 0);
}
-unref_exit:
netmap_adapter_put(na);
unlock_exit:
NMG_UNLOCK();
@@ -731,28 +802,39 @@ unlock_exit:
}
-/* exported to kernel callers, e.g. OVS ?
- * Entry point.
+/* Called either from a user context (netmap_ioctl())
+ * or from external kernel modules (e.g., Openvswitch).
+ * The operation is indicated in nmr->nr_cmd.
+ * NETMAP_BDG_REGOPS, which sets the config/lookup/dtor callbacks on the
+ * bridge, requires the bdg_ops argument; the other commands ignore it.
+ *
* Called without NMG_LOCK.
*/
int
-netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
+netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
struct nm_bridge *b;
struct netmap_adapter *na;
struct netmap_vp_adapter *vpna;
- struct ifnet *iter;
char *name = nmr->nr_name;
int cmd = nmr->nr_cmd, namelen = strlen(name);
int error = 0, i, j;
switch (cmd) {
+ case NETMAP_BDG_NEWIF:
+ error = nm_vi_create(nmr);
+ break;
+
+ case NETMAP_BDG_DELIF:
+ error = nm_vi_destroy(nmr->nr_name);
+ break;
+
case NETMAP_BDG_ATTACH:
- error = nm_bdg_attach(nmr);
+ error = nm_bdg_ctl_attach(nmr);
break;
case NETMAP_BDG_DETACH:
- error = nm_bdg_detach(nmr);
+ error = nm_bdg_ctl_detach(nmr);
break;
case NETMAP_BDG_LIST:
@@ -770,6 +852,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
break;
}
+ name = name + b->bdg_namelen + 1;
error = ENOENT;
for (j = 0; j < b->bdg_active_ports; j++) {
i = b->bdg_port_index[j];
@@ -778,11 +861,10 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
D("---AAAAAAAAARGH-------");
continue;
}
- iter = vpna->up.ifp;
/* the former and the latter identify a
* virtual port and a NIC, respectively
*/
- if (!strcmp(iter->if_xname, name)) {
+ if (!strcmp(vpna->up.name, name)) {
/* bridge index */
nmr->nr_arg1 = b - nm_bridges;
nmr->nr_arg2 = i; /* port index */
@@ -813,8 +895,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
nmr->nr_arg2 = j;
j = b->bdg_port_index[j];
vpna = b->bdg_ports[j];
- iter = vpna->up.ifp;
- strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
+ strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
error = 0;
break;
}
@@ -822,12 +903,12 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
}
break;
- case NETMAP_BDG_LOOKUP_REG:
- /* register a lookup function to the given bridge.
+ case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
+ /* register callbacks to the given bridge.
* nmr->nr_name may be just bridge's name (including ':'
* if it is not just NM_NAME).
*/
- if (!func) {
+ if (!bdg_ops) {
error = EINVAL;
break;
}
@@ -836,7 +917,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
if (!b) {
error = EINVAL;
} else {
- b->nm_bdg_lookup = func;
+ b->bdg_ops = *bdg_ops;
}
NMG_UNLOCK();
break;
@@ -856,7 +937,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
vpna = (struct netmap_vp_adapter *)na;
vpna->virt_hdr_len = nmr->nr_arg1;
if (vpna->virt_hdr_len)
- vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ vpna->mfs = NETMAP_BUF_SIZE(na);
D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
netmap_adapter_put(na);
}
@@ -871,6 +952,32 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
return error;
}
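
The NETMAP_BDG_REGOPS branch above replaces the old single lookup hook with a whole struct netmap_bdg_ops (lookup/config/dtor, the fields used throughout this file). A kernel-side sketch of installing custom callbacks on a bridge; the trivial lookup drops every frame and is purely illustrative:

static u_int
my_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
	const struct netmap_vp_adapter *na)
{
	(void)ft; (void)dst_ring; (void)na;
	return NM_BDG_NOPORT;	/* drop everything (illustration only) */
}

static struct netmap_bdg_ops my_ops = {
	.lookup = my_lookup,
	.config = NULL,	/* optional, invoked via netmap_bdg_config() */
	.dtor = NULL,	/* optional, invoked on port detach */
};

/* nmr->nr_name names the bridge (e.g. "vale0:"), nmr->nr_cmd = NETMAP_BDG_REGOPS */
error = netmap_bdg_ctl(nmr, &my_ops);
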
+int
+netmap_bdg_config(struct nmreq *nmr)
+{
+ struct nm_bridge *b;
+ int error = EINVAL;
+
+ NMG_LOCK();
+ b = nm_find_bridge(nmr->nr_name, 0);
+ if (!b) {
+ NMG_UNLOCK();
+ return error;
+ }
+ NMG_UNLOCK();
+ /* Don't call config() with NMG_LOCK() held */
+ BDG_RLOCK(b);
+ if (b->bdg_ops.config != NULL)
+ error = b->bdg_ops.config((struct nm_ifreq *)nmr);
+ BDG_RUNLOCK(b);
+ return error;
+}
+
+
+/* nm_krings_create callback for VALE ports.
+ * Calls the standard netmap_krings_create, then adds leases on rx
+ * rings and bdgfwd on tx rings.
+ */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
@@ -905,6 +1012,7 @@ netmap_vp_krings_create(struct netmap_adapter *na)
}
+/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
@@ -919,17 +1027,20 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
/*
+ * main dispatch routine for the bridge.
* Grab packets from a kring, move them into the ft structure
* associated to the tx (input) port. Max one instance per port,
* filtered on input (ioctl, poll or XXX).
* Returns the next position in the ring.
*/
static int
-nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
- struct netmap_kring *kring, u_int end)
+nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
+ struct netmap_vp_adapter *na =
+ (struct netmap_vp_adapter*)kring->na;
struct netmap_ring *ring = kring->ring;
struct nm_bdg_fwd *ft;
+ u_int ring_nr = kring->ring_id;
u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
u_int ft_i = 0; /* start from 0 */
u_int frags = 1; /* how many frags ? */
@@ -958,12 +1069,12 @@ nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
/* this slot goes into a list so initialize the link field */
ft[ft_i].ft_next = NM_FT_NULL;
buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
- (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
+ (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
if (unlikely(buf == NULL)) {
RD(5, "NULL %s buffer pointer from %s slot %d len %d",
(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
kring->name, j, ft[ft_i].ft_len);
- buf = ft[ft_i].ft_buf = NMB_VA(0); /* the 'null' buffer */
+ buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
ft[ft_i].ft_len = 0;
ft[ft_i].ft_flags = 0;
}
@@ -1035,23 +1146,28 @@ nm_bridge_rthash(const uint8_t *addr)
#undef mix
+/* nm_register callback for VALE ports */
static int
-bdg_netmap_reg(struct netmap_adapter *na, int onoff)
+netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
struct netmap_vp_adapter *vpna =
(struct netmap_vp_adapter*)na;
- struct ifnet *ifp = na->ifp;
- /* the interface is already attached to the bridge,
- * so we only need to toggle IFCAP_NETMAP.
+ /* persistent ports may be put in netmap mode
+ * before being attached to a bridge
*/
- BDG_WLOCK(vpna->na_bdg);
+ if (vpna->na_bdg)
+ BDG_WLOCK(vpna->na_bdg);
if (onoff) {
- ifp->if_capenable |= IFCAP_NETMAP;
+ na->na_flags |= NAF_NETMAP_ON;
+ /* XXX on FreeBSD, persistent VALE ports should also
+ * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
+ */
} else {
- ifp->if_capenable &= ~IFCAP_NETMAP;
+ na->na_flags &= ~NAF_NETMAP_ON;
}
- BDG_WUNLOCK(vpna->na_bdg);
+ if (vpna->na_bdg)
+ BDG_WUNLOCK(vpna->na_bdg);
return 0;
}
@@ -1063,16 +1179,28 @@ bdg_netmap_reg(struct netmap_adapter *na, int onoff)
* ring in *dst_ring (at the moment, always use ring 0)
*/
u_int
-netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
- struct netmap_vp_adapter *na)
+netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
+ const struct netmap_vp_adapter *na)
{
+ uint8_t *buf = ft->ft_buf;
+ u_int buf_len = ft->ft_len;
struct nm_hash_ent *ht = na->na_bdg->ht;
uint32_t sh, dh;
u_int dst, mysrc = na->bdg_port;
uint64_t smac, dmac;
- if (buf_len < 14) {
- RD(5, "invalid buf length %d", buf_len);
+ /* safety check, unfortunately we have many cases */
+ if (buf_len >= 14 + na->virt_hdr_len) {
+ /* virthdr + mac_hdr in the same slot */
+ buf += na->virt_hdr_len;
+ buf_len -= na->virt_hdr_len;
+ } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
+ /* only header in first fragment */
+ ft++;
+ buf = ft->ft_buf;
+ buf_len = ft->ft_len;
+ } else {
+ RD(5, "invalid buf format, length %d", buf_len);
return NM_BDG_NOPORT;
}
dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
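
/* worked example for the checks above, assuming na->virt_hdr_len == 10
 * (hypothetical numbers):
 *   ft_len == 74                  -> buf += 10, buf_len = 64
 *                                    (virtio-net header and MAC header share the slot)
 *   ft_len == 10 with NS_MOREFRAG -> advance to ft[1] and read the MAC
 *                                    header from the second fragment
 *   any other short layout        -> RD() warning, return NM_BDG_NOPORT (drop)
 */
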
@@ -1170,7 +1298,7 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
k->nr_hwtail >= k->nkr_num_slots ||
k->nkr_lease_idx >= k->nkr_num_slots) {
D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
- k->na->ifp->if_xname,
+ k->na->name,
k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
k->nkr_lease_idx, k->nkr_num_slots);
}
@@ -1178,6 +1306,7 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
}
/*
+ *
* This flush routine supports only unicast and broadcast but a large
* number of ports, and lets us replace the learn and dispatch functions.
*/
@@ -1204,22 +1333,13 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
uint8_t dst_ring = ring_nr; /* default, same ring as origin */
uint16_t dst_port, d_i;
struct nm_bdg_q *d;
- uint8_t *buf = ft[i].ft_buf;
- u_int len = ft[i].ft_len;
ND("slot %d frags %d", i, ft[i].ft_frags);
/* Drop the packet if the virtio-net header is not into the first
fragment nor at the very beginning of the second. */
- if (unlikely(na->virt_hdr_len > len))
+ if (unlikely(na->virt_hdr_len > ft[i].ft_len))
continue;
- if (len == na->virt_hdr_len) {
- buf = ft[i+1].ft_buf;
- len = ft[i+1].ft_len;
- } else {
- buf += na->virt_hdr_len;
- len -= na->virt_hdr_len;
- }
- dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
+ dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
if (netmap_verbose > 255)
RD(5, "slot %d port %d -> %d", i, me, dst_port);
if (dst_port == NM_BDG_NOPORT)
@@ -1270,9 +1390,8 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
}
ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
- /* second pass: scan destinations (XXX will be modular somehow) */
+ /* second pass: scan destinations */
for (i = 0; i < num_dsts; i++) {
- struct ifnet *dst_ifp;
struct netmap_vp_adapter *dst_na;
struct netmap_kring *kring;
struct netmap_ring *ring;
@@ -1296,13 +1415,12 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
goto cleanup;
if (dst_na->up.na_flags & NAF_SW_ONLY)
goto cleanup;
- dst_ifp = dst_na->up.ifp;
/*
* The interface may be in !netmap mode in two cases:
* - when na is attached but not activated yet;
* - when na is being deactivated but is still attached.
*/
- if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
+ if (unlikely(!nm_netmap_on(&dst_na->up))) {
ND("not in netmap mode!");
goto cleanup;
}
@@ -1320,7 +1438,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
needed = d->bq_len + brddst->bq_len;
if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
- RD(3, "virt_hdr_mismatch, src %d len %d", na->virt_hdr_len, dst_na->virt_hdr_len);
+ RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
/* There is a virtio-net header/offloadings mismatch between
* source and destination. The slower mismatch datapath will
* be used to cope with all the mismatches.
@@ -1358,6 +1476,10 @@ retry:
if (dst_na->retry && retry) {
/* try to get some free slot from the previous run */
dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
+ /* actually useful only for bwraps, since there
+ * the notify will trigger a txsync on the hwna. VALE ports
+ * have dst_na->retry == 0
+ */
}
/* reserve the buffers in the queue and an entry
* to report completion, and drop lock.
@@ -1413,7 +1535,7 @@ retry:
size_t copy_len = ft_p->ft_len, dst_len = copy_len;
slot = &ring->slot[j];
- dst = BDG_NMB(&dst_na->up, slot);
+ dst = NMB(&dst_na->up, slot);
ND("send [%d] %d(%d) bytes at %s:%d",
i, (int)copy_len, (int)dst_len,
@@ -1421,8 +1543,8 @@ retry:
/* round to a multiple of 64 */
copy_len = (copy_len + 63) & ~63;
- if (unlikely(copy_len > NETMAP_BUF_SIZE ||
- copy_len > NETMAP_BUF_SIZE)) {
+ if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
+ copy_len > NETMAP_BUF_SIZE(&na->up))) {
RD(5, "invalid len %d, down to 64", (int)copy_len);
copy_len = dst_len = 64; // XXX
}
@@ -1495,8 +1617,16 @@ retry:
still_locked = 0;
mtx_unlock(&kring->q_lock);
dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
- if (dst_na->retry && retry--)
+ /* this is netmap_notify for VALE ports and
+ * netmap_bwrap_notify for bwrap. The latter will
+ * trigger a txsync on the underlying hwna
+ */
+ if (dst_na->retry && retry--) {
+ /* XXX this is going to call nm_notify again.
+ * Only useful for bwrap in virtual machines
+ */
goto retry;
+ }
}
}
if (still_locked)
@@ -1511,11 +1641,12 @@ cleanup:
return 0;
}
-
+/* nm_txsync callback for VALE ports */
static int
-netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
+netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
- struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
+ struct netmap_vp_adapter *na =
+ (struct netmap_vp_adapter *)kring->na;
u_int done;
u_int const lim = kring->nkr_num_slots - 1;
u_int const cur = kring->rcur;
@@ -1524,10 +1655,14 @@ netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
done = cur; // used all
goto done;
}
+ if (!na->na_bdg) {
+ done = cur;
+ goto done;
+ }
if (bridge_batch > NM_BDG_BATCH)
bridge_batch = NM_BDG_BATCH;
- done = nm_bdg_preflush(na, ring_nr, kring, cur);
+ done = nm_bdg_preflush(kring, cur);
done:
if (done != cur)
D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
@@ -1538,27 +1673,18 @@ done:
kring->nr_hwtail = nm_prev(done, lim);
nm_txsync_finalize(kring);
if (netmap_verbose)
- D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
+ D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
return 0;
}
-/*
- * main dispatch routine for the bridge.
- * We already know that only one thread is running this.
- * we must run nm_bdg_preflush without lock.
+/* rxsync code used by the VALE ports' nm_rxsync callback and also
+ * internally by the bwrap
*/
static int
-bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
- struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
- return netmap_vp_txsync(vpna, ring_nr, flags);
-}
-
-static int
-netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
-{
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_adapter *na = kring->na;
struct netmap_ring *ring = kring->ring;
u_int nm_i, lim = kring->nkr_num_slots - 1;
u_int head = nm_rxsync_prologue(kring);
@@ -1579,9 +1705,9 @@ netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
/* consistency check, but nothing really important here */
for (n = 0; likely(nm_i != head); n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
- void *addr = BDG_NMB(na, slot);
+ void *addr = NMB(na, slot);
- if (addr == netmap_buffer_base) { /* bad buf */
+ if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
D("bad buffer index %d, ignore ?",
slot->buf_idx);
}
@@ -1599,26 +1725,45 @@ done:
}
/*
+ * nm_rxsync callback for VALE ports
* user process reading from a VALE switch.
* Already protected against concurrent calls from userspace,
* but we must acquire the queue's lock to protect against
* writers on the same queue.
*/
static int
-bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
int n;
mtx_lock(&kring->q_lock);
- n = netmap_vp_rxsync(na, ring_nr, flags);
+ n = netmap_vp_rxsync_locked(kring, flags);
mtx_unlock(&kring->q_lock);
return n;
}
+/* nm_bdg_attach callback for VALE ports
+ * The na_vp port is this same netmap_adapter. There is no host port.
+ */
+static int
+netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
+{
+ struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
+
+ if (vpna->na_bdg)
+ return EBUSY;
+ na->na_vp = vpna;
+ strncpy(na->name, name, sizeof(na->name));
+ na->na_hostvp = NULL;
+ return 0;
+}
+
+/* create a netmap_vp_adapter that describes a VALE port.
+ * Only persistent VALE ports have a non-null ifp.
+ */
static int
-bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
+netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
{
struct netmap_vp_adapter *vpna;
struct netmap_adapter *na;
@@ -1632,6 +1777,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
na = &vpna->up;
na->ifp = ifp;
+ strncpy(na->name, nmr->nr_name, sizeof(na->name));
/* bound checking */
na->num_tx_rings = nmr->nr_tx_rings;
@@ -1664,22 +1810,24 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
D("max frame size %u", vpna->mfs);
na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
- na->nm_txsync = bdg_netmap_txsync;
- na->nm_rxsync = bdg_netmap_rxsync;
- na->nm_register = bdg_netmap_reg;
- na->nm_dtor = netmap_adapter_vp_dtor;
+ na->nm_txsync = netmap_vp_txsync;
+ na->nm_rxsync = netmap_vp_rxsync;
+ na->nm_register = netmap_vp_reg;
na->nm_krings_create = netmap_vp_krings_create;
na->nm_krings_delete = netmap_vp_krings_delete;
- na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
+ na->nm_dtor = netmap_vp_dtor;
+ na->nm_mem = netmap_mem_private_new(na->name,
na->num_tx_rings, na->num_tx_desc,
na->num_rx_rings, na->num_rx_desc,
nmr->nr_arg3, npipes, &error);
if (na->nm_mem == NULL)
goto err;
+ na->nm_bdg_attach = netmap_vp_bdg_attach;
/* other nmd fields are set in the common routine */
error = netmap_attach_common(na);
if (error)
goto err;
+ *ret = vpna;
return 0;
err:
@@ -1689,30 +1837,60 @@ err:
return error;
}
+/* Bridge wrapper code (bwrap).
+ * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
+ * VALE switch.
+ * The main task is to swap the meaning of tx and rx rings to match the
+ * expectations of the VALE switch code (see nm_bdg_flush).
+ *
+ * The bwrap works by interposing a netmap_bwrap_adapter between the
+ * rest of the system and the hwna. The netmap_bwrap_adapter looks like
+ * a netmap_vp_adapter to the rest of the system, but, internally, it
+ * translates all callbacks to what the hwna expects.
+ *
+ * Note that we have to intercept callbacks coming from two sides:
+ *
+ * - callbacks coming from the netmap module are intercepted by
+ * passing around the netmap_bwrap_adapter instead of the hwna
+ *
+ * - callbacks coming from outside of the netmap module only know
+ * about the hwna. This, however, only happens in interrupt
+ * handlers, where only the hwna->nm_notify callback is called.
+ * What the bwrap does is to overwrite the hwna->nm_notify callback
+ * with its own netmap_bwrap_intr_notify.
+ * XXX This assumes that the hwna->nm_notify callback was the
+ * standard netmap_notify(), as is the case for NIC adapters.
+ * Any additional action performed by hwna->nm_notify will not be
+ * performed by netmap_bwrap_intr_notify.
+ *
+ * Additionally, the bwrap can optionally attach the host rings pair
+ * of the wrapped adapter to a different port of the switch.
+ */
+
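
As a concrete illustration of the tx/rx swap described above, netmap_bwrap_attach() further down sizes the bwrap rings from the hwna with the meanings exchanged. The actual assignments are elided by the diff context, so this is a plausible condensed sketch, not the verbatim code:

	/* sketch: the bwrap ring geometry mirrors the hwna, swapped */
	na->num_rx_rings = hwna->num_tx_rings;
	na->num_tx_rings = hwna->num_rx_rings;
	na->num_tx_desc  = hwna->num_rx_desc;
	na->num_rx_desc  = hwna->num_tx_desc;
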
static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
struct netmap_adapter *hwna = bna->hwna;
- struct nm_bridge *b = bna->up.na_bdg,
- *bh = bna->host.na_bdg;
- struct ifnet *ifp = na->ifp;
ND("na %p", na);
+ /* drop reference to hwna->ifp.
+ * If we don't do this, netmap_detach_common(na)
+ * will think it has set NA(na->ifp) to NULL
+ */
+ na->ifp = NULL;
+ /* for safety, also drop the possible reference
+ * in the hostna
+ */
+ bna->host.up.ifp = NULL;
- if (b) {
- netmap_bdg_detach_common(b, bna->up.bdg_port,
- (bh ? bna->host.bdg_port : -1));
- }
-
+ hwna->nm_mem = bna->save_nmd;
hwna->na_private = NULL;
+ hwna->na_vp = hwna->na_hostvp = NULL;
+ hwna->na_flags &= ~NAF_BUSY;
netmap_adapter_put(hwna);
- bzero(ifp, sizeof(*ifp));
- free(ifp, M_DEVBUF);
- na->ifp = NULL;
-
}
@@ -1737,7 +1915,6 @@ netmap_bwrap_dtor(struct netmap_adapter *na)
static int
netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
{
- struct ifnet *ifp = na->ifp;
struct netmap_bwrap_adapter *bna = na->na_private;
struct netmap_vp_adapter *hostna = &bna->host;
struct netmap_kring *kring, *bkring;
@@ -1747,20 +1924,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
int error = 0;
if (netmap_verbose)
- D("%s %s%d 0x%x", NM_IFPNAME(ifp),
+ D("%s %s%d 0x%x", na->name,
(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
if (flags & NAF_DISABLE_NOTIFY) {
- kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
- bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
- if (kring[ring_nr].nkr_stopped)
- netmap_disable_ring(&bkring[ring_nr]);
- else
- bkring[ring_nr].nkr_stopped = 0;
+ /* the enabled/disabled state of the ring has changed,
+ * propagate the info to the wrapper (with tx/rx swapped)
+ */
+ if (tx == NR_TX) {
+ netmap_set_rxring(&vpna->up, ring_nr,
+ na->tx_rings[ring_nr].nkr_stopped);
+ } else {
+ netmap_set_txring(&vpna->up, ring_nr,
+ na->rx_rings[ring_nr].nkr_stopped);
+ }
return 0;
}
- if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
+ if (!nm_netmap_on(na))
return 0;
/* we only care about receive interrupts */
@@ -1786,7 +1967,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
* the info from the rx kring.
*/
if (netmap_verbose)
- D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
+ D("%s head %d cur %d tail %d (kring %d %d %d)", na->name,
ring->head, ring->cur, ring->tail,
kring->rhead, kring->rcur, kring->rtail);
@@ -1807,7 +1988,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
goto put_out;
if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
D("how strange, interrupt with no packets on %s",
- NM_IFPNAME(ifp));
+ na->name);
goto put_out;
}
@@ -1823,7 +2004,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx,
/* pass packets to the switch */
nm_txsync_prologue(bkring); // XXX error checking ?
- netmap_vp_txsync(vpna, ring_nr, flags);
+ netmap_vp_txsync(bkring, flags);
/* mark all buffers as released on this ring */
ring->head = ring->cur = kring->nr_hwtail;
@@ -1845,6 +2026,7 @@ put_out:
}
+/* nm_register callback for bwrap */
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
@@ -1854,22 +2036,35 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
struct netmap_vp_adapter *hostna = &bna->host;
int error;
- ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
+ ND("%s %s", na->name, onoff ? "on" : "off");
if (onoff) {
int i;
+ /* netmap_do_regif has been called on the bwrap na.
+ * We need to pass the information about the
+ * memory allocator down to the hwna before
+ * putting it in netmap mode
+ */
hwna->na_lut = na->na_lut;
hwna->na_lut_objtotal = na->na_lut_objtotal;
+ hwna->na_lut_objsize = na->na_lut_objsize;
if (hostna->na_bdg) {
+ /* if the host rings have been attached to switch,
+ * we need to copy the memory allocator information
+ * in the hostna also
+ */
hostna->up.na_lut = na->na_lut;
hostna->up.na_lut_objtotal = na->na_lut_objtotal;
+ hostna->up.na_lut_objsize = na->na_lut_objsize;
}
/* cross-link the netmap rings
* The original number of rings comes from hwna,
* rx rings on one side equals tx rings on the other.
+ * We need to do this now, after the initialization
+ * of the kring->ring pointers
*/
for (i = 0; i < na->num_rx_rings + 1; i++) {
hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
@@ -1881,27 +2076,31 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff)
}
}
- if (hwna->ifp) {
- error = hwna->nm_register(hwna, onoff);
- if (error)
- return error;
- }
+ /* forward the request to the hwna */
+ error = hwna->nm_register(hwna, onoff);
+ if (error)
+ return error;
- bdg_netmap_reg(na, onoff);
+ /* impersonate a netmap_vp_adapter */
+ netmap_vp_reg(na, onoff);
+ if (hostna->na_bdg)
+ netmap_vp_reg(&hostna->up, onoff);
if (onoff) {
+		/* intercept the hwna nm_notify callback */
bna->save_notify = hwna->nm_notify;
hwna->nm_notify = netmap_bwrap_intr_notify;
} else {
hwna->nm_notify = bna->save_notify;
hwna->na_lut = NULL;
hwna->na_lut_objtotal = 0;
+ hwna->na_lut_objsize = 0;
}
return 0;
}
-
+/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
u_int *rxr, u_int *rxd)
@@ -1922,6 +2121,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
}
+/* nm_krings_create callback for bwrap */
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
@@ -1931,21 +2131,33 @@ netmap_bwrap_krings_create(struct netmap_adapter *na)
struct netmap_adapter *hostna = &bna->host.up;
int error;
- ND("%s", NM_IFPNAME(na->ifp));
+ ND("%s", na->name);
+ /* impersonate a netmap_vp_adapter */
error = netmap_vp_krings_create(na);
if (error)
return error;
+ /* also create the hwna krings */
error = hwna->nm_krings_create(hwna);
if (error) {
netmap_vp_krings_delete(na);
return error;
}
+	/* the connection between the bwrap krings and the hwna krings
+	 * will be established later, in the nm_register callback, since
+	 * at this point the kring->ring pointers have not been initialized yet
+ */
if (na->na_flags & NAF_HOST_RINGS) {
+ /* the hostna rings are the host rings of the bwrap.
+ * The corresponding krings must point back to the
+ * hostna
+ */
hostna->tx_rings = na->tx_rings + na->num_tx_rings;
+ hostna->tx_rings[0].na = hostna;
hostna->rx_rings = na->rx_rings + na->num_rx_rings;
+ hostna->rx_rings[0].na = hostna;
}
return 0;
@@ -1959,7 +2171,7 @@ netmap_bwrap_krings_delete(struct netmap_adapter *na)
(struct netmap_bwrap_adapter *)na;
struct netmap_adapter *hwna = bna->hwna;
- ND("%s", NM_IFPNAME(na->ifp));
+ ND("%s", na->name);
hwna->nm_krings_delete(hwna);
netmap_vp_krings_delete(na);
@@ -1986,13 +2198,13 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
ring = kring->ring;
lim = kring->nkr_num_slots - 1;
- if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
+ if (!nm_netmap_on(hwna))
return 0;
mtx_lock(&kring->q_lock);
/* first step: simulate a user wakeup on the rx ring */
- netmap_vp_rxsync(na, ring_n, flags);
+ netmap_vp_rxsync_locked(kring, flags);
ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
- NM_IFPNAME(na->ifp), ring_n,
+ na->name, ring_n,
kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
ring->head, ring->cur, ring->tail,
hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail);
@@ -2013,9 +2225,9 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
ring->tail = kring->rtail; /* restore saved value of tail, for safety */
/* fifth step: the user goes to sleep again, causing another rxsync */
- netmap_vp_rxsync(na, ring_n, flags);
+ netmap_vp_rxsync_locked(kring, flags);
ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
- NM_IFPNAME(na->ifp), ring_n,
+ na->name, ring_n,
kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
ring->head, ring->cur, ring->tail,
hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
@@ -2024,6 +2236,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f
}
+/* notify method for the bridge-->host-rings path */
static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
@@ -2035,23 +2248,95 @@ netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx,
}
-/* attach a bridge wrapper to the 'real' device */
+/* nm_bdg_ctl callback for the bwrap.
+ * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
+ * On attach, it needs to provide a fake netmap_priv_d structure and
+ * perform a netmap_do_regif() on the bwrap. This will put both the
+ * bwrap and the hwna in netmap mode, with the netmap rings shared
+ * and cross-linked. Moreover, it will start intercepting interrupts
+ * directed to hwna.
+ */
static int
-netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
+netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
+{
+ struct netmap_priv_d *npriv;
+ struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
+ struct netmap_if *nifp;
+ int error = 0;
+
+ if (attach) {
+ if (NETMAP_OWNED_BY_ANY(na)) {
+ return EBUSY;
+ }
+ if (bna->na_kpriv) {
+ /* nothing to do */
+ return 0;
+ }
+ npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (npriv == NULL)
+ return ENOMEM;
+ nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
+ if (!nifp) {
+ bzero(npriv, sizeof(*npriv));
+ free(npriv, M_DEVBUF);
+ return error;
+ }
+ bna->na_kpriv = npriv;
+ na->na_flags |= NAF_BUSY;
+ } else {
+ int last_instance;
+
+ if (na->active_fds == 0) /* not registered */
+ return EINVAL;
+ last_instance = netmap_dtor_locked(bna->na_kpriv);
+ if (!last_instance) {
+ D("--- error, trying to detach an entry with active mmaps");
+ error = EINVAL;
+ } else {
+ struct nm_bridge *b = bna->up.na_bdg,
+ *bh = bna->host.na_bdg;
+ npriv = bna->na_kpriv;
+ bna->na_kpriv = NULL;
+ D("deleting priv");
+
+ bzero(npriv, sizeof(*npriv));
+ free(npriv, M_DEVBUF);
+ if (b) {
+ /* XXX the bwrap dtor should take care
+ * of this (2014-06-16)
+ */
+ netmap_bdg_detach_common(b, bna->up.bdg_port,
+ (bh ? bna->host.bdg_port : -1));
+ }
+ na->na_flags &= ~NAF_BUSY;
+ }
+ }
+ return error;
+
+}
+
+/* attach a bridge wrapper to the 'real' device */
+int
+netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
struct netmap_bwrap_adapter *bna;
- struct netmap_adapter *na;
- struct netmap_adapter *hwna = NA(real);
- struct netmap_adapter *hostna;
- int error;
+ struct netmap_adapter *na = NULL;
+ struct netmap_adapter *hostna = NULL;
+ int error = 0;
+ /* make sure the NIC is not already in use */
+ if (NETMAP_OWNED_BY_ANY(hwna)) {
+ D("NIC %s busy, cannot attach to bridge", hwna->name);
+ return EBUSY;
+ }
bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (bna == NULL)
+ if (bna == NULL) {
return ENOMEM;
+ }
na = &bna->up.up;
- na->ifp = fake;
+ strncpy(na->name, nr_name, sizeof(na->name));
/* fill the ring data for the bwrap adapter with rx/tx meanings
* swapped. The real cross-linking will be done during register,
* when all the krings will have been created.
@@ -2068,17 +2353,28 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
na->nm_krings_create = netmap_bwrap_krings_create;
na->nm_krings_delete = netmap_bwrap_krings_delete;
na->nm_notify = netmap_bwrap_notify;
- na->nm_mem = hwna->nm_mem;
- na->na_private = na; /* prevent NIOCREGIF */
+ na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
+ na->pdev = hwna->pdev;
+ na->nm_mem = netmap_mem_private_new(na->name,
+ na->num_tx_rings, na->num_tx_desc,
+ na->num_rx_rings, na->num_rx_desc,
+ 0, 0, &error);
+ na->na_flags |= NAF_MEM_OWNER;
+ if (na->nm_mem == NULL)
+ goto err_put;
bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
bna->hwna = hwna;
netmap_adapter_get(hwna);
hwna->na_private = bna; /* weak reference */
-
+ hwna->na_vp = &bna->up;
+
if (hwna->na_flags & NAF_HOST_RINGS) {
+ if (hwna->na_flags & NAF_SW_ONLY)
+ na->na_flags |= NAF_SW_ONLY;
na->na_flags |= NAF_HOST_RINGS;
hostna = &bna->host.up;
+ snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
hostna->ifp = hwna->ifp;
hostna->num_tx_rings = 1;
hostna->num_tx_desc = hwna->num_rx_desc;
@@ -2089,20 +2385,44 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
hostna->nm_notify = netmap_bwrap_host_notify;
hostna->nm_mem = na->nm_mem;
hostna->na_private = bna;
+ hostna->na_vp = &bna->up;
+ na->na_hostvp = hwna->na_hostvp =
+ hostna->na_hostvp = &bna->host;
+ hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
}
ND("%s<->%s txr %d txd %d rxr %d rxd %d",
- fake->if_xname, real->if_xname,
+		na->name, hwna->name,
na->num_tx_rings, na->num_tx_desc,
na->num_rx_rings, na->num_rx_desc);
error = netmap_attach_common(na);
if (error) {
- netmap_adapter_put(hwna);
- free(bna, M_DEVBUF);
- return error;
+ goto err_free;
}
+ /* make bwrap ifp point to the real ifp
+ * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
+ * as a request to make the ifp point to the na. Since we
+ * do not want to change the na already pointed to by hwna->ifp,
+ * the following assignment has to be delayed until now
+ */
+ na->ifp = hwna->ifp;
+ hwna->na_flags |= NAF_BUSY;
+ /* make hwna point to the allocator we are actually using,
+ * so that monitors will be able to find it
+ */
+ bna->save_nmd = hwna->nm_mem;
+ hwna->nm_mem = na->nm_mem;
return 0;
+
+err_free:
+ netmap_mem_private_delete(na->nm_mem);
+err_put:
+ hwna->na_vp = hwna->na_hostvp = NULL;
+ netmap_adapter_put(hwna);
+ free(bna, M_DEVBUF);
+ return error;
+
}