author    | luigi <luigi@FreeBSD.org> | 2014-08-20 23:34:36 +0000
committer | luigi <luigi@FreeBSD.org> | 2014-08-20 23:34:36 +0000
commit    | 223d76dc5012ea77078296847800a3d6181c61e2 (patch)
tree      | d5d5263ca0c34de806d5e9e07b0b85eab96545f9
parent    | b63e85f63f1ee972ee2221c84e26cc35597b38f7 (diff)
MFC 270063: update of netmap code
(vtnet and cxgbe not merged yet because we need some other MFC first)
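The API change that drives most of the per-driver hunks below: the nm_txsync/nm_rxsync callbacks now take the netmap_kring directly instead of a (netmap_adapter, ring index) pair, so the adapter and ring number are recovered from the kring itself. A minimal sketch of the new callback shape for a hypothetical driver "foo" (all names other than the netmap ones are illustrative, not from the patch):

	static int
	foo_netmap_txsync(struct netmap_kring *kring, int flags)
	{
		struct netmap_adapter *na = kring->na;  /* owning adapter */
		u_int ring_nr = kring->ring_id;         /* was an explicit argument */
		struct ifnet *ifp = na->ifp;

		/* driver state is now indexed via kring->ring_id, e.g.
		 * &adapter->tx_rings[ring_nr] in the e1000/ixgbe hunks below. */
		(void)ifp; (void)ring_nr; (void)flags;
		return (0);
	}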
-rw-r--r-- | sys/conf/files                       |   1
-rw-r--r-- | sys/dev/e1000/if_em.c                |   8
-rw-r--r-- | sys/dev/e1000/if_igb.c               |   6
-rw-r--r-- | sys/dev/e1000/if_lem.c               | 246
-rw-r--r-- | sys/dev/ixgbe/ixgbe.c                |   6
-rw-r--r-- | sys/dev/netmap/if_em_netmap.h        |  26
-rw-r--r-- | sys/dev/netmap/if_igb_netmap.h       |  26
-rw-r--r-- | sys/dev/netmap/if_lem_netmap.h       | 208
-rw-r--r-- | sys/dev/netmap/if_re_netmap.h        |  40
-rw-r--r-- | sys/dev/netmap/if_vtnet_netmap.h     | 434
-rw-r--r-- | sys/dev/netmap/ixgbe_netmap.h        |  26
-rw-r--r-- | sys/dev/netmap/netmap.c              | 683
-rw-r--r-- | sys/dev/netmap/netmap_freebsd.c      | 149
-rw-r--r-- | sys/dev/netmap/netmap_generic.c      | 122
-rw-r--r-- | sys/dev/netmap/netmap_kern.h         | 416
-rw-r--r-- | sys/dev/netmap/netmap_mbq.h          |   1
-rw-r--r-- | sys/dev/netmap/netmap_mem2.c         | 251
-rw-r--r-- | sys/dev/netmap/netmap_mem2.h         |  98
-rw-r--r-- | sys/dev/netmap/netmap_monitor.c      | 498
-rw-r--r-- | sys/dev/netmap/netmap_offloadings.c  |   6
-rw-r--r-- | sys/dev/netmap/netmap_pipe.c         |  53
-rw-r--r-- | sys/dev/netmap/netmap_vale.c         | 814
-rw-r--r-- | tools/tools/netmap/pkt-gen.c         | 141
-rw-r--r-- | tools/tools/netmap/vale-ctl.c        |  71

24 files changed, 3498 insertions, 832 deletions
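A second pervasive change visible in the hunks below: buffer lookups are now scoped to the adapter, since different adapters may draw from different memory regions. PNMB()/NMB() gain a netmap_adapter argument, and the old netmap_buffer_base / NETMAP_BUF_SIZE globals become the per-adapter NETMAP_BUF_BASE(na) / NETMAP_BUF_SIZE(na). A sketch of the resulting validation idiom (the helper name is hypothetical):

	static inline void *
	foo_slot_vaddr(struct netmap_adapter *na, struct netmap_slot *slot,
	    uint64_t *paddr)
	{
		void *addr = PNMB(na, slot, paddr);  /* was PNMB(slot, paddr) */

		if (addr == NETMAP_BUF_BASE(na))     /* bad buffer index */
			return (NULL);               /* caller reinits the ring */
		return (addr);
	}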
diff --git a/sys/conf/files b/sys/conf/files index beea185..ad2e11c 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1933,6 +1933,7 @@ dev/netmap/netmap_freebsd.c optional netmap dev/netmap/netmap_generic.c optional netmap dev/netmap/netmap_mbq.c optional netmap dev/netmap/netmap_mem2.c optional netmap +dev/netmap/netmap_monitor.c optional netmap dev/netmap/netmap_offloadings.c optional netmap dev/netmap/netmap_pipe.c optional netmap dev/netmap/netmap_vale.c optional netmap diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index cc8b34e..20321d0 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -3389,10 +3389,10 @@ em_setup_transmit_ring(struct tx_ring *txr) uint64_t paddr; void *addr; - addr = PNMB(slot + si, &paddr); + addr = PNMB(na, slot + si, &paddr); txr->tx_base[i].buffer_addr = htole64(paddr); /* reload the map for netmap mode */ - netmap_load_map(txr->txtag, txbuf->map, addr); + netmap_load_map(na, txr->txtag, txbuf->map, addr); } #endif /* DEV_NETMAP */ @@ -4131,8 +4131,8 @@ em_setup_receive_ring(struct rx_ring *rxr) uint64_t paddr; void *addr; - addr = PNMB(slot + si, &paddr); - netmap_load_map(rxr->rxtag, rxbuf->map, addr); + addr = PNMB(na, slot + si, &paddr); + netmap_load_map(na, rxr->rxtag, rxbuf->map, addr); /* Update descriptor */ rxr->rx_base[j].buffer_addr = htole64(paddr); continue; diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 15d71ce..484cba1 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -3531,7 +3531,7 @@ igb_setup_transmit_ring(struct tx_ring *txr) if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); /* no need to set the address */ - netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si)); + netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si)); } #endif /* DEV_NETMAP */ /* clear the watch index */ @@ -4335,8 +4335,8 @@ igb_setup_receive_ring(struct rx_ring *rxr) uint64_t paddr; void *addr; - addr = PNMB(slot + sj, &paddr); - netmap_load_map(rxr->ptag, rxbuf->pmap, addr); + addr = PNMB(na, slot + sj, &paddr); + netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr); /* Update descriptor */ rxr->rx_base[j].read.pkt_addr = htole64(paddr); continue; diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index bc25e18..04a984b 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -32,6 +32,15 @@ ******************************************************************************/ /*$FreeBSD$*/ +/* + * Uncomment the following extensions for better performance in a VM, + * especially if you have support in the hypervisor. + * See http://info.iet.unipi.it/~luigi/netmap/ + */ +// #define BATCH_DISPATCH +// #define NIC_SEND_COMBINING +// #define NIC_PARAVIRT /* enable virtio-like synchronization */ + #include "opt_inet.h" #include "opt_inet6.h" @@ -289,6 +298,10 @@ static int lem_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV); static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR); static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV); static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV); +/* + * increase lem_rxd and lem_txd to at least 2048 in netmap mode + * for better performance. 
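The if_em/if_igb setup hunks above all share one init-time pattern: when netmap_reset() has returned a slot array, each NIC descriptor is pointed at the corresponding netmap buffer and its DMA map is reloaded. Condensed here for reference (variable names as in the if_em.c hunk):

	#ifdef DEV_NETMAP
		if (slot) {
			/* translate netmap ring index to kernel ring index */
			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
			uint64_t paddr;
			void *addr = PNMB(na, slot + si, &paddr);

			txr->tx_base[i].buffer_addr = htole64(paddr);
			/* reload the map so DMA targets the netmap buffer */
			netmap_load_map(na, txr->txtag, txbuf->map, addr);
		}
	#endif /* DEV_NETMAP */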
+ */ static int lem_rxd = EM_DEFAULT_RXD; static int lem_txd = EM_DEFAULT_TXD; static int lem_smart_pwr_down = FALSE; @@ -458,6 +471,20 @@ lem_attach(device_t dev) "max number of rx packets to process", &adapter->rx_process_limit, lem_rx_process_limit); +#ifdef NIC_SEND_COMBINING + /* Sysctls to control mitigation */ + lem_add_rx_process_limit(adapter, "sc_enable", + "driver TDT mitigation", &adapter->sc_enable, 0); +#endif /* NIC_SEND_COMBINING */ +#ifdef BATCH_DISPATCH + lem_add_rx_process_limit(adapter, "batch_enable", + "driver rx batch", &adapter->batch_enable, 0); +#endif /* BATCH_DISPATCH */ +#ifdef NIC_PARAVIRT + lem_add_rx_process_limit(adapter, "rx_retries", + "driver rx retries", &adapter->rx_retries, 0); +#endif /* NIC_PARAVIRT */ + /* Sysctl for setting the interface flow control */ lem_set_flow_cntrl(adapter, "flow_control", "flow control setting", @@ -515,6 +542,49 @@ lem_attach(device_t dev) */ adapter->hw.mac.report_tx_early = 1; +#ifdef NIC_PARAVIRT + device_printf(dev, "driver supports paravirt, subdev 0x%x\n", + adapter->hw.subsystem_device_id); + if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) { + uint64_t bus_addr; + + device_printf(dev, "paravirt support on dev %p\n", adapter); + tsize = 4096; // XXX one page for the csb + if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) { + device_printf(dev, "Unable to allocate csb memory\n"); + error = ENOMEM; + goto err_csb; + } + /* Setup the Base of the CSB */ + adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr; + /* force the first kick */ + adapter->csb->host_need_txkick = 1; /* txring empty */ + adapter->csb->guest_need_rxkick = 1; /* no rx packets */ + bus_addr = adapter->csb_mem.dma_paddr; + lem_add_rx_process_limit(adapter, "csb_on", + "enable paravirt.", &adapter->csb->guest_csb_on, 0); + lem_add_rx_process_limit(adapter, "txc_lim", + "txc_lim", &adapter->csb->host_txcycles_lim, 1); + + /* some stats */ +#define PA_SC(name, var, val) \ + lem_add_rx_process_limit(adapter, name, name, var, val) + PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1); + PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0); + PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0); + PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1); + PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0); + PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0); + PA_SC("tdt_int_count",&adapter->tdt_int_count, 0); + PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0); + /* tell the host where the block is */ + E1000_WRITE_REG(&adapter->hw, E1000_CSBAH, + (u32)(bus_addr >> 32)); + E1000_WRITE_REG(&adapter->hw, E1000_CSBAL, + (u32)bus_addr); + } +#endif /* NIC_PARAVIRT */ + tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc), EM_DBA_ALIGN); @@ -673,6 +743,11 @@ err_hw_init: err_rx_desc: lem_dma_free(adapter, &adapter->txdma); err_tx_desc: +#ifdef NIC_PARAVIRT + lem_dma_free(adapter, &adapter->csb_mem); +err_csb: +#endif /* NIC_PARAVIRT */ + err_pci: if (adapter->ifp != NULL) if_free(adapter->ifp); @@ -760,6 +835,12 @@ lem_detach(device_t dev) adapter->rx_desc_base = NULL; } +#ifdef NIC_PARAVIRT + if (adapter->csb) { + lem_dma_free(adapter, &adapter->csb_mem); + adapter->csb = NULL; + } +#endif /* NIC_PARAVIRT */ lem_release_hw_control(adapter); free(adapter->mta, M_DEVBUF); EM_TX_LOCK_DESTROY(adapter); @@ -869,6 +950,16 @@ lem_start_locked(struct ifnet *ifp) } if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD) ifp->if_drv_flags |= IFF_DRV_OACTIVE; +#ifdef 
NIC_PARAVIRT + if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb && + adapter->csb->guest_csb_on && + !(adapter->csb->guest_need_txkick & 1)) { + adapter->csb->guest_need_txkick = 1; + adapter->guest_need_kick_count++; + // XXX memory barrier + lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE + } +#endif /* NIC_PARAVIRT */ return; } @@ -1715,6 +1806,37 @@ lem_xmit(struct adapter *adapter, struct mbuf **m_headp) */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + +#ifdef NIC_PARAVIRT + if (adapter->csb) { + adapter->csb->guest_tdt = i; + /* XXX memory barrier ? */ + if (adapter->csb->guest_csb_on && + !(adapter->csb->host_need_txkick & 1)) { + /* XXX maybe useless + * clean the ring. maybe do it before ? + * maybe a little bit of histeresys ? + */ + if (adapter->num_tx_desc_avail <= 64) {// XXX + lem_txeof(adapter); + } + return (0); + } + } +#endif /* NIC_PARAVIRT */ + +#ifdef NIC_SEND_COMBINING + if (adapter->sc_enable) { + if (adapter->shadow_tdt & MIT_PENDING_INT) { + /* signal intr and data pending */ + adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff); + return (0); + } else { + adapter->shadow_tdt = MIT_PENDING_INT; + } + } +#endif /* NIC_SEND_COMBINING */ + if (adapter->hw.mac.type == e1000_82547 && adapter->link_duplex == HALF_DUPLEX) lem_82547_move_tail(adapter); @@ -1995,6 +2117,20 @@ lem_local_timer(void *arg) lem_smartspeed(adapter); +#ifdef NIC_PARAVIRT + /* recover space if needed */ + if (adapter->csb && adapter->csb->guest_csb_on && + (adapter->watchdog_check == TRUE) && + (ticks - adapter->watchdog_time > EM_WATCHDOG) && + (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) { + lem_txeof(adapter); + /* + * lem_txeof() normally (except when space in the queue + * runs low XXX) cleans watchdog_check so that + * we do not hung. + */ + } +#endif /* NIC_PARAVIRT */ /* * We check the watchdog: the time since * the last TX descriptor was cleaned. @@ -2677,10 +2813,10 @@ lem_setup_transmit_structures(struct adapter *adapter) uint64_t paddr; void *addr; - addr = PNMB(slot + si, &paddr); + addr = PNMB(na, slot + si, &paddr); adapter->tx_desc_base[i].buffer_addr = htole64(paddr); /* reload the map for netmap mode */ - netmap_load_map(adapter->txtag, tx_buffer->map, addr); + netmap_load_map(na, adapter->txtag, tx_buffer->map, addr); } #endif /* DEV_NETMAP */ tx_buffer->next_eop = -1; @@ -3055,6 +3191,16 @@ lem_txeof(struct adapter *adapter) adapter->next_tx_to_clean = first; adapter->num_tx_desc_avail = num_avail; +#ifdef NIC_SEND_COMBINING + if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) { + /* a tdt write is pending, do it */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), + 0xffff & adapter->shadow_tdt); + adapter->shadow_tdt = MIT_PENDING_INT; + } else { + adapter->shadow_tdt = 0; // disable + } +#endif /* NIC_SEND_COMBINING */ /* * If we have enough room, clear IFF_DRV_OACTIVE to * tell the stack that it is OK to send packets. @@ -3062,6 +3208,12 @@ lem_txeof(struct adapter *adapter) */ if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; +#ifdef NIC_PARAVIRT + if (adapter->csb) { // XXX also csb_on ? 
+ adapter->csb->guest_need_txkick = 2; /* acked */ + // XXX memory barrier + } +#endif /* NIC_PARAVIRT */ if (adapter->num_tx_desc_avail == adapter->num_tx_desc) { adapter->watchdog_check = FALSE; return; @@ -3247,8 +3399,8 @@ lem_setup_receive_structures(struct adapter *adapter) uint64_t paddr; void *addr; - addr = PNMB(slot + si, &paddr); - netmap_load_map(adapter->rxtag, rx_buffer->map, addr); + addr = PNMB(na, slot + si, &paddr); + netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr); /* Update descriptor */ adapter->rx_desc_base[i].buffer_addr = htole64(paddr); continue; @@ -3445,7 +3597,23 @@ lem_rxeof(struct adapter *adapter, int count, int *done) int i, rx_sent = 0; struct e1000_rx_desc *current_desc; +#ifdef BATCH_DISPATCH + struct mbuf *mh = NULL, *mt = NULL; +#endif /* BATCH_DISPATCH */ +#ifdef NIC_PARAVIRT + int retries = 0; + struct paravirt_csb* csb = adapter->csb; + int csb_mode = csb && csb->guest_csb_on; + + //ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check); + if (csb_mode && csb->guest_need_rxkick) + csb->guest_need_rxkick = 0; +#endif /* NIC_PARAVIRT */ EM_RX_LOCK(adapter); + +#ifdef BATCH_DISPATCH + batch_again: +#endif /* BATCH_DISPATCH */ i = adapter->next_rx_desc_to_check; current_desc = &adapter->rx_desc_base[i]; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, @@ -3458,19 +3626,45 @@ lem_rxeof(struct adapter *adapter, int count, int *done) } #endif /* DEV_NETMAP */ +#if 1 // XXX optimization ? if (!((current_desc->status) & E1000_RXD_STAT_DD)) { if (done != NULL) *done = rx_sent; EM_RX_UNLOCK(adapter); return (FALSE); } +#endif /* 0 */ while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) { struct mbuf *m = NULL; status = current_desc->status; - if ((status & E1000_RXD_STAT_DD) == 0) + if ((status & E1000_RXD_STAT_DD) == 0) { +#ifdef NIC_PARAVIRT + if (csb_mode) { + /* buffer not ready yet. Retry a few times before giving up */ + if (++retries <= adapter->rx_retries) { + continue; + } + if (csb->guest_need_rxkick == 0) { + // ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check); + csb->guest_need_rxkick = 1; + // XXX memory barrier, status volatile ? + continue; /* double check */ + } + } + /* no buffer ready, give up */ +#endif /* NIC_PARAVIRT */ break; + } +#ifdef NIC_PARAVIRT + if (csb_mode) { + if (csb->guest_need_rxkick) + // ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check); + csb->guest_need_rxkick = 0; + retries = 0; + } +#endif /* NIC_PARAVIRT */ mp = adapter->rx_buffer_area[i].m_head; /* @@ -3595,11 +3789,36 @@ discard: bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); +#ifdef NIC_PARAVIRT + if (csb_mode) { + /* the buffer at i has been already replaced by lem_get_buf() + * so it is safe to set guest_rdt = i and possibly send a kick. + * XXX see if we can optimize it later. + */ + csb->guest_rdt = i; + // XXX memory barrier + if (i == csb->host_rxkick_at) + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i); + } +#endif /* NIC_PARAVIRT */ /* Advance our pointers to the next descriptor. 
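The NIC_PARAVIRT RDT handling just above reads naturally as a small helper (a hypothetical wrapper, field names from the patch): the guest publishes its ring tail in the shared CSB and pays for the register write (a VM exit) only when the host asked to be kicked at exactly this index.

	static inline void
	lem_pv_update_rdt(struct adapter *adapter, uint32_t i)
	{
		struct paravirt_csb *csb = adapter->csb;

		csb->guest_rdt = i;             /* visible to the host */
		/* XXX the patch notes a memory barrier may be needed here */
		if (i == csb->host_rxkick_at)   /* host requested a kick here */
			E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
	}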
*/ if (++i == adapter->num_rx_desc) i = 0; /* Call into the stack */ if (m != NULL) { +#ifdef BATCH_DISPATCH + if (adapter->batch_enable) { + if (mh == NULL) + mh = mt = m; + else + mt->m_nextpkt = m; + mt = m; + m->m_nextpkt = NULL; + rx_sent++; + current_desc = &adapter->rx_desc_base[i]; + continue; + } +#endif /* BATCH_DISPATCH */ adapter->next_rx_desc_to_check = i; EM_RX_UNLOCK(adapter); (*ifp->if_input)(ifp, m); @@ -3610,10 +3829,27 @@ discard: current_desc = &adapter->rx_desc_base[i]; } adapter->next_rx_desc_to_check = i; +#ifdef BATCH_DISPATCH + if (mh) { + EM_RX_UNLOCK(adapter); + while ( (mt = mh) != NULL) { + mh = mh->m_nextpkt; + mt->m_nextpkt = NULL; + if_input(ifp, mt); + } + EM_RX_LOCK(adapter); + i = adapter->next_rx_desc_to_check; /* in case of interrupts */ + if (count > 0) + goto batch_again; + } +#endif /* BATCH_DISPATCH */ /* Advance the E1000's Receive Queue #0 "Tail Pointer". */ if (--i < 0) i = adapter->num_rx_desc - 1; +#ifdef NIC_PARAVIRT + if (!csb_mode) /* filter out writes */ +#endif /* NIC_PARAVIRT */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i); if (done != NULL) *done = rx_sent; diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c index c27440a..75ab2eb 100644 --- a/sys/dev/ixgbe/ixgbe.c +++ b/sys/dev/ixgbe/ixgbe.c @@ -3079,7 +3079,7 @@ ixgbe_setup_transmit_ring(struct tx_ring *txr) */ if (slot) { int si = netmap_idx_n2k(&na->tx_rings[txr->me], i); - netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si)); + netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si)); } #endif /* DEV_NETMAP */ /* Clear the EOP descriptor pointer */ @@ -4025,8 +4025,8 @@ ixgbe_setup_receive_ring(struct rx_ring *rxr) uint64_t paddr; void *addr; - addr = PNMB(slot + sj, &paddr); - netmap_load_map(rxr->ptag, rxbuf->pmap, addr); + addr = PNMB(na, slot + sj, &paddr); + netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr); /* Update descriptor and the cached value */ rxr->rx_base[j].read.pkt_addr = htole64(paddr); rxbuf->addr = htole64(paddr); diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index 17b4c4f..15e9be5 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -113,10 +113,10 @@ em_netmap_reg(struct netmap_adapter *na, int onoff) * Reconcile kernel and user view of the transmit ring. */ static int -em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +em_netmap_txsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -128,7 +128,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct tx_ring *txr = &adapter->tx_rings[kring->ring_id]; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -144,7 +144,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); /* device-specific */ struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; @@ -153,12 +153,12 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i == 0 || nic_i == report_frequency) ? 
E1000_TXD_CMD_RS : 0; - NM_CHECK_ADDR_LEN(addr, len); + NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ - netmap_reload_map(txr->txtag, txbuf->map, addr); + netmap_reload_map(na, txr->txtag, txbuf->map, addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); @@ -187,7 +187,7 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; @@ -208,10 +208,10 @@ em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. */ static int -em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +em_netmap_rxsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -222,7 +222,7 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id]; if (head > lim) return netmap_ring_reinit(kring); @@ -271,18 +271,18 @@ em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i]; - if (addr == netmap_buffer_base) /* bad buf */ + if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ curr->buffer_addr = htole64(paddr); - netmap_reload_map(rxr->rxtag, rxbuf->map, addr); + netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index e1929f0..c738460 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -81,10 +81,10 @@ igb_netmap_reg(struct netmap_adapter *na, int onoff) * Reconcile kernel and user view of the transmit ring. */ static int -igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +igb_netmap_txsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -96,7 +96,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct tx_ring *txr = &adapter->tx_rings[kring->ring_id]; /* 82575 needs the queue index added */ u32 olinfo_status = (adapter->hw.mac.type == e1000_82575) ? 
(txr->me << 4) : 0; @@ -115,7 +115,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); /* device-specific */ union e1000_adv_tx_desc *curr = @@ -125,11 +125,11 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i == 0 || nic_i == report_frequency) ? E1000_ADVTXD_DCMD_RS : 0; - NM_CHECK_ADDR_LEN(addr, len); + NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(txr->txtag, txbuf->map, addr); + netmap_reload_map(na, txr->txtag, txbuf->map, addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); @@ -171,7 +171,7 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) */ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; @@ -190,10 +190,10 @@ igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. */ static int -igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +igb_netmap_rxsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -204,7 +204,7 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id]; if (head > lim) return netmap_ring_reinit(kring); @@ -251,17 +251,17 @@ igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; - if (addr == netmap_buffer_base) /* bad buf */ + if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); + netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->wb.upper.status_error = 0; diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 4fce5c9..272f02c 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -39,6 +39,7 @@ #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> +extern int netmap_adaptive_io; /* * Register/unregister. We are already under netmap lock. @@ -84,10 +85,10 @@ lem_netmap_reg(struct netmap_adapter *na, int onoff) * Reconcile kernel and user view of the transmit ring. 
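The BATCH_DISPATCH path in lem_rxeof() above chains received mbufs through m_nextpkt under the RX lock and hands the whole batch to the stack with the lock released. Condensed into two hypothetical helpers (not in the patch) to make the pattern explicit:

	static inline void
	lem_batch_add(struct mbuf **mh, struct mbuf **mt, struct mbuf *m)
	{
		m->m_nextpkt = NULL;
		if (*mh == NULL)
			*mh = m;               /* first packet of the batch */
		else
			(*mt)->m_nextpkt = m;  /* append to the tail */
		*mt = m;
	}

	static inline void
	lem_batch_flush(struct ifnet *ifp, struct mbuf *mh)
	{
		struct mbuf *m;

		/* called with the rx lock released */
		while ((m = mh) != NULL) {
			mh = m->m_nextpkt;
			m->m_nextpkt = NULL;
			(*ifp->if_input)(ifp, m);
		}
	}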
*/ static int -lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +lem_netmap_txsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -98,6 +99,10 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; +#ifdef NIC_PARAVIRT + struct paravirt_csb *csb = adapter->csb; + uint64_t *csbd = (uint64_t *)(csb + 1); +#endif /* NIC_PARAVIRT */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); @@ -108,12 +113,25 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = kring->nr_hwcur; if (nm_i != head) { /* we have new packets to send */ +#ifdef NIC_PARAVIRT + int do_kick = 0; + uint64_t t = 0; // timestamp + int n = head - nm_i; + if (n < 0) + n += lim + 1; + if (csb) { + t = rdtsc(); /* last timestamp */ + csbd[16] += t - csbd[0]; /* total Wg */ + csbd[17] += n; /* Wg count */ + csbd[0] = t; + } +#endif /* NIC_PARAVIRT */ nic_i = netmap_idx_k2n(kring, nm_i); while (nm_i != head) { struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); /* device-specific */ struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i]; @@ -122,12 +140,12 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i == 0 || nic_i == report_frequency) ? E1000_TXD_CMD_RS : 0; - NM_CHECK_ADDR_LEN(addr, len); + NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ curr->buffer_addr = htole64(paddr); - netmap_reload_map(adapter->txtag, txbuf->map, addr); + netmap_reload_map(na, adapter->txtag, txbuf->map, addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); @@ -140,6 +158,7 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); + // XXX might try an early kick } kring->nr_hwcur = head; @@ -147,8 +166,38 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); +#ifdef NIC_PARAVIRT + /* set unconditionally, then also kick if needed */ + if (csb) { + t = rdtsc(); + if (csb->host_need_txkick == 2) { + /* can compute an update of delta */ + int64_t delta = t - csbd[3]; + if (delta < 0) + delta = -delta; + if (csbd[8] == 0 || delta < csbd[8]) { + csbd[8] = delta; + csbd[9]++; + } + csbd[10]++; + } + csb->guest_tdt = nic_i; + csbd[18] += t - csbd[0]; // total wp + csbd[19] += n; + } + if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1)) + do_kick = 1; + if (do_kick) +#endif /* NIC_PARAVIRT */ /* (re)start the tx unit up to slot nic_i (excluded) */ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); +#ifdef NIC_PARAVIRT + if (do_kick) { + uint64_t t1 = rdtsc(); + csbd[20] += t1 - t; // total Np + csbd[21]++; + } +#endif /* NIC_PARAVIRT */ } /* @@ -157,6 +206,93 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { kring->last_reclaim = ticks; /* record completed transmissions using TDH */ +#ifdef NIC_PARAVIRT + /* host updates tdh 
unconditionally, and we have + * no side effects on reads, so we can read from there + * instead of exiting. + */ + if (csb) { + static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0; + u_int x = adapter->next_tx_to_clean; + csbd[19]++; // XXX count reclaims + nic_i = csb->host_tdh; + if (csb->guest_csb_on) { + if (nic_i == x) { + bad++; + csbd[24]++; // failed reclaims + /* no progress, request kick and retry */ + csb->guest_need_txkick = 1; + mb(); // XXX barrier + nic_i = csb->host_tdh; + } else { + good++; + } + if (nic_i != x) { + csb->guest_need_txkick = 2; + if (nic_i == csb->guest_tdt) + drain++; + else + nodrain++; +#if 1 + if (netmap_adaptive_io) { + /* new mechanism: last half ring (or so) + * released one slot at a time. + * This effectively makes the system spin. + * + * Take next_to_clean + 1 as a reference. + * tdh must be ahead or equal + * On entry, the logical order is + * x < tdh = nic_i + * We first push tdh up to avoid wraps. + * The limit is tdh-ll (half ring). + * if tdh-256 < x we report x; + * else we report tdh-256 + */ + u_int tdh = nic_i; + u_int ll = csbd[15]; + u_int delta = lim/8; + if (netmap_adaptive_io == 2 || ll > delta) + csbd[15] = ll = delta; + else if (netmap_adaptive_io == 1 && ll > 1) { + csbd[15]--; + } + + if (nic_i >= kring->nkr_num_slots) { + RD(5, "bad nic_i %d on input", nic_i); + } + x = nm_next(x, lim); + if (tdh < x) + tdh += lim + 1; + if (tdh <= x + ll) { + nic_i = x; + csbd[25]++; //report n + 1; + } else { + tdh = nic_i; + if (tdh < ll) + tdh += lim + 1; + nic_i = tdh - ll; + csbd[26]++; // report tdh - ll + } + } +#endif + } else { + /* we stop, count whether we are idle or not */ + int bh_active = csb->host_need_txkick & 2 ? 4 : 0; + csbd[27+ csb->host_need_txkick]++; + if (netmap_adaptive_io == 1) { + if (bh_active && csbd[15] > 1) + csbd[15]--; + else if (!bh_active && csbd[15] < lim/2) + csbd[15]++; + } + bad--; + fail++; + } + } + RD(1, "drain %d nodrain %d good %d retry %d fail %d", + drain, nodrain, good, bad, fail); + } else +#endif /* !NIC_PARAVIRT */ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); @@ -176,10 +312,10 @@ lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
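The netmap_adaptive_io logic above deliberately throttles reclaim: with a lag of ll slots, completed transmissions are reported only up to host_tdh - ll, so the last stretch of the ring is released one slot at a time and the transmitter effectively spins instead of blocking. A sketch of just the clamp, under the same unwrap convention as the patch (simplified, so corner cases may differ):

	u_int x = nm_next(adapter->next_tx_to_clean, lim);  /* reference */
	u_int tdh = csb->host_tdh;                          /* host's head */

	if (tdh < x)
		tdh += lim + 1;            /* unwrap so that tdh >= x */
	if (tdh <= x + ll) {
		nic_i = x;                 /* little progress: one slot */
	} else {
		nic_i = tdh - ll;          /* stay ll slots behind the host */
		if (nic_i > lim)
			nic_i -= lim + 1;  /* rewrap into [0, lim] */
	}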
*/ static int -lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +lem_netmap_rxsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -190,10 +326,21 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; +#ifdef NIC_PARAVIRT + struct paravirt_csb *csb = adapter->csb; + uint32_t csb_mode = csb && csb->guest_csb_on; + uint32_t do_host_rxkick = 0; +#endif /* NIC_PARAVIRT */ if (head > lim) return netmap_ring_reinit(kring); +#ifdef NIC_PARAVIRT + if (csb_mode) { + force_update = 1; + csb->guest_need_rxkick = 0; + } +#endif /* NIC_PARAVIRT */ /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); @@ -212,11 +359,28 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) uint32_t staterr = le32toh(curr->status); int len; +#ifdef NIC_PARAVIRT + if (csb_mode) { + if ((staterr & E1000_RXD_STAT_DD) == 0) { + /* don't bother to retry if more than 1 pkt */ + if (n > 1) + break; + csb->guest_need_rxkick = 1; + wmb(); + staterr = le32toh(curr->status); + if ((staterr & E1000_RXD_STAT_DD) == 0) { + break; + } else { /* we are good */ + csb->guest_need_rxkick = 0; + } + } + } else +#endif /* NIC_PARAVIRT */ if ((staterr & E1000_RXD_STAT_DD) == 0) break; len = le16toh(curr->length) - 4; // CRC if (len < 0) { - D("bogus pkt size %d nic idx %d", len, nic_i); + RD(5, "bogus pkt (%d) size %d nic idx %d", n, len, nic_i); len = 0; } ring->slot[nm_i].len = len; @@ -228,6 +392,18 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ +#ifdef NIC_PARAVIRT + if (csb_mode) { + if (n > 1) { + /* leave one spare buffer so we avoid rxkicks */ + nm_i = nm_prev(nm_i, lim); + nic_i = nm_prev(nic_i, lim); + n--; + } else { + csb->guest_need_rxkick = 1; + } + } +#endif /* NIC_PARAVIRT */ ND("%d new packets at nic %d nm %d tail %d", n, adapter->next_rx_desc_to_check, @@ -249,23 +425,27 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i]; - if (addr == netmap_buffer_base) /* bad buf */ + if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ curr->buffer_addr = htole64(paddr); - netmap_reload_map(adapter->rxtag, rxbuf->map, addr); + netmap_reload_map(na, adapter->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); +#ifdef NIC_PARAVIRT + if (csb_mode && csb->host_rxkick_at == nic_i) + do_host_rxkick = 1; +#endif /* NIC_PARAVIRT */ nm_i = nm_next(nm_i, lim); nic_i = nm_next(nic_i, lim); } @@ -277,6 +457,12 @@ lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) * so move nic_i back by one unit */ nic_i = nm_prev(nic_i, lim); +#ifdef NIC_PARAVIRT + /* set unconditionally, then also kick if needed */ + if 
(csb) + csb->guest_rdt = nic_i; + if (!csb_mode || do_host_rxkick) +#endif /* NIC_PARAVIRT */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index 10abe4f..98f6143 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -65,10 +65,10 @@ re_netmap_reg(struct netmap_adapter *na, int onoff) * Reconcile kernel and user view of the transmit ring. */ static int -re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +re_netmap_txsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -96,14 +96,14 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); /* device-specific */ struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[nic_i]; int cmd = slot->len | RL_TDESC_CMD_EOF | RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; - NM_CHECK_ADDR_LEN(addr, len); + NM_CHECK_ADDR_LEN(na, addr, len); if (nic_i == lim) /* mark end of ring */ cmd |= RL_TDESC_CMD_EOR; @@ -112,7 +112,7 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - netmap_reload_map(sc->rl_ldata.rl_tx_mtag, + netmap_reload_map(na, sc->rl_ldata.rl_tx_mtag, txd[nic_i].tx_dmamap, addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); @@ -169,10 +169,10 @@ re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
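The DD re-check in lem_netmap_rxsync() above is the standard lost-event guard for edge-triggered notifications: arm the kick first, make the write visible, then re-read the descriptor so a packet that raced with the arming is not missed. The core of the pattern, extracted from the hunk:

	if ((staterr & E1000_RXD_STAT_DD) == 0) {
		csb->guest_need_rxkick = 1;        /* arm host notification */
		wmb();                             /* publish before re-check */
		staterr = le32toh(curr->status);   /* double check */
		if (staterr & E1000_RXD_STAT_DD)
			csb->guest_need_rxkick = 0; /* raced: data arrived */
	}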
*/ static int -re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +re_netmap_rxsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -240,12 +240,12 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[nic_i]; - int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + int cmd = NETMAP_BUF_SIZE(na) | RL_RDESC_CMD_OWN; - if (addr == netmap_buffer_base) /* bad buf */ + if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; if (nic_i == lim) /* mark end of ring */ @@ -255,7 +255,7 @@ re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + netmap_reload_map(na, sc->rl_ldata.rl_rx_mtag, rxd[nic_i].rx_dmamap, addr); slot->flags &= ~NS_BUF_CHANGED; } @@ -296,14 +296,10 @@ re_netmap_tx_init(struct rl_softc *sc) struct netmap_adapter *na = NA(sc->rl_ifp); struct netmap_slot *slot; - if (!na || !(na->na_flags & NAF_NATIVE_ON)) { - return; - } - slot = netmap_reset(na, NR_TX, 0, 0); - /* slot is NULL if we are not in netmap mode */ + /* slot is NULL if we are not in native netmap mode */ if (!slot) - return; // XXX cannot happen + return; /* in netmap mode, overwrite addresses and maps */ txd = sc->rl_ldata.rl_tx_desc; desc = sc->rl_ldata.rl_tx_list; @@ -313,11 +309,11 @@ re_netmap_tx_init(struct rl_softc *sc) for (i = 0; i < n; i++) { uint64_t paddr; int l = netmap_idx_n2k(&na->tx_rings[0], i); - void *addr = PNMB(slot + l, &paddr); + void *addr = PNMB(na, slot + l, &paddr); desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - netmap_load_map(sc->rl_ldata.rl_tx_mtag, + netmap_load_map(na, sc->rl_ldata.rl_tx_mtag, txd[i].tx_dmamap, addr); } } @@ -344,15 +340,15 @@ re_netmap_rx_init(struct rl_softc *sc) uint64_t paddr; uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i); - addr = PNMB(slot + nm_i, &paddr); + addr = PNMB(na, slot + nm_i, &paddr); - netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + netmap_reload_map(na, sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD); desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - cmdstat = NETMAP_BUF_SIZE; + cmdstat = NETMAP_BUF_SIZE(na); if (nic_i == n - 1) /* mark the end of ring */ cmdstat |= RL_RDESC_CMD_EOR; if (nic_i < max_avail) diff --git a/sys/dev/netmap/if_vtnet_netmap.h b/sys/dev/netmap/if_vtnet_netmap.h new file mode 100644 index 0000000..63f4fa9 --- /dev/null +++ b/sys/dev/netmap/if_vtnet_netmap.h @@ -0,0 +1,434 @@ +/* + * Copyright (C) 2014 Vincenzo Maffione, Luigi Rizzo. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#include <net/netmap.h> +#include <sys/selinfo.h> +#include <vm/vm.h> +#include <vm/pmap.h> /* vtophys ? */ +#include <dev/netmap/netmap_kern.h> + + +#define SOFTC_T vtnet_softc + +/* Free all the unused buffer in all the RX virtqueues. + * This function is called when entering and exiting netmap mode. + * - buffers queued by the virtio driver return skbuf/mbuf pointer + * and need to be freed; + * - buffers queued by netmap return the txq/rxq, and do not need work + */ +static void +vtnet_netmap_free_bufs(struct SOFTC_T* sc) +{ + int i, nmb = 0, n = 0, last; + + for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { + struct vtnet_rxq *rxq = &sc->vtnet_rxqs[i]; + struct virtqueue *vq; + struct mbuf *m; + struct vtnet_txq *txq = &sc->vtnet_txqs[i]; + struct vtnet_tx_header *txhdr; + + last = 0; + vq = rxq->vtnrx_vq; + while ((m = virtqueue_drain(vq, &last)) != NULL) { + n++; + if (m != (void *)rxq) + m_freem(m); + else + nmb++; + } + + last = 0; + vq = txq->vtntx_vq; + while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { + n++; + if (txhdr != (void *)txq) { + m_freem(txhdr->vth_mbuf); + uma_zfree(vtnet_tx_header_zone, txhdr); + } else + nmb++; + } + } + D("freed %d mbufs, %d netmap bufs on %d queues", + n - nmb, nmb, i); +} + +/* Register and unregister. */ +static int +vtnet_netmap_reg(struct netmap_adapter *na, int onoff) +{ + struct ifnet *ifp = na->ifp; + struct SOFTC_T *sc = ifp->if_softc; + + VTNET_CORE_LOCK(sc); + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); + } else { + nm_clear_native_flags(na); + } + /* drain queues so netmap and native drivers + * do not interfere with each other + */ + vtnet_netmap_free_bufs(sc); + vtnet_init_locked(sc); /* also enable intr */ + VTNET_CORE_UNLOCK(sc); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); +} + + +/* Reconcile kernel and user view of the transmit ring. 
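Both vtnet_netmap_free_bufs() above and the tx/rx sync routines below rely on one ownership convention: buffers that netmap posts to a virtqueue carry the queue pointer itself as the cookie, so dequeue/drain can tell netmap slots (nothing to free) from the native driver's headers. In essence (a fragment, with declarations as in the surrounding code):

	struct vtnet_tx_header *txhdr;
	int last = 0, err;

	/* enqueue: the queue pointer doubles as an ownership token */
	err = virtqueue_enqueue(vq, txq, sg, sg->sg_nseg, 0);

	/* drain: anything that is not our token is a native-driver header */
	while ((txhdr = virtqueue_drain(vq, &last)) != NULL) {
		if (txhdr != (void *)txq) {
			m_freem(txhdr->vth_mbuf);
			uma_zfree(vtnet_tx_header_zone, txhdr);
		} /* else: a netmap buffer, nothing to free */
	}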
*/ +static int +vtnet_netmap_txsync(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + struct ifnet *ifp = na->ifp; + struct netmap_ring *ring = kring->ring; + u_int ring_nr = kring->ring_id; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + + /* device-specific */ + struct SOFTC_T *sc = ifp->if_softc; + struct vtnet_txq *txq = &sc->vtnet_txqs[ring_nr]; + struct virtqueue *vq = txq->vtntx_vq; + + /* + * First part: process new packets to send. + */ + rmb(); + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + struct sglist *sg = txq->vtntx_sg; + + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + /* we use an empty header here */ + static struct virtio_net_hdr_mrg_rxbuf hdr; + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + uint64_t paddr; + void *addr = PNMB(na, slot, &paddr); + int err; + + NM_CHECK_ADDR_LEN(na, addr, len); + + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + /* Initialize the scatterlist, expose it to the hypervisor, + * and kick the hypervisor (if necessary). + */ + sglist_reset(sg); // cheap + // if vtnet_hdr_size > 0 ... + err = sglist_append(sg, &hdr, sc->vtnet_hdr_size); + // XXX later, support multi segment + err = sglist_append_phys(sg, paddr, len); + /* use na as the cookie */ + err = virtqueue_enqueue(vq, txq, sg, sg->sg_nseg, 0); + if (unlikely(err < 0)) { + D("virtqueue_enqueue failed"); + break; + } + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); + } + /* Update hwcur depending on where we stopped. */ + kring->nr_hwcur = nm_i; /* note we migth break early */ + + /* No more free TX slots? Ask the hypervisor for notifications, + * possibly only when a considerable amount of work has been + * done. + */ + ND(3,"sent %d packets, hwcur %d", n, nm_i); + virtqueue_disable_intr(vq); + virtqueue_notify(vq); + } else { + if (ring->head != ring->tail) + ND(5, "pure notify ? head %d tail %d nused %d %d", + ring->head, ring->tail, virtqueue_nused(vq), + (virtqueue_dump(vq), 1)); + virtqueue_notify(vq); + virtqueue_enable_intr(vq); // like postpone with 0 + } + + + /* Free used slots. We only consider our own used buffers, recognized + * by the token we passed to virtqueue_add_outbuf. 
+ */ + n = 0; + for (;;) { + struct vtnet_tx_header *txhdr = virtqueue_dequeue(vq, NULL); + if (txhdr == NULL) + break; + if (likely(txhdr == (void *)txq)) { + n++; + if (virtqueue_nused(vq) < 32) { // XXX slow release + break; + } + } else { /* leftover from previous transmission */ + m_freem(txhdr->vth_mbuf); + uma_zfree(vtnet_tx_header_zone, txhdr); + } + } + if (n) { + kring->nr_hwtail += n; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + } + if (nm_i != kring->nr_hwtail /* && vtnet_txq_below_threshold(txq) == 0*/) { + ND(3, "disable intr, hwcur %d", nm_i); + virtqueue_disable_intr(vq); + } else { + ND(3, "enable intr, hwcur %d", nm_i); + virtqueue_postpone_intr(vq, VQ_POSTPONE_SHORT); + } + +//out: + nm_txsync_finalize(kring); + + return 0; +} + +static int +vtnet_refill_rxq(struct netmap_kring *kring, u_int nm_i, u_int head) +{ + struct netmap_adapter *na = kring->na; + struct ifnet *ifp = na->ifp; + struct netmap_ring *ring = kring->ring; + u_int ring_nr = kring->ring_id; + u_int const lim = kring->nkr_num_slots - 1; + u_int n; + + /* device-specific */ + struct SOFTC_T *sc = ifp->if_softc; + struct vtnet_rxq *rxq = &sc->vtnet_rxqs[ring_nr]; + struct virtqueue *vq = rxq->vtnrx_vq; + + /* use a local sglist, default might be short */ + struct sglist_seg ss[2]; + struct sglist sg = { ss, 0, 0, 2 }; + + for (n = 0; nm_i != head; n++) { + static struct virtio_net_hdr_mrg_rxbuf hdr; + struct netmap_slot *slot = &ring->slot[nm_i]; + uint64_t paddr; + void *addr = PNMB(na, slot, &paddr); + int err = 0; + + if (addr == NETMAP_BUF_BASE(na)) { /* bad buf */ + if (netmap_ring_reinit(kring)) + return -1; + } + + slot->flags &= ~NS_BUF_CHANGED; + sglist_reset(&sg); // cheap + err = sglist_append(&sg, &hdr, sc->vtnet_hdr_size); + err = sglist_append_phys(&sg, paddr, NETMAP_BUF_SIZE(na)); + /* writable for the host */ + err = virtqueue_enqueue(vq, rxq, &sg, 0, sg.sg_nseg); + if (err < 0) { + D("virtqueue_enqueue failed"); + break; + } + nm_i = nm_next(nm_i, lim); + } + return nm_i; +} + +/* Reconcile kernel and user view of the receive ring. */ +static int +vtnet_netmap_rxsync(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + struct ifnet *ifp = na->ifp; + struct netmap_ring *ring = kring->ring; + u_int ring_nr = kring->ring_id; + u_int nm_i; /* index into the netmap ring */ + // u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + /* device-specific */ + struct SOFTC_T *sc = ifp->if_softc; + struct vtnet_rxq *rxq = &sc->vtnet_rxqs[ring_nr]; + struct virtqueue *vq = rxq->vtnrx_vq; + + /* XXX netif_carrier_ok ? */ + + if (head > lim) + return netmap_ring_reinit(kring); + + rmb(); + /* + * First part: import newly received packets. + * Only accept our + * own buffers (matching the token). We should only get + * matching buffers, because of vtnet_netmap_free_rx_unused_bufs() + * and vtnet_netmap_init_buffers(). 
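All index arithmetic in these sync routines goes through the two small ring-walk helpers from netmap_kern.h, which are essentially the following (lim is always nkr_num_slots - 1):

	static inline uint32_t
	nm_next(uint32_t i, uint32_t lim)
	{
		return (unlikely(i == lim) ? 0 : i + 1);
	}

	static inline uint32_t
	nm_prev(uint32_t i, uint32_t lim)
	{
		return (unlikely(i == 0) ? lim : i - 1);
	}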
+ */ + if (netmap_no_pendintr || force_update) { + uint16_t slot_flags = kring->nkr_slot_flags; + struct netmap_adapter *token; + + nm_i = kring->nr_hwtail; + n = 0; + for (;;) { + int len; + token = virtqueue_dequeue(vq, &len); + if (token == NULL) + break; + if (likely(token == (void *)rxq)) { + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; + nm_i = nm_next(nm_i, lim); + n++; + } else { + D("This should not happen"); + } + } + kring->nr_hwtail = nm_i; + kring->nr_kflags &= ~NKR_PENDINTR; + } + ND("[B] h %d c %d hwcur %d hwtail %d", + ring->head, ring->cur, kring->nr_hwcur, + kring->nr_hwtail); + + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; /* netmap ring index */ + if (nm_i != head) { + int err = vtnet_refill_rxq(kring, nm_i, head); + if (err < 0) + return 1; + kring->nr_hwcur = err; + virtqueue_notify(vq); + /* After draining the queue may need an intr from the hypervisor */ + vtnet_rxq_enable_intr(rxq); + } + + /* tell userspace that there might be new packets. */ + nm_rxsync_finalize(kring); + + ND("[C] h %d c %d t %d hwcur %d hwtail %d", + ring->head, ring->cur, ring->tail, + kring->nr_hwcur, kring->nr_hwtail); + + return 0; +} + + +/* Make RX virtqueues buffers pointing to netmap buffers. */ +static int +vtnet_netmap_init_rx_buffers(struct SOFTC_T *sc) +{ + struct ifnet *ifp = sc->vtnet_ifp; + struct netmap_adapter* na = NA(ifp); + unsigned int r; + + if (!nm_native_on(na)) + return 0; + for (r = 0; r < na->num_rx_rings; r++) { + struct netmap_kring *kring = &na->rx_rings[r]; + struct vtnet_rxq *rxq = &sc->vtnet_rxqs[r]; + struct virtqueue *vq = rxq->vtnrx_vq; + struct netmap_slot* slot; + int err = 0; + + slot = netmap_reset(na, NR_RX, r, 0); + if (!slot) { + D("strange, null netmap ring %d", r); + return 0; + } + /* Add up to na>-num_rx_desc-1 buffers to this RX virtqueue. + * It's important to leave one virtqueue slot free, otherwise + * we can run into ring->cur/ring->tail wraparounds. + */ + err = vtnet_refill_rxq(kring, 0, na->num_rx_desc-1); + if (err < 0) + return 0; + virtqueue_notify(vq); + } + + return 1; +} + +/* Update the virtio-net device configurations. Number of queues can + * change dinamically, by 'ethtool --set-channels $IFNAME combined $N'. + * This is actually the only way virtio-net can currently enable + * the multiqueue mode. 
+ * XXX note that we seem to lose packets if the netmap ring has more + * slots than the queue + */ +static int +vtnet_netmap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, + u_int *rxr, u_int *rxd) +{ + struct ifnet *ifp = na->ifp; + struct SOFTC_T *sc = ifp->if_softc; + + *txr = *rxr = sc->vtnet_max_vq_pairs; + *rxd = 512; // sc->vtnet_rx_nmbufs; + *txd = *rxd; // XXX + D("vtnet config txq=%d, txd=%d rxq=%d, rxd=%d", + *txr, *txd, *rxr, *rxd); + + return 0; +} + +static void +vtnet_netmap_attach(struct SOFTC_T *sc) +{ + struct netmap_adapter na; + + bzero(&na, sizeof(na)); + + na.ifp = sc->vtnet_ifp; + na.num_tx_desc = 1024;// sc->vtnet_rx_nmbufs; + na.num_rx_desc = 1024; // sc->vtnet_rx_nmbufs; + na.nm_register = vtnet_netmap_reg; + na.nm_txsync = vtnet_netmap_txsync; + na.nm_rxsync = vtnet_netmap_rxsync; + na.nm_config = vtnet_netmap_config; + na.num_tx_rings = na.num_rx_rings = sc->vtnet_max_vq_pairs; + D("max rings %d", sc->vtnet_max_vq_pairs); + netmap_attach(&na); + + D("virtio attached txq=%d, txd=%d rxq=%d, rxd=%d", + na.num_tx_rings, na.num_tx_desc, + na.num_tx_rings, na.num_rx_desc); +} +/* end of file */ diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index a617cc4..3dc628a 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -153,10 +153,10 @@ ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) * methods should be handled by the individual drivers. */ static int -ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +ixgbe_netmap_txsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -171,7 +171,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + struct tx_ring *txr = &adapter->tx_rings[kring->ring_id]; int reclaim_tx; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, @@ -223,7 +223,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); /* device-specific */ union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i]; @@ -236,11 +236,11 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) __builtin_prefetch(&ring->slot[nm_i + 1]); __builtin_prefetch(&txr->tx_buffers[nic_i + 1]); - NM_CHECK_ADDR_LEN(addr, len); + NM_CHECK_ADDR_LEN(na, addr, len); if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(txr->txtag, txbuf->map, addr); + netmap_reload_map(na, txr->txtag, txbuf->map, addr); } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); @@ -309,7 +309,7 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * REPORT_STATUS in a few slots so TDH is the only * good way. */ - nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(kring->ring_id)); if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ D("TDH wrap %d", nic_i); nic_i -= kring->nkr_num_slots; @@ -341,10 +341,10 @@ ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) * of whether or not we received an interrupt. 
*/ static int -ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +ixgbe_netmap_rxsync(struct netmap_kring *kring, int flags) { + struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; - struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; u_int nm_i; /* index into the netmap ring */ u_int nic_i; /* index into the NIC ring */ @@ -355,7 +355,7 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* device-specific */ struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id]; if (head > lim) return netmap_ring_reinit(kring); @@ -425,17 +425,17 @@ ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) for (n = 0; nm_i != head; n++) { struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; - void *addr = PNMB(slot, &paddr); + void *addr = PNMB(na, slot, &paddr); union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; - if (addr == netmap_buffer_base) /* bad buf */ + if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); + netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->wb.upper.status_error = 0; diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index e8b6c5a..0fd362f 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -124,6 +124,223 @@ ports attached to the switch) */ + +/* --- internals ---- + * + * Roadmap to the code that implements the above. + * + * > 1. a process/thread issues one or more open() on /dev/netmap, to create + * > select()able file descriptor on which events are reported. + * + * Internally, we allocate a netmap_priv_d structure, that will be + * initialized on ioctl(NIOCREGIF). + * + * os-specific: + * FreeBSD: netmap_open (netmap_freebsd.c). The priv is + * per-thread. + * linux: linux_netmap_open (netmap_linux.c). The priv is + * per-open. + * + * > 2. on each descriptor, the process issues an ioctl() to identify + * > the interface that should report events to the file descriptor. + * + * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0. + * Most important things happen in netmap_get_na() and + * netmap_do_regif(), called from there. Additional details can be + * found in the comments above those functions. + * + * In all cases, this action creates/takes-a-reference-to a + * netmap_*_adapter describing the port, and allocates a netmap_if + * and all necessary netmap rings, filling them with netmap buffers. + * + * In this phase, the sync callbacks for each ring are set (these are used + * in steps 5 and 6 below). The callbacks depend on the type of adapter. + * The adapter creation/initialization code puts them in the + * netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they + * are copied from there to the netmap_kring's during netmap_do_regif(), by + * the nm_krings_create() callback. All the nm_krings_create callbacks + * actually call netmap_krings_create() to perform this and the other + * common stuff. netmap_krings_create() also takes care of the host rings, + * if needed, by setting their sync callbacks appropriately. 
+ *
+ * Additional actions depend on the kind of netmap_adapter that has been
+ * registered:
+ *
+ * - netmap_hw_adapter:	[netmap.c]
+ * This is a system netdev/ifp with native netmap support.
+ * The ifp is detached from the host stack by redirecting:
+ * - transmissions (from the network stack) to netmap_transmit()
+ * - receive notifications to the nm_notify() callback for
+ * this adapter. The callback is normally netmap_notify(), unless
+ * the ifp is attached to a bridge using bwrap, in which case it
+ * is netmap_bwrap_intr_notify().
+ *
+ * - netmap_generic_adapter:	[netmap_generic.c]
+ * A system netdev/ifp without native netmap support.
+ *
+ * (the decision about native/non-native support is taken in
+ * netmap_get_hw_na(), called by netmap_get_na())
+ *
+ * - netmap_vp_adapter	[netmap_vale.c]
+ * Returned by netmap_get_bdg_na().
+ * This is a persistent or ephemeral VALE port. Ephemeral ports
+ * are created on the fly if they don't already exist, and are
+ * always attached to a bridge.
+ * Persistent VALE ports must be created separately, and
+ * then attached like normal NICs. The NIOCREGIF we are examining
+ * will find them only if they had previously been created and
+ * attached (see VALE_CTL below).
+ *
+ * - netmap_pipe_adapter	[netmap_pipe.c]
+ * Returned by netmap_get_pipe_na().
+ * Both pipe ends are created, if they didn't already exist.
+ *
+ * - netmap_monitor_adapter	[netmap_monitor.c]
+ * Returned by netmap_get_monitor_na().
+ * If successful, the nm_sync callbacks of the monitored adapter
+ * will be intercepted by the returned monitor.
+ *
+ * - netmap_bwrap_adapter	[netmap_vale.c]
+ * Cannot be obtained in this way, see VALE_CTL below
+ *
+ *
+ * os-specific:
+ * linux: we first go through linux_netmap_ioctl() to
+ * adapt the FreeBSD interface to the linux one.
+ *
+ *
+ * > 3. on each descriptor, the process issues an mmap() request to
+ * > map the shared memory region within the process' address space.
+ * > The list of interesting queues is indicated by a location in
+ * > the shared memory region.
+ *
+ * os-specific:
+ * FreeBSD: netmap_mmap_single (netmap_freebsd.c).
+ * linux: linux_netmap_mmap (netmap_linux.c).
+ *
+ * > 4. using the functions in the netmap(4) userspace API, a process
+ * > can look up the occupation state of a queue, access memory buffers,
+ * > and retrieve received packets or enqueue packets to transmit.
+ *
+ * these actions do not involve the kernel.
+ *
+ * > 5. using some ioctl()s the process can synchronize the userspace view
+ * > of the queue with the actual status in the kernel. This includes both
+ * > receiving the notification of new packets, and transmitting new
+ * > packets on the output interface.
+ *
+ * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
+ * cases. They invoke the nm_sync callbacks on the netmap_kring
+ * structures, as initialized in step 2 and maybe later modified
+ * by a monitor. Monitors, however, will always call the original
+ * callback before doing anything else.
+ *
+ *
+ * > 6. select() or poll() can be used to wait for events on individual
+ * > transmit or receive queues (or all queues for a given interface).
+ *
+ * Implemented in netmap_poll(). This will call the same nm_sync()
+ * callbacks as in step 5 above.
+ *
+ * os-specific:
+ * linux: we first go through linux_netmap_poll() to adapt
+ * the FreeBSD interface to the linux one.
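+ *
+ * Continuing the sketch above, steps 3-6 from userspace (using the
+ * helpers from net/netmap_user.h, error handling omitted):
+ *
+ *	void *mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
+ *			MAP_SHARED, fd, 0);			// step 3
+ *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
+ *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
+ *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+ *
+ *	for (;;) {
+ *		poll(&pfd, 1, -1);				// step 6
+ *		while (!nm_ring_empty(ring)) {			// step 4
+ *			struct netmap_slot *slot = &ring->slot[ring->cur];
+ *			char *buf = NETMAP_BUF(ring, slot->buf_idx);
+ *			// ... process slot->len bytes at buf ...
+ *			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
+ *		}
+ *	}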
+ * + * + * ---- VALE_CTL ----- + * + * VALE switches are controlled by issuing a NIOCREGIF with a non-null + * nr_cmd in the nmreq structure. These subcommands are handled by + * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created + * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF + * subcommands, respectively. + * + * Any network interface known to the system (including a persistent VALE + * port) can be attached to a VALE switch by issuing the + * NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports + * look exactly like ephemeral VALE ports (as created in step 2 above). The + * attachment of other interfaces, instead, requires the creation of a + * netmap_bwrap_adapter. Moreover, the attached interface must be put in + * netmap mode. This may require the creation of a netmap_generic_adapter if + * we have no native support for the interface, or if generic adapters have + * been forced by sysctl. + * + * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(), + * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach() + * callback. In the case of the bwrap, the callback creates the + * netmap_bwrap_adapter. The initialization of the bwrap is then + * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl() + * callback (netmap_bwrap_bdg_ctl in netmap_vale.c). + * A generic adapter for the wrapped ifp will be created if needed, when + * netmap_get_bdg_na() calls netmap_get_hw_na(). + * + * + * ---- DATAPATHS ----- + * + * -= SYSTEM DEVICE WITH NATIVE SUPPORT =- + * + * na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach() + * + * - tx from netmap userspace: + * concurrently: + * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context + * kring->nm_sync() == DEVICE_netmap_txsync() + * 2) device interrupt handler + * na->nm_notify() == netmap_notify() + * - rx from netmap userspace: + * concurrently: + * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context + * kring->nm_sync() == DEVICE_netmap_rxsync() + * 2) device interrupt handler + * na->nm_notify() == netmap_notify() + * - tx from host stack + * concurrently: + * 1) host stack + * netmap_transmit() + * na->nm_notify == netmap_notify() + * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context + * kring->nm_sync() == netmap_rxsync_from_host_compat + * netmap_rxsync_from_host(na, NULL, NULL) + * - tx to host stack + * ioctl(NIOCTXSYNC)/netmap_poll() in process context + * kring->nm_sync() == netmap_txsync_to_host_compat + * netmap_txsync_to_host(na) + * NM_SEND_UP() + * FreeBSD: na->if_input() == ?? XXX + * linux: netif_rx() with NM_MAGIC_PRIORITY_RX + * + * + * + * -= SYSTEM DEVICE WITH GENERIC SUPPORT =- + * + * + * + * -= VALE PORT =- + * + * + * + * -= NETMAP PIPE =- + * + * + * + * -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =- + * + * + * + * -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =- + * + * + * + * -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =- + * + * + * + * -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =- + * + * + * + */ + /* * OS-specific code that is used only within this file. 
 * Other OS-specific code that must be accessed by drivers
@@ -218,6 +435,10 @@ int netmap_txsync_retry = 2;
 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
 
+int netmap_adaptive_io = 0;
+SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
+    &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
+
 int netmap_flags = 0;	/* debug flags */
 int netmap_fwd = 0;	/* force transparent mode */
 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
@@ -259,7 +480,7 @@ nm_kr_get(struct netmap_kring *kr)
  * mark the ring as stopped, and run through the locks
  * to make sure other users get to see it.
  */
-void
+static void
 netmap_disable_ring(struct netmap_kring *kr)
 {
 	kr->nkr_stopped = 1;
@@ -269,41 +490,59 @@ netmap_disable_ring(struct netmap_kring *kr)
 	nm_kr_put(kr);
 }
 
+/* stop or enable a single tx ring */
+void
+netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
+{
+	if (stopped)
+		netmap_disable_ring(na->tx_rings + ring_id);
+	else
+		na->tx_rings[ring_id].nkr_stopped = 0;
+	/* notify that the stopped state has changed. This is currently
+	 * only used by bwrap to propagate the state to its own krings.
+	 * (see netmap_bwrap_intr_notify).
+	 */
+	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
+}
+
+/* stop or enable a single rx ring */
+void
+netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
+{
+	if (stopped)
+		netmap_disable_ring(na->rx_rings + ring_id);
+	else
+		na->rx_rings[ring_id].nkr_stopped = 0;
+	/* notify that the stopped state has changed. This is currently
+	 * only used by bwrap to propagate the state to its own krings.
+	 * (see netmap_bwrap_intr_notify).
+	 */
+	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
+}
+
 /* stop or enable all the rings of na */
-static void
-netmap_set_all_rings(struct ifnet *ifp, int stopped)
+void
+netmap_set_all_rings(struct netmap_adapter *na, int stopped)
 {
-	struct netmap_adapter *na;
 	int i;
 	u_int ntx, nrx;
 
-	if (!(ifp->if_capenable & IFCAP_NETMAP))
+	if (!nm_netmap_on(na))
 		return;
 
-	na = NA(ifp);
-
 	ntx = netmap_real_tx_rings(na);
 	nrx = netmap_real_rx_rings(na);
 
 	for (i = 0; i < ntx; i++) {
-		if (stopped)
-			netmap_disable_ring(na->tx_rings + i);
-		else
-			na->tx_rings[i].nkr_stopped = 0;
-		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
+		netmap_set_txring(na, i, stopped);
 	}
 
 	for (i = 0; i < nrx; i++) {
-		if (stopped)
-			netmap_disable_ring(na->rx_rings + i);
-		else
-			na->rx_rings[i].nkr_stopped = 0;
-		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
+		netmap_set_rxring(na, i, stopped);
 	}
 }
 
-
 /*
  * Convenience function used in drivers. Waits for current txsync()s/rxsync()s
  * to finish and prevents any new one from starting. Call this before turning
@@ -314,10 +553,9 @@ netmap_set_all_rings(struct ifnet *ifp, int stopped)
 void
 netmap_disable_all_rings(struct ifnet *ifp)
 {
-	netmap_set_all_rings(ifp, 1 /* stopped */);
+	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
 }
 
-
 /*
  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
 * adapter's rings.  In linux drivers, this should be placed near each
@@ -326,7 +564,7 @@ netmap_disable_all_rings(struct ifnet *ifp)
 void
 netmap_enable_all_rings(struct ifnet *ifp)
 {
-	netmap_set_all_rings(ifp, 0 /* enabled */);
+	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
 }
 
 
@@ -410,7 +648,6 @@ nm_dump_buf(char *p, int len, int lim, char *dst)
 int
 netmap_update_config(struct netmap_adapter *na)
 {
-	struct ifnet *ifp = na->ifp;
 	u_int txr, txd, rxr, rxd;
 
 	txr = txd = rxr = rxd = 0;
@@ -429,11 +666,11 @@ netmap_update_config(struct netmap_adapter *na)
 		return 0; /* nothing changed */
 	if (netmap_verbose || na->active_fds > 0) {
 		D("stored config %s: txring %d x %d, rxring %d x %d",
-			NM_IFPNAME(ifp),
+			na->name,
 			na->num_tx_rings, na->num_tx_desc,
 			na->num_rx_rings, na->num_rx_desc);
 		D("new config %s: txring %d x %d, rxring %d x %d",
-			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
+			na->name, txr, txd, rxr, rxd);
 	}
 	if (na->active_fds == 0) {
 		D("configuration changed (but fine)");
@@ -447,20 +684,6 @@ netmap_update_config(struct netmap_adapter *na)
 	return 1;
 }
 
-static int
-netmap_txsync_compat(struct netmap_kring *kring, int flags)
-{
-	struct netmap_adapter *na = kring->na;
-	return na->nm_txsync(na, kring->ring_id, flags);
-}
-
-static int
-netmap_rxsync_compat(struct netmap_kring *kring, int flags)
-{
-	struct netmap_adapter *na = kring->na;
-	return na->nm_rxsync(na, kring->ring_id, flags);
-}
-
 /* kring->nm_sync callback for the host tx ring */
 static int
 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
@@ -538,7 +761,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
 		kring->ring_id = i;
 		kring->nkr_num_slots = ndesc;
 		if (i < na->num_tx_rings) {
-			kring->nm_sync = netmap_txsync_compat; // XXX
+			kring->nm_sync = na->nm_txsync;
 		} else if (i == na->num_tx_rings) {
 			kring->nm_sync = netmap_txsync_to_host_compat;
 		}
@@ -547,7 +770,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
 		 */
 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
 		kring->rtail = kring->nr_hwtail = ndesc - 1;
-		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
+		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
 		ND("ktx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail);
 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
@@ -562,13 +785,13 @@ netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
 		kring->ring_id = i;
 		kring->nkr_num_slots = ndesc;
 		if (i < na->num_rx_rings) {
-			kring->nm_sync = netmap_rxsync_compat; // XXX
+			kring->nm_sync = na->nm_rxsync;
 		} else if (i == na->num_rx_rings) {
 			kring->nm_sync = netmap_rxsync_from_host_compat;
 		}
 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
 		kring->rtail = kring->nr_hwtail = 0;
-		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
+		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
 		ND("krx %s h %d c %d t %d", kring->name, kring->rhead, kring->rcur, kring->rtail);
 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
@@ -624,7 +847,7 @@ netmap_hw_krings_delete(struct netmap_adapter *na)
  */
 /* call with NMG_LOCK held */
 static struct netmap_if*
-netmap_if_new(const char *ifname, struct netmap_adapter *na)
+netmap_if_new(struct netmap_adapter *na)
 {
 	struct netmap_if *nifp;
 
@@ -641,7 +864,7 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na)
 	 * the netmap rings themselves
 	 */
 	if (na->nm_krings_create(na))
-		goto cleanup;
+		return NULL;
 
 	/* create all
missing netmap rings */ if (netmap_mem_rings_create(na)) @@ -650,7 +873,7 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) final: /* in all cases, create a new netmap if */ - nifp = netmap_mem_if_new(ifname, na); + nifp = netmap_mem_if_new(na); if (nifp == NULL) goto cleanup; @@ -689,7 +912,7 @@ netmap_get_memory_locked(struct netmap_priv_d* p) nmd = p->np_na->nm_mem; } if (p->np_mref == NULL) { - error = netmap_mem_finalize(nmd); + error = netmap_mem_finalize(nmd, p->np_na); if (!error) p->np_mref = nmd; } else if (p->np_mref != nmd) { @@ -728,17 +951,15 @@ static void netmap_drop_memory_locked(struct netmap_priv_d* p) { if (p->np_mref) { - netmap_mem_deref(p->np_mref); + netmap_mem_deref(p->np_mref, p->np_na); p->np_mref = NULL; } } /* - * File descriptor's private data destructor. - * * Call nm_register(ifp,0) to stop netmap mode on the interface and - * revert to normal operation. We expect that np_na->ifp has not gone. + * revert to normal operation. * The second argument is the nifp to work on. In some cases it is * not attached yet to the netmap_priv_d so we need to pass it as * a separate argument. @@ -748,14 +969,13 @@ static void netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) { struct netmap_adapter *na = priv->np_na; - struct ifnet *ifp = na->ifp; NMG_LOCK_ASSERT(); na->active_fds--; if (na->active_fds <= 0) { /* last instance */ if (netmap_verbose) - D("deleting last instance for %s", NM_IFPNAME(ifp)); + D("deleting last instance for %s", na->name); /* * (TO CHECK) This function is only called * when the last reference to this file descriptor goes @@ -770,8 +990,7 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) * happens if the close() occurs while a concurrent * syscall is running. */ - if (ifp) - na->nm_register(na, 0); /* off, clear flags */ + na->nm_register(na, 0); /* off, clear flags */ /* Wake up any sleeping threads. netmap_poll will * then return POLLERR * XXX The wake up now must happen during *_down(), when @@ -922,13 +1141,13 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) if ((slot->flags & NS_FORWARD) == 0 && !force) continue; - if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { + if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) { RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? /* XXX TODO: adapt to the case of a multisegment packet */ - m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); + m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) break; @@ -981,7 +1200,7 @@ netmap_sw_to_nic(struct netmap_adapter *na) dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - rdst->head = rdst->cur = nm_next(dst_cur, dst_lim); + rdst->cur = nm_next(dst_cur, dst_lim); } /* if (sent) XXX txsync ? */ } @@ -1028,6 +1247,11 @@ netmap_txsync_to_host(struct netmap_adapter *na) * They have been put in kring->rx_queue by netmap_transmit(). * We protect access to the kring using kring->rx_queue.lock * + * This routine also does the selrecord if called from the poll handler + * (we know because td != NULL). + * + * NOTE: on linux, selrecord() is defined as a macro and uses pwait + * as an additional hidden argument. 
 * returns the number of packets delivered to tx queues in
 * transparent mode, or a negative value if error
 */
@@ -1059,14 +1283,15 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
 			int len = MBUF_LEN(m);
 			struct netmap_slot *slot = &ring->slot[nm_i];
 
-			m_copydata(m, 0, len, BDG_NMB(na, slot));
+			m_copydata(m, 0, len, NMB(na, slot));
 			ND("nm %d len %d", nm_i, len);
 			if (netmap_verbose)
-				D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
+				D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
 
 			slot->len = len;
 			slot->flags = kring->nkr_slot_flags;
 			nm_i = nm_next(nm_i, lim);
+			m_freem(m);
 		}
 		kring->nr_hwtail = nm_i;
 	}
@@ -1083,6 +1308,10 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai
 
 	nm_rxsync_finalize(kring);
 
+	/* access copies of cur,tail in the kring */
+	if (kring->rcur == kring->rtail && td) /* no bufs available */
+		selrecord(td, &kring->si);
+
 	mbq_unlock(q);
 	return ret;
 }
@@ -1128,21 +1357,23 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
 		i = netmap_admode = NETMAP_ADMODE_BEST;
 
 	if (NETMAP_CAPABLE(ifp)) {
-		/* If an adapter already exists, but is
-		 * attached to a vale port, we report that the
-		 * port is busy.
-		 */
-		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
-			return EBUSY;
-
+		prev_na = NA(ifp);
 		/* If an adapter already exists, return it if
 		 * there are active file descriptors or if
 		 * netmap is not forced to use generic
 		 * adapters.
 		 */
-		if (NA(ifp)->active_fds > 0 ||
-				i != NETMAP_ADMODE_GENERIC) {
-			*na = NA(ifp);
+		if (NETMAP_OWNED_BY_ANY(prev_na)
+			|| i != NETMAP_ADMODE_GENERIC
+			|| prev_na->na_flags & NAF_FORCE_NATIVE
+#ifdef WITH_PIPES
+			/* ugly, but we cannot allow an adapter switch
+			 * if some pipe is referring to this one
+			 */
+			|| prev_na->na_next_pipe > 0
+#endif
+		) {
+			*na = prev_na;
 			return 0;
 		}
 	}
@@ -1212,13 +1443,30 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
 
 	*na = NULL; /* default return value */
 
-	/* first try to see if this is a bridge port. */
 	NMG_LOCK_ASSERT();
 
+	/* we cascade through all possible types of netmap adapter.
+ * All netmap_get_*_na() functions return an error and an na, + * with the following combinations: + * + * error na + * 0 NULL type doesn't match + * !0 NULL type matches, but na creation/lookup failed + * 0 !NULL type matches and na created/found + * !0 !NULL impossible + */ + + /* try to see if this is a monitor port */ + error = netmap_get_monitor_na(nmr, na, create); + if (error || *na != NULL) + return error; + + /* try to see if this is a pipe port */ error = netmap_get_pipe_na(nmr, na, create); if (error || *na != NULL) return error; + /* try to see if this is a bridge port */ error = netmap_get_bdg_na(nmr, na, create); if (error) return error; @@ -1241,11 +1489,6 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) if (error) goto out; - /* Users cannot use the NIC attached to a bridge directly */ - if (NETMAP_OWNED_BY_KERN(ret)) { - error = EBUSY; - goto out; - } *na = ret; netmap_adapter_get(ret); @@ -1444,7 +1687,7 @@ netmap_ring_reinit(struct netmap_kring *kring) int errors = 0; // XXX KASSERT nm_kr_tryget - RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); + RD(10, "called for %s", kring->name); // XXX probably wrong to trust userspace kring->rhead = ring->head; kring->rcur = ring->cur; @@ -1463,7 +1706,7 @@ netmap_ring_reinit(struct netmap_kring *kring) RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; - } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { + } else if (len > NETMAP_BUF_SIZE(kring->na)) { ring->slot[i].len = 0; RD(5, "bad len at slot %d idx %d len %d", i, idx, len); } @@ -1481,13 +1724,15 @@ netmap_ring_reinit(struct netmap_kring *kring) return (errors ? 1 : 0); } - -/* - * Set the ring ID. For devices with a single queue, a request - * for all rings is the same as a single ring. +/* interpret the ringid and flags fields of an nmreq, by translating them + * into a pair of intervals of ring indices: + * + * [priv->np_txqfirst, priv->np_txqlast) and + * [priv->np_rxqfirst, priv->np_rxqlast) + * */ -static int -netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) +int +netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { struct netmap_adapter *na = priv->np_na; u_int j, i = ringid & NETMAP_RING_MASK; @@ -1551,15 +1796,11 @@ netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) D("invalid regif type %d", reg); return EINVAL; } - priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; priv->np_flags = (flags & ~NR_REG_MASK) | reg; - if (nm_tx_si_user(priv)) - na->tx_si_users++; - if (nm_rx_si_user(priv)) - na->rx_si_users++; + if (netmap_verbose) { D("%s: tx [%d,%d) rx [%d,%d) id %d", - NM_IFPNAME(na->ifp), + na->name, priv->np_txqfirst, priv->np_txqlast, priv->np_rxqfirst, @@ -1569,16 +1810,113 @@ netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) return 0; } + +/* + * Set the ring ID. For devices with a single queue, a request + * for all rings is the same as a single ring. + */ +static int +netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) +{ + struct netmap_adapter *na = priv->np_na; + int error; + + error = netmap_interp_ringid(priv, ringid, flags); + if (error) { + return error; + } + + priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; + + /* optimization: count the users registered for more than + * one ring, which are the ones sleeping on the global queue. 
+ * The default netmap_notify() callback will then
+ * avoid signaling the global queue if nobody is using it
+ */
+	if (nm_tx_si_user(priv))
+		na->tx_si_users++;
+	if (nm_rx_si_user(priv))
+		na->rx_si_users++;
+	return 0;
+}
+
 /*
  * possibly move the interface to netmap-mode.
  * If success it returns a pointer to netmap_if, otherwise NULL.
  * This must be called with NMG_LOCK held.
+ *
+ * The following na callbacks are called in the process:
+ *
+ * na->nm_config()			[by netmap_update_config]
+ * (get current number and size of rings)
+ *
+ * We have a generic one for linux (netmap_linux_config).
+ * The bwrap has to override this, since it has to forward
+ * the request to the wrapped adapter (netmap_bwrap_config).
+ *
+ * XXX netmap_if_new calls this again (2014-03-15)
+ *
+ * na->nm_krings_create()		[by netmap_if_new]
+ * (create and init the krings array)
+ *
+ * One of the following:
+ *
+ * * netmap_hw_krings_create		(hw ports)
+ * creates the standard layout for the krings
+ * and adds the mbq (used for the host rings).
+ *
+ * * netmap_vp_krings_create		(VALE ports)
+ * add leases and scratchpads
+ *
+ * * netmap_pipe_krings_create		(pipes)
+ * create the krings and rings of both ends and
+ * cross-link them
+ *
+ * * netmap_monitor_krings_create	(monitors)
+ * avoid allocating the mbq
+ *
+ * * netmap_bwrap_krings_create		(bwraps)
+ * create both the bwrap krings array,
+ * the krings array of the wrapped adapter, and
+ * (if needed) the fake array for the host adapter
+ *
+ * na->nm_register(, 1)
+ * (put the adapter in netmap mode)
+ *
+ * This may be one of the following:
+ * (XXX these should be either all *_register or all *_reg 2014-03-15)
+ *
+ * * netmap_hw_register			(hw ports)
+ * checks that the ifp is still there, then calls
+ * the hardware specific callback;
+ *
+ * * netmap_vp_reg			(VALE ports)
+ * If the port is connected to a bridge,
+ * set the NAF_NETMAP_ON flag under the
+ * bridge write lock.
+ *
+ * * netmap_pipe_reg			(pipes)
+ * inform the other pipe end that it is no
+ * longer responsible for the lifetime of this
+ * pipe end
+ *
+ * * netmap_monitor_reg			(monitors)
+ * intercept the sync callbacks of the monitored
+ * rings
+ *
+ * * netmap_bwrap_register		(bwraps)
+ * cross-link the bwrap and hwna rings,
+ * forward the request to the hwna, override
+ * the hwna notify callback (to get the frames
+ * coming from outside go through the bridge).
+ *
+ * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
+ *
 */
struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
	uint16_t ringid, uint32_t flags, int *err)
{
-	struct ifnet *ifp = na->ifp;
	struct netmap_if *nifp = NULL;
	int error, need_mem = 0;
 
@@ -1597,24 +1935,22 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
		if (error)
			goto out;
	}
-	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
-
+	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
+	nifp = netmap_if_new(na);
	if (nifp == NULL) { /* allocation failed */
		error = ENOMEM;
		goto out;
	}
	na->active_fds++;
-	if (ifp->if_capenable & IFCAP_NETMAP) {
-		/* was already set */
-	} else {
-		/* Otherwise set the card in netmap mode
+	if (!nm_netmap_on(na)) {
+		/* Netmap not active, set the card in netmap mode
		 * and make it use the shared buffers.
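		 *
		 * (For reference, the nm_register callback of a hardware
		 * driver typically looks like the following sketch; foo_*
		 * names are made up, the pattern follows the e1000/ixgbe
		 * drivers:
		 *
		 *	static int
		 *	foo_netmap_reg(struct netmap_adapter *na, int onoff)
		 *	{
		 *		struct foo_softc *sc = na->ifp->if_softc;
		 *
		 *		FOO_LOCK(sc);
		 *		foo_stop(sc);		// quiesce the NIC
		 *		if (onoff)
		 *			nm_set_native_flags(na);
		 *		else
		 *			nm_clear_native_flags(na);
		 *		foo_init_locked(sc);	// reinit, rings now remapped
		 *		FOO_UNLOCK(sc);
		 *		return 0;
		 *	}
		 * )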
*/ /* cache the allocator info in the na */ - na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; + na->na_lut = netmap_mem_get_lut(na->nm_mem); ND("%p->na_lut == %p", na, na->na_lut); - na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; + na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem); + na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem); error = na->nm_register(na, 1); /* mode on */ if (error) { netmap_do_unregif(priv, nifp); @@ -1624,12 +1960,12 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, out: *err = error; if (error) { - priv->np_na = NULL; /* we should drop the allocator, but only * if we were the ones who grabbed it */ if (need_mem) netmap_drop_memory_locked(priv); + priv->np_na = NULL; } if (nifp != NULL) { /* @@ -1662,7 +1998,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct netmap_priv_d *priv = NULL; - struct ifnet *ifp = NULL; struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; int error; @@ -1740,7 +2075,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH - || i == NETMAP_BDG_VNET_HDR) { + || i == NETMAP_BDG_VNET_HDR + || i == NETMAP_BDG_NEWIF + || i == NETMAP_BDG_DELIF) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { @@ -1762,7 +2099,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ if (error) break; - ifp = na->ifp; if (NETMAP_OWNED_BY_KERN(na)) { netmap_adapter_put(na); error = EBUSY; @@ -1824,9 +2160,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - ifp = na->ifp; - if (ifp == NULL) { - RD(1, "the ifp is gone"); + if (!nm_netmap_on(na)) { error = ENXIO; break; } @@ -1870,6 +2204,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; + case NIOCCONFIG: + error = netmap_bdg_config(nmr); + break; #ifdef __FreeBSD__ case FIONBIO: case FIOASYNC: @@ -1886,6 +2223,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, default: /* allow device-specific ioctls */ { struct socket so; + struct ifnet *ifp; bzero(&so, sizeof(so)); NMG_LOCK(); @@ -1935,7 +2273,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) { struct netmap_priv_d *priv = NULL; struct netmap_adapter *na; - struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; struct mbq q; /* packets from hw queues to host stack */ @@ -1974,18 +2311,12 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) rmb(); /* make sure following reads are not from cache */ na = priv->np_na; - ifp = na->ifp; - // check for deleted - if (ifp == NULL) { - RD(1, "the ifp is gone"); - return POLLERR; - } - if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) + if (!nm_netmap_on(na)) return POLLERR; if (netmap_verbose & 0x8000) - D("device %s events 0x%x", NM_IFPNAME(ifp), events); + D("device %s events 0x%x", na->name, events); want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); @@ -2056,7 +2387,6 @@ flush_tx: * be better. In current code, however, we only * stop the rings for brief intervals (2014-03-14) */ - if (netmap_verbose) RD(2, "%p lost race on txring %d, ok", priv, i); @@ -2115,6 +2445,8 @@ do_retry_rx: /* * transparent mode support: collect packets * from the rxring(s). 
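 * A buffer is marked for forwarding from userspace by setting
 *	ring->slot[i].flags |= NS_FORWARD;
 * before advancing head; with the netmap_fwd sysctl set, all
 * buffers returned to the kernel are forwarded.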
+			 * XXX NR_FORWARD should only be read on
+			 * physical or NIC ports
			 */
			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
				ND(10, "forwarding some buffers up %d to %d",
@@ -2141,12 +2473,13 @@ do_retry_rx:
	/* transparent mode XXX only during first pass ? */
	if (na->na_flags & NAF_HOST_RINGS) {
		kring = &na->rx_rings[na->num_rx_rings];
-		if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
-			send_down = netmap_rxsync_from_host(na, td, dev);
-			if (send_down && (netmap_no_timestamp == 0 ||
-			    kring->ring->flags & NR_TIMESTAMP)) {
-				microtime(&kring->ring->ts);
-			}
+		if (check_all_rx
+		    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
+			/* XXX fix to use kring fields */
+			if (nm_ring_empty(kring->ring))
+				send_down = netmap_rxsync_from_host(na, td, dev);
+			if (!nm_ring_empty(kring->ring))
+				revents |= want_rx;
		}
	}
 
@@ -2174,7 +2507,7 @@ do_retry_rx:
	 * rings to a single file descriptor.
	 */
 
-	if (q.head)
+	if (q.head && na->ifp != NULL)
		netmap_send_up(na->ifp, &q);
 
	return (revents);
@@ -2224,19 +2557,27 @@ netmap_attach_common(struct netmap_adapter *na)
 
	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
		D("%s: invalid rings tx %d rx %d",
-			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
+			na->name, na->num_tx_rings, na->num_rx_rings);
		return EINVAL;
	}
-	WNA(ifp) = na;
+	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
+	 * pipes, monitors). For bwrap we actually have a non-null ifp for
+	 * use by the external modules, but that is set after this
+	 * function has been called.
+	 * XXX this is ugly, maybe split this function in two (2014-03-14)
+	 */
+	if (ifp != NULL) {
+		WNA(ifp) = na;
 
	/* the following is only needed for na that use the host port.
	 * XXX do we have something similar for linux ? */
#ifdef __FreeBSD__
-	na->if_input = ifp->if_input; /* for netmap_send_up */
+		na->if_input = ifp->if_input; /* for netmap_send_up */
#endif /* __FreeBSD__ */
 
-	NETMAP_SET_CAPABLE(ifp);
+		NETMAP_SET_CAPABLE(ifp);
+	}
	if (na->nm_krings_create == NULL) {
		/* we assume that we have been called by a driver,
		 * since other port types all provide their own
@@ -2250,7 +2591,13 @@ netmap_attach_common(struct netmap_adapter *na)
	na->active_fds = 0;
 
	if (na->nm_mem == NULL)
+		/* use the global allocator */
		na->nm_mem = &nm_mem;
+	if (na->nm_bdg_attach == NULL)
+		/* no special nm_bdg_attach callback. On VALE
+		 * attach, we need to interpose a bwrap
+		 */
+		na->nm_bdg_attach = netmap_bwrap_attach;
	return 0;
}
 
@@ -2273,6 +2620,28 @@ netmap_detach_common(struct netmap_adapter *na)
 
	free(na, M_DEVBUF);
}
 
+/* Wrapper for the register callback provided by hardware drivers.
+ * na->ifp == NULL means that the driver module has been
+ * unloaded, so we cannot call into it.
+ * Note that module unloading, in our patched linux drivers,
+ * happens under NMG_LOCK and after having stopped all the
+ * nic rings (see netmap_detach). This provides sufficient
+ * protection for the other driver-provided callbacks
+ * (i.e., nm_config and nm_*xsync), that therefore don't need
+ * to be wrapped.
+ */
+static int
+netmap_hw_register(struct netmap_adapter *na, int onoff)
+{
+	struct netmap_hw_adapter *hwna =
+		(struct netmap_hw_adapter*)na;
+
+	if (na->ifp == NULL)
+		return onoff ? ENXIO : 0;
+
+	return hwna->nm_hw_register(na, onoff);
+}
+
 
/*
 * Initialize a ``netmap_adapter`` object created by driver on attach.
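 * A sketch of such an attach path (made-up foo_* names, mirroring
 * the vtnet example earlier in this commit):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->foo_ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	netmap_attach(&na);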
@@ -2298,6 +2667,9 @@ netmap_attach(struct netmap_adapter *arg) goto fail; hwna->up = *arg; hwna->up.na_flags |= NAF_HOST_RINGS; + strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); + hwna->nm_hw_register = hwna->up.nm_register; + hwna->up.nm_register = netmap_hw_register; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; @@ -2314,10 +2686,20 @@ netmap_attach(struct netmap_adapter *arg) #endif } hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; + if (ifp->ethtool_ops) { + hwna->nm_eto = *ifp->ethtool_ops; + } + hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; +#ifdef ETHTOOL_SCHANNELS + hwna->nm_eto.set_channels = linux_netmap_set_channels; +#endif + if (arg->nm_config == NULL) { + hwna->up.nm_config = netmap_linux_config; + } #endif /* linux */ D("success for %s tx %d/%d rx %d/%d queues/slots", - NM_IFPNAME(ifp), + hwna->up.name, hwna->up.num_tx_rings, hwna->up.num_tx_desc, hwna->up.num_rx_rings, hwna->up.num_rx_desc ); @@ -2393,6 +2775,8 @@ netmap_detach(struct ifnet *ifp) * tell them that the interface is gone */ na->ifp = NULL; + // XXX also clear NAF_NATIVE_ON ? + na->na_flags &= ~NAF_NETMAP_ON; /* give them a chance to notice */ netmap_enable_all_rings(ifp); } @@ -2426,8 +2810,8 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) // if we follow the down/configure/up protocol -gl // mtx_lock(&na->core_lock); - if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { - D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); + if (!nm_netmap_on(na)) { + D("%s not in netmap mode anymore", na->name); error = ENXIO; goto done; } @@ -2436,9 +2820,9 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) q = &kring->rx_queue; // XXX reconsider long packets if we handle fragments - if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ - D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), - len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); + if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */ + D("%s from_host, drop packet size %d > %d", na->name, + len, NETMAP_BUF_SIZE(na)); goto done; } @@ -2454,12 +2838,12 @@ netmap_transmit(struct ifnet *ifp, struct mbuf *m) space += kring->nkr_num_slots; if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", - NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), + na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), len, m); } else { mbq_enqueue(q, m); ND(10, "%s %d bufs in queue len %d m %p", - NM_IFPNAME(ifp), mbq_len(q), len, m); + na->name, mbq_len(q), len, m); /* notify outside the lock */ m = NULL; error = 0; @@ -2492,12 +2876,8 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, struct netmap_kring *kring; int new_hwofs, lim; - if (na == NULL) { - D("NULL na, should not happen"); - return NULL; /* no netmap support here */ - } - if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { - ND("interface not in netmap mode"); + if (!nm_native_on(na)) { + ND("interface not in native netmap mode"); return NULL; /* nothing to reinitialize */ } @@ -2528,7 +2908,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, /* Always set the new offset value and realign the ring. */ if (netmap_verbose) D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", - NM_IFPNAME(na->ifp), + na->name, tx == NR_TX ? "TX" : "RX", n, kring->nkr_hwofs, new_hwofs, kring->nr_hwtail, @@ -2570,8 +2950,9 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * The 'notify' routine depends on what the ring is attached to. 
* - for a netmap file descriptor, do a selwakeup on the individual * waitqueue, plus one on the global one if needed - * - for a switch, call the proper forwarding routine - * - XXX more ? + * (see netmap_notify) + * - for a nic connected to a switch, call the proper forwarding routine + * (see netmap_bwrap_intr_notify) */ void netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) @@ -2620,11 +3001,18 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - // XXX could we check NAF_NATIVE_ON ? - if (!(ifp->if_capenable & IFCAP_NETMAP)) + struct netmap_adapter *na = NA(ifp); + + /* + * XXX emulated netmap mode sets NAF_SKIP_INTR so + * we still use the regular driver even though the previous + * check fails. It is unclear whether we should use + * nm_native_on() here. + */ + if (!nm_netmap_on(na)) return 0; - if (NA(ifp)->na_flags & NAF_SKIP_INTR) { + if (na->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); return 0; } @@ -2677,6 +3065,9 @@ netmap_init(void) goto fail; netmap_init_bridges(); +#ifdef __FreeBSD__ + nm_vi_init_index(); +#endif printf("netmap: loaded module\n"); return (0); fail: diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index e43d669..160b7c0 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -50,6 +50,9 @@ #include <sys/selinfo.h> #include <net/if.h> #include <net/if_var.h> +#include <net/if_types.h> /* IFT_ETHER */ +#include <net/ethernet.h> /* ether_ifdetach */ +#include <net/if_dl.h> /* LLADDR */ #include <machine/bus.h> /* bus_dmamap_* */ #include <netinet/in.h> /* in6_cksum_pseudo() */ #include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */ @@ -91,8 +94,7 @@ nm_csum_fold(rawsum_t cur_sum) return htobe16((~cur_sum) & 0xFFFF); } -uint16_t -nm_csum_ipv4(struct nm_iphdr *iph) +uint16_t nm_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); @@ -148,8 +150,7 @@ nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, int netmap_catch_rx(struct netmap_adapter *na, int intercept) { - struct netmap_generic_adapter *gna = - (struct netmap_generic_adapter *)na; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; struct ifnet *ifp = na->ifp; if (intercept) { @@ -221,9 +222,9 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, * (and eventually, just reference the netmap buffer) */ - if (*m->m_ext.ref_cnt != 1) { + if (GET_MBUF_REFCNT(m) != 1) { D("invalid refcnt %d for %p", - *m->m_ext.ref_cnt, m); + GET_MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } // XXX the ext_size check is unnecessary if we link the netmap buf @@ -231,14 +232,14 @@ generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, RD(5, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } - if (1) { /* XXX seems to have negligible benefits */ + if (0) { /* XXX seems to have negligible benefits */ m->m_ext.ext_buf = m->m_data = addr; } else { bcopy(addr, m->m_data, len); } m->m_len = m->m_pkthdr.len = len; // inc refcount. 
All ours, we could skip the atomic
-	atomic_fetchadd_int(m->m_ext.ref_cnt, 1);
+	atomic_fetchadd_int(PNT_MBUF_REFCNT(m), 1);
	m->m_flags |= M_FLOWID;
	m->m_pkthdr.flowid = ring_nr;
	m->m_pkthdr.rcvif = ifp; /* used for tx notification */
@@ -277,10 +278,11 @@ generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
 
 
 void
-netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na)
+netmap_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na)
 {
	ND("called");
	mit->mit_pending = 0;
+	mit->mit_ring_idx = idx;
	mit->mit_na = na;
 }
 
@@ -313,6 +315,135 @@ netmap_mitigation_cleanup(struct nm_generic_mit *mit)
	ND("called");
 }
 
+static int
+nm_vi_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+	return EINVAL;
+}
+
+static void
+nm_vi_start(struct ifnet *ifp)
+{
+	panic("nm_vi_start() must not be called");
+}
+
+/*
+ * Index manager of persistent virtual interfaces.
+ * It is used to decide the lowest byte of the MAC address.
+ * We use the same algorithm as for the management of bridge port indices.
+ */
+#define NM_VI_MAX 255
+static struct {
+	uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */
+	uint8_t active;
+	struct mtx lock;
+} nm_vi_indices;
+
+void
+nm_vi_init_index(void)
+{
+	int i;
+	for (i = 0; i < NM_VI_MAX; i++)
+		nm_vi_indices.index[i] = i;
+	nm_vi_indices.active = 0;
+	mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF);
+}
+
+/* return -1 if no index available */
+static int
+nm_vi_get_index(void)
+{
+	int ret;
+
+	mtx_lock(&nm_vi_indices.lock);
+	ret = nm_vi_indices.active == NM_VI_MAX ? -1 :
+		nm_vi_indices.index[nm_vi_indices.active++];
+	mtx_unlock(&nm_vi_indices.lock);
+	return ret;
+}
+
+static void
+nm_vi_free_index(uint8_t val)
+{
+	int i, lim;
+
+	mtx_lock(&nm_vi_indices.lock);
+	lim = nm_vi_indices.active;
+	for (i = 0; i < lim; i++) {
+		if (nm_vi_indices.index[i] == val) {
+			/* swap index[lim-1] and index[i] */
+			int tmp = nm_vi_indices.index[lim-1];
+			nm_vi_indices.index[lim-1] = val;
+			nm_vi_indices.index[i] = tmp;
+			nm_vi_indices.active--;
+			break;
+		}
+	}
+	if (lim == nm_vi_indices.active)
+		D("funny, index %u wasn't found", val);
+	mtx_unlock(&nm_vi_indices.lock);
+}
+#undef NM_VI_MAX
+
+/*
+ * Implementation of a netmap-capable virtual interface that is
+ * registered to the system.
+ * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9.
+ *
+ * Note: Linux sets refcount to 0 on allocation of net_device,
+ * then increments it on registration to the system.
+ * FreeBSD sets refcount to 1 on if_alloc(), and does not
+ * increment this refcount on if_attach().
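+ *
+ * A hypothetical usage sketch (error handling omitted):
+ *
+ *	struct ifnet *ifp;
+ *
+ *	if (nm_vi_persist("myport", &ifp))	// create and register
+ *		return EBUSY;
+ *	// ... use the interface ...
+ *	nm_vi_detach(ifp);			// unregister and free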
+ */
+int
+nm_vi_persist(const char *name, struct ifnet **ret)
+{
+	struct ifnet *ifp;
+	u_short macaddr_hi;
+	uint32_t macaddr_mid;
+	u_char eaddr[6];
+	int unit = nm_vi_get_index(); /* just to decide MAC address */
+
+	if (unit < 0)
+		return EBUSY;
+	/*
+	 * We use the same MAC address generation method as tap,
+	 * except that the highest octet is 00:be instead of 00:bd
+	 */
+	macaddr_hi = htons(0x00be); /* XXX tap + 1 */
+	macaddr_mid = (uint32_t) ticks;
+	bcopy(&macaddr_hi, eaddr, sizeof(short));
+	bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t));
+	eaddr[5] = (uint8_t)unit;
+
+	ifp = if_alloc(IFT_ETHER);
+	if (ifp == NULL) {
+		D("if_alloc failed");
+		return ENOMEM;
+	}
+	if_initname(ifp, name, IF_DUNIT_NONE);
+	ifp->if_mtu = 65536;
+	ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+	ifp->if_init = (void *)nm_vi_dummy;
+	ifp->if_ioctl = nm_vi_dummy;
+	ifp->if_start = nm_vi_start;
+	ifp->if_mtu = ETHERMTU;
+	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
+	ifp->if_capabilities |= IFCAP_LINKSTATE;
+	ifp->if_capenable |= IFCAP_LINKSTATE;
+
+	ether_ifattach(ifp, eaddr);
+	*ret = ifp;
+	return 0;
+}
+/* unregister from the system and drop the final refcount */
+void
+nm_vi_detach(struct ifnet *ifp)
+{
+	nm_vi_free_index(((char *)IF_LLADDR(ifp))[5]);
+	ether_ifdetach(ifp);
+	if_free(ifp);
+}
 
 /*
  * In order to track whether pages are still mapped, we hook into
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index a14dbc2..7740382 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -102,51 +102,42 @@ __FBSDID("$FreeBSD$");
  * mbuf wrappers
  */
 
-/*
- * mbuf destructor, also need to change the type to EXT_EXTREF,
+/* mbuf destructor, also need to change the type to EXT_EXTREF,
  * add an M_NOFREE flag, and then clear the flag and
  * chain into uma_zfree(zone_pack, mf)
  * (or reinstall the buffer ?)
- *
- * On FreeBSD 9 the destructor is called as ext_free(ext_arg1, ext_arg2)
- * whereas newer version have ext_free(m, ext_arg1, ext_arg2)
- * For compatibility we set ext_arg1 = m on allocation so we have
- * the same code on both.
 */
#define SET_MBUF_DESTRUCTOR(m, fn)	do {		\
	(m)->m_ext.ext_free = (void *)fn;	\
	(m)->m_ext.ext_type = EXT_EXTREF;	\
} while (0)
 
-static void
+static void
 netmap_default_mbuf_destructor(struct mbuf *m)
-{
-	/* restore original data pointer and type */
-	m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg2;
+{
+	/* restore original mbuf */
+	m->m_ext.ext_buf = m->m_data = m->m_ext.ext_arg1;
+	m->m_ext.ext_arg1 = NULL;
	m->m_ext.ext_type = EXT_PACKET;
	m->m_ext.ext_free = NULL;
-	m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL;
-	if (*(m->m_ext.ref_cnt) == 0)
-		*(m->m_ext.ref_cnt) = 1;
+	if (GET_MBUF_REFCNT(m) == 0)
+		SET_MBUF_REFCNT(m, 1);
	uma_zfree(zone_pack, m);
-}
+}
 
-static inline struct mbuf *
-netmap_get_mbuf(int len)
-{
+static inline struct mbuf *
+netmap_get_mbuf(int len)
+{
	struct mbuf *m;
	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE);
	if (m) {
-		m->m_ext.ext_arg1 = m; /* FreeBSD 9 compat */
-		m->m_ext.ext_arg2 = m->m_ext.ext_buf; /* save original */
+		m->m_ext.ext_arg1 = m->m_ext.ext_buf; // XXX save
		m->m_ext.ext_free = (void *)netmap_default_mbuf_destructor;
		m->m_ext.ext_type = EXT_EXTREF;
-		ND(5, "create m %p refcnt %d", m, *m->m_ext.ref_cnt);
+		ND(5, "create m %p refcnt %d", m, GET_MBUF_REFCNT(m));
	}
	return m;
-}
-
-#define GET_MBUF_REFCNT(m)	((m)->m_ext.ref_cnt ?
*(m)->m_ext.ref_cnt : -1) +} @@ -158,8 +149,6 @@ netmap_get_mbuf(int len) #include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */ #include <linux/hrtimer.h> -//#define RATE /* Enables communication statistics. */ - //#define REG_RESET #endif /* linux */ @@ -174,7 +163,7 @@ netmap_get_mbuf(int len) /* ======================== usage stats =========================== */ -#ifdef RATE +#ifdef RATE_GENERIC #define IFRATE(x) x struct rate_stats { unsigned long txpkt; @@ -218,23 +207,33 @@ static void rate_callback(unsigned long arg) static struct rate_context rate_ctx; +void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi) +{ + if (txp) rate_ctx.new.txpkt++; + if (txs) rate_ctx.new.txsync++; + if (txi) rate_ctx.new.txirq++; + if (rxp) rate_ctx.new.rxpkt++; + if (rxs) rate_ctx.new.rxsync++; + if (rxi) rate_ctx.new.rxirq++; +} + #else /* !RATE */ #define IFRATE(x) #endif /* !RATE */ /* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */ -#define GENERIC_BUF_SIZE netmap_buf_size /* Size of the mbufs in the Tx pool. */ /* * Wrapper used by the generic adapter layer to notify * the poller threads. Differently from netmap_rx_irq(), we check - * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq. + * only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq. */ static void netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP))) + struct netmap_adapter *na = NA(ifp); + if (unlikely(!nm_netmap_on(na))) return; netmap_common_irq(ifp, q, work_done); @@ -245,7 +244,6 @@ netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) static int generic_netmap_register(struct netmap_adapter *na, int enable) { - struct ifnet *ifp = na->ifp; struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; struct mbuf *m; int error; @@ -271,7 +269,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable) goto out; } for (r=0; r<na->num_rx_rings; r++) - netmap_mitigation_init(&gna->mit[r], na); + netmap_mitigation_init(&gna->mit[r], r, na); /* Initialize the rx queue, as generic_rx_handler() can * be called as soon as netmap_catch_rx() returns. @@ -296,7 +294,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable) for (i=0; i<na->num_tx_desc; i++) na->tx_rings[r].tx_pool[i] = NULL; for (i=0; i<na->num_tx_desc; i++) { - m = netmap_get_mbuf(GENERIC_BUF_SIZE); + m = netmap_get_mbuf(NETMAP_BUF_SIZE(na)); if (!m) { D("tx_pool[%d] allocation failed", i); error = ENOMEM; @@ -312,14 +310,14 @@ generic_netmap_register(struct netmap_adapter *na, int enable) D("netdev_rx_handler_register() failed (%d)", error); goto register_handler; } - ifp->if_capenable |= IFCAP_NETMAP; + na->na_flags |= NAF_NETMAP_ON; /* Make netmap control the packet steering. */ netmap_catch_tx(gna, 1); rtnl_unlock(); -#ifdef RATE +#ifdef RATE_GENERIC if (rate_ctx.refcount == 0) { D("setup_timer()"); memset(&rate_ctx, 0, sizeof(rate_ctx)); @@ -338,7 +336,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable) error handling code below. */ rtnl_lock(); - ifp->if_capenable &= ~IFCAP_NETMAP; + na->na_flags &= ~NAF_NETMAP_ON; /* Release packet steering control. 
	 */
	netmap_catch_tx(gna, 0);
 
@@ -365,7 +363,7 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
		free(na->tx_rings[r].tx_pool, M_DEVBUF);
	}
 
-#ifdef RATE
+#ifdef RATE_GENERIC
	if (--rate_ctx.refcount == 0) {
		D("del_timer()");
		del_timer(&rate_ctx.timer);
@@ -421,6 +419,8 @@ generic_mbuf_destructor(struct mbuf *m)
	IFRATE(rate_ctx.new.txirq++);
}
 
+extern int netmap_adaptive_io;
+
/* Record completed transmissions and update hwtail.
 *
 * The oldest tx buffer not yet completed is at nr_hwtail + 1,
@@ -440,7 +440,7 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
 
		if (unlikely(m == NULL)) {
			/* this is done, try to replenish the entry */
-			tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+			tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(kring->na));
			if (unlikely(m == NULL)) {
				D("mbuf allocation failed, XXX error");
				// XXX how do we proceed ? break ?
@@ -451,6 +451,23 @@ generic_netmap_tx_clean(struct netmap_kring *kring)
		}
		n++;
		nm_i = nm_next(nm_i, lim);
+#if 0 /* rate adaptation */
+		if (netmap_adaptive_io > 1) {
+			if (n >= netmap_adaptive_io)
+				break;
+		} else if (netmap_adaptive_io) {
+			/* if hwcur - nm_i < lim/8 do an early break
+			 * so we prevent the sender from stalling. See CVT.
+			 */
+			if (hwcur >= nm_i) {
+				if (hwcur - nm_i < lim/2)
+					break;
+			} else {
+				if (hwcur + lim + 1 - nm_i < lim/2)
+					break;
+			}
+		}
+#endif
	}
	kring->nr_hwtail = nm_prev(nm_i, lim);
	ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
@@ -530,14 +547,15 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
 * since it implements the TX flow control (and takes some locks).
 */
static int
-generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+generic_netmap_txsync(struct netmap_kring *kring, int flags)
{
+	struct netmap_adapter *na = kring->na;
	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i;	/* index into the netmap ring */ // j
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
+	u_int ring_nr = kring->ring_id;
 
	IFRATE(rate_ctx.new.txsync++);
 
@@ -553,19 +571,19 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
	while (nm_i != head) {
		struct netmap_slot *slot = &ring->slot[nm_i];
		u_int len = slot->len;
-		void *addr = NMB(slot);
+		void *addr = NMB(na, slot);
 
		/* device-specific */
		struct mbuf *m;
		int tx_ret;
 
-		NM_CHECK_ADDR_LEN(addr, len);
+		NM_CHECK_ADDR_LEN(na, addr, len);
 
		/* Take an mbuf from the tx pool and copy in the user packet. */
		m = kring->tx_pool[nm_i];
		if (unlikely(!m)) {
			RD(5, "This should never happen");
-			kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+			kring->tx_pool[nm_i] = m = netmap_get_mbuf(NETMAP_BUF_SIZE(na));
			if (unlikely(m == NULL)) {
				D("mbuf allocation failed");
				break;
@@ -580,7 +598,7 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
		 */
		tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
		if (unlikely(tx_ret)) {
-			RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
+			ND(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
				tx_ret, nm_i, head, kring->nr_hwtail);
			/*
			 * No room for this mbuf in the device driver.
@@ -686,10 +704,10 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m) * Access must be protected because the rx handler is asynchronous, */ static int -generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +generic_netmap_rxsync(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; + struct netmap_adapter *na = kring->na; u_int nm_i; /* index into the netmap ring */ //j, u_int n; u_int const lim = kring->nkr_num_slots - 1; @@ -712,11 +730,11 @@ generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ for (n = 0; nm_i != stop_i; n++) { int len; - void *addr = NMB(&ring->slot[nm_i]); + void *addr = NMB(na, &ring->slot[nm_i]); struct mbuf *m; /* we only check the address here on generic rx rings */ - if (addr == netmap_buffer_base) { /* Bad buffer */ + if (addr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ return netmap_ring_reinit(kring); } /* @@ -823,7 +841,7 @@ generic_netmap_attach(struct ifnet *ifp) na->nm_txsync = &generic_netmap_txsync; na->nm_rxsync = &generic_netmap_rxsync; na->nm_dtor = &generic_netmap_dtor; - /* when using generic, IFCAP_NETMAP is set so we force + /* when using generic, NAF_NETMAP_ON is set so we force * NAF_SKIP_INTR to use the regular interrupt handler */ na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS; diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 087564c..dc6afd8 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -36,6 +36,7 @@ #define WITH_VALE // comment out to disable VALE support #define WITH_PIPES +#define WITH_MONITOR #if defined(__FreeBSD__) @@ -66,11 +67,23 @@ struct netmap_adapter *netmap_getna(if_t ifp); #endif +#if __FreeBSD_version >= 1100027 +#define GET_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt ? *((m)->m_ext.ext_cnt) : -1) +#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ext_cnt) = x +#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ext_cnt) +#else +#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *((m)->m_ext.ref_cnt) : -1) +#define SET_MBUF_REFCNT(m, x) *((m)->m_ext.ref_cnt) = x +#define PNT_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt) +#endif + MALLOC_DECLARE(M_NETMAP); // XXX linux struct, not used in FreeBSD struct net_device_ops { }; +struct ethtool_ops { +}; struct hrtimer { }; @@ -82,7 +95,7 @@ struct hrtimer { #define MBUF_IFP(m) ((m)->dev) #define NM_SEND_UP(ifp, m) \ do { \ - m->priority = NM_MAGIC_PRIORITY; \ + m->priority = NM_MAGIC_PRIORITY_RX; \ netif_rx(m); \ } while (0) @@ -100,18 +113,6 @@ struct hrtimer { #define DEV_NETMAP #endif /* DEV_NETMAP */ -/* - * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable). - * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older - * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT. - * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1 - */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) -#define IFCAP_NETMAP 0x8000 -#else -#define IFCAP_NETMAP 0x200000 -#endif - #elif defined (__APPLE__) #warning apple support is incomplete. @@ -215,7 +216,7 @@ extern NMG_LOCK_T netmap_global_lock; * rxsync_from_host() and netmap_transmit(). The mbq is protected * by its internal lock. * - * RX rings attached to the VALE switch are accessed by both sender + * RX rings attached to the VALE switch are accessed by both senders * and receiver. They are protected through the q_lock on the RX ring. 
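 * (e.g. both the VALE forwarding code delivering into a port and
 * the port's own rxsync bracket their updates with
 *	mtx_lock(&kring->q_lock);
 *	...
 *	mtx_unlock(&kring->q_lock);
 * on the destination RX kring.)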
*/ struct netmap_kring { @@ -266,7 +267,13 @@ struct netmap_kring { uint32_t nkr_hwlease; uint32_t nkr_lease_idx; - volatile int nkr_stopped; // XXX what for ? + /* while nkr_stopped is set, no new [tr]xsync operations can + * be started on this kring. + * This is used by netmap_disable_all_rings() + * to find a synchronization point where critical data + * structures pointed to by the kring can be added or removed + */ + volatile int nkr_stopped; /* Support for adapters without native netmap support. * On tx rings we preallocate an array of tx buffers @@ -281,13 +288,40 @@ struct netmap_kring { uint32_t ring_id; /* debugging */ char name[64]; /* diagnostic */ + /* [tx]sync callback for this kring. + * The default nm_kring_create callback (netmap_krings_create) + * sets the nm_sync callback of each hardware tx(rx) kring to + * the corresponding nm_txsync(nm_rxsync) taken from the + * netmap_adapter; moreover, it sets the sync callback + * of the host tx(rx) ring to netmap_txsync_to_host + * (netmap_rxsync_from_host). + * + * Overrides: the above configuration is not changed by + * any of the nm_krings_create callbacks. + */ int (*nm_sync)(struct netmap_kring *kring, int flags); #ifdef WITH_PIPES - struct netmap_kring *pipe; - struct netmap_ring *save_ring; + struct netmap_kring *pipe; /* if this is a pipe ring, + * pointer to the other end + */ + struct netmap_ring *save_ring; /* pointer to hidden rings + * (see netmap_pipe.c for details) + */ #endif /* WITH_PIPES */ +#ifdef WITH_MONITOR + /* pointer to the adapter that is monitoring this kring (if any) + */ + struct netmap_monitor_adapter *monitor; + /* + * Monitors work by intercepting the txsync and/or rxsync of the + * monitored krings. This is implemented by replacing + * the nm_sync pointer above and saving the previous + * one in save_sync below. + */ + int (*save_sync)(struct netmap_kring *kring, int flags); +#endif } __attribute__((__aligned__(64))); @@ -360,6 +394,8 @@ tail->| |<-hwtail | |<-hwlease enum txrx { NR_RX = 0, NR_TX = 1 }; +struct netmap_vp_adapter; // forward + /* * The "struct netmap_adapter" extends the "struct adapter" * (or equivalent) device descriptor. @@ -390,13 +426,19 @@ struct netmap_adapter { * deallocation of the memory allocator */ #define NAF_NATIVE_ON 16 /* the adapter is native and the attached - * interface is in netmap mode + * interface is in netmap mode. + * Virtual ports (vale, pipe, monitor...) + * should never use this flag. */ #define NAF_NETMAP_ON 32 /* netmap is active (either native or - * emulated. Where possible (e.g. FreeBSD) + * emulated). Where possible (e.g. FreeBSD) * IFCAP_NETMAP also mirrors this flag. */ #define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ +#define NAF_FORCE_NATIVE 128 /* the adapter is always NATIVE */ +#define NAF_BUSY (1U<<31) /* the adapter is used internally and + * cannot be registered from userspace + */ int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ @@ -423,6 +465,8 @@ struct netmap_adapter { /* count users of the global wait queues */ int tx_si_users, rx_si_users; + void *pdev; /* used to store pci device */ + /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ @@ -444,7 +488,7 @@ struct netmap_adapter { * * nm_register() is called on NIOCREGIF and close() to enter * or exit netmap mode on the NIC - * Called with NMG_LOCK held. + * Called with NNG_LOCK held. 
* * nm_txsync() pushes packets to the underlying hw/switch * @@ -453,14 +497,20 @@ struct netmap_adapter { * nm_config() returns configuration information from the OS * Called with NMG_LOCK held. * - * nm_krings_create() create and init the krings array - * (the array layout must conform to the description - * found above the definition of netmap_krings_create) + * nm_krings_create() create and init the tx_rings and + * rx_rings arrays of kring structures. In particular, + * set the nm_sync callbacks for each ring. + * There is no need to also allocate the corresponding + * netmap_rings, since netmap_mem_rings_create() will always + * be called to provide the missing ones. + * Called with NMG_LOCK held. * - * nm_krings_delete() cleanup and delete the kring array + * nm_krings_delete() cleanup and delete the tx_rings and rx_rings + * arrays + * Called with NMG_LOCK held. * * nm_notify() is used to act after data have become available - * (or the stopped state of the ring has changed) + * (or the stopped state of the ring has changed) * For hw devices this is typically a selwakeup(), * but for NIC/host ports attached to a switch (or vice-versa) * we also need to invoke the 'txsync' code downstream. @@ -469,8 +519,8 @@ struct netmap_adapter { int (*nm_register)(struct netmap_adapter *, int onoff); - int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags); - int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags); + int (*nm_txsync)(struct netmap_kring *kring, int flags); + int (*nm_rxsync)(struct netmap_kring *kring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 /* return configuration information */ @@ -480,7 +530,35 @@ struct netmap_adapter { void (*nm_krings_delete)(struct netmap_adapter *); int (*nm_notify)(struct netmap_adapter *, u_int ring, enum txrx, int flags); -#define NAF_DISABLE_NOTIFY 8 +#define NAF_DISABLE_NOTIFY 8 /* notify that the stopped state of the + * ring has changed (kring->nkr_stopped) + */ + +#ifdef WITH_VALE + /* + * nm_bdg_attach() initializes the na_vp field to point + * to an adapter that can be attached to a VALE switch. If the + * current adapter is already a VALE port, na_vp is simply a cast; + * otherwise, na_vp points to a netmap_bwrap_adapter. + * If applicable, this callback also initializes na_hostvp, + * that can be used to connect the adapter host rings to the + * switch. + * Called with NMG_LOCK held. + * + * nm_bdg_ctl() is called on the actual attach/detach + * to/from the switch, to perform adapter-specific + * initializations + * Called with NMG_LOCK held. + */ + int (*nm_bdg_attach)(const char *bdg_name, struct netmap_adapter *); + int (*nm_bdg_ctl)(struct netmap_adapter *, struct nmreq *, int); + + /* adapter used to attach this adapter to a VALE switch (if any) */ + struct netmap_vp_adapter *na_vp; + /* adapter used to attach the host rings of this adapter + * to a VALE switch (if any) */ + struct netmap_vp_adapter *na_hostvp; +#endif /* standard refcount to control the lifetime of the adapter * (it should be equal to the lifetime of the corresponding ifp) @@ -494,17 +572,22 @@ struct netmap_adapter { struct netmap_mem_d *nm_mem; struct lut_entry *na_lut; uint32_t na_lut_objtotal; /* max buffer index */ + uint32_t na_lut_objsize; /* buffer size */ - /* used internally. If non-null, the interface cannot be bound - * from userspace + /* additional information attached to this adapter + * by other netmap subsystems. Currently used by + * bwrap and LINUX/v1000.
*/ void *na_private; #ifdef WITH_PIPES + /* array of pipes that have this adapter as a parent */ struct netmap_pipe_adapter **na_pipes; - int na_next_pipe; - int na_max_pipes; + int na_next_pipe; /* next free slot in the array */ + int na_max_pipes; /* size of the array */ #endif /* WITH_PIPES */ + + char name[64]; }; @@ -514,9 +597,9 @@ struct netmap_adapter { * if the NIC is owned by a user, only users can share it. * Evaluation must be done under NMG_LOCK(). */ -#define NETMAP_OWNED_BY_KERN(na) (na->na_private) +#define NETMAP_OWNED_BY_KERN(na) ((na)->na_flags & NAF_BUSY) #define NETMAP_OWNED_BY_ANY(na) \ (NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0)) + (NETMAP_OWNED_BY_KERN(na) || ((na)->active_fds > 0)) /* @@ -546,12 +629,17 @@ struct netmap_hw_adapter { /* physical device */ struct netmap_adapter up; struct net_device_ops nm_ndo; // XXX linux only + struct ethtool_ops nm_eto; // XXX linux only + const struct ethtool_ops* save_ethtool; + + int (*nm_hw_register)(struct netmap_adapter *, int onoff); }; /* Mitigation support. */ struct nm_generic_mit { struct hrtimer mit_timer; int mit_pending; + int mit_ring_idx; /* index of the ring being mitigated */ struct netmap_adapter *mit_na; /* backpointer */ }; @@ -641,16 +729,19 @@ struct netmap_bwrap_adapter { /* backup of the hwna notify callback */ int (*save_notify)(struct netmap_adapter *, u_int ring, enum txrx, int flags); + /* backup of the hwna memory allocator */ + struct netmap_mem_d *save_nmd; /* * When we attach a physical interface to the bridge, we * allow the controlling process to terminate, so we need - * a place to store the n_detmap_priv_d data structure. + * a place to store the netmap_priv_d data structure. * This is only done when physical interfaces * are attached to a bridge. */ struct netmap_priv_d *na_kpriv; }; +int netmap_bwrap_attach(const char *name, struct netmap_adapter *); #endif /* WITH_VALE */ @@ -747,12 +838,11 @@ static __inline int nm_kr_tryget(struct netmap_kring *kr) * netmap_load_map/netmap_reload_map are helper routines to set/reset * the dmamap for a packet buffer * - * netmap_reset() is a helper routine to be called in the driver - * when reinitializing a ring. + * netmap_reset() is a helper routine to be called in the hw driver + * when reinitializing a ring.
It should not be called by + * virtual ports (vale, pipes, monitor) */ int netmap_attach(struct netmap_adapter *); -int netmap_attach_common(struct netmap_adapter *); -void netmap_detach_common(struct netmap_adapter *na); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); struct netmap_slot *netmap_reset(struct netmap_adapter *na, @@ -764,10 +854,33 @@ int netmap_rx_irq(struct ifnet *, u_int, u_int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); -void netmap_disable_all_rings(struct ifnet *); -void netmap_enable_all_rings(struct ifnet *); -void netmap_disable_ring(struct netmap_kring *kr); +#ifdef WITH_VALE +/* functions used by external modules to interface with VALE */ +#define netmap_vp_to_ifp(_vp) ((_vp)->up.ifp) +#define netmap_ifp_to_vp(_ifp) (NA(_ifp)->na_vp) +#define netmap_ifp_to_host_vp(_ifp) (NA(_ifp)->na_hostvp) +#define netmap_bdg_idx(_vp) ((_vp)->bdg_port) +const char *netmap_bdg_name(struct netmap_vp_adapter *); +#else /* !WITH_VALE */ +#define netmap_vp_to_ifp(_vp) NULL +#define netmap_ifp_to_vp(_ifp) NULL +#define netmap_ifp_to_host_vp(_ifp) NULL +#define netmap_bdg_idx(_vp) -1 +#define netmap_bdg_name(_vp) NULL +#endif /* WITH_VALE */ + +static inline int +nm_native_on(struct netmap_adapter *na) +{ + return na && na->na_flags & NAF_NATIVE_ON; +} + +static inline int +nm_netmap_on(struct netmap_adapter *na) +{ + return na && na->na_flags & NAF_NETMAP_ON; +} /* set/clear native flags and if_transmit/netdev_ops */ static inline void @@ -785,6 +898,8 @@ nm_set_native_flags(struct netmap_adapter *na) #else na->if_transmit = (void *)ifp->netdev_ops; ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; + ((struct netmap_hw_adapter *)na)->save_ethtool = ifp->ethtool_ops; + ifp->ethtool_ops = &((struct netmap_hw_adapter*)na)->nm_eto; #endif } @@ -798,6 +913,7 @@ nm_clear_native_flags(struct netmap_adapter *na) ifp->if_transmit = na->if_transmit; #else ifp->netdev_ops = (void *)na->if_transmit; + ifp->ethtool_ops = ((struct netmap_hw_adapter*)na)->save_ethtool; #endif na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON); #ifdef IFCAP_NETMAP /* or FreeBSD ? */ @@ -858,30 +974,72 @@ nm_rxsync_finalize(struct netmap_kring *kring) /* check/fix address and len in tx rings */ #if 1 /* debug version */ -#define NM_CHECK_ADDR_LEN(_a, _l) do { \ - if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) { \ +#define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ + if (_a == NETMAP_BUF_BASE(_na) || _l > NETMAP_BUF_SIZE(_na)) { \ RD(5, "bad addr/len ring %d slot %d idx %d len %d", \ - ring_nr, nm_i, slot->buf_idx, len); \ - if (_l > NETMAP_BUF_SIZE) \ - _l = NETMAP_BUF_SIZE; \ + kring->ring_id, nm_i, slot->buf_idx, len); \ + if (_l > NETMAP_BUF_SIZE(_na)) \ + _l = NETMAP_BUF_SIZE(_na); \ } } while (0) #else /* no debug version */ -#define NM_CHECK_ADDR_LEN(_a, _l) do { \ - if (_l > NETMAP_BUF_SIZE) \ - _l = NETMAP_BUF_SIZE; \ +#define NM_CHECK_ADDR_LEN(_na, _a, _l) do { \ + if (_l > NETMAP_BUF_SIZE(_na)) \ + _l = NETMAP_BUF_SIZE(_na); \ } while (0) #endif /*---------------------------------------------------------------*/ /* - * Support routines to be used with the VALE switch + * Support routines used by netmap subsystems + * (native drivers, VALE, generic, pipes, monitors, ...) + */ + + +/* common routine for all functions that create a netmap adapter. 
It performs + * two main tasks: + * - if the na points to an ifp, mark the ifp as netmap capable + * using na as its native adapter; + * - provide defaults for the setup callbacks and the memory allocator + */ +int netmap_attach_common(struct netmap_adapter *); +/* common actions to be performed on netmap adapter destruction */ +void netmap_detach_common(struct netmap_adapter *); +/* fill priv->np_[tr]xq{first,last} using the ringid and flags information + * coming from a struct nmreq + */ +int netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags); +/* update the ring parameters (number and size of tx and rx rings). + * It calls the nm_config callback, if available. */ int netmap_update_config(struct netmap_adapter *na); +/* create and initialize the common fields of the krings array. + * using the information that must be already available in the na. + * tailroom can be used to request the allocation of additional + * tailroom bytes after the krings array. This is used by + * netmap_vp_adapter's (i.e., VALE ports) to make room for + * leasing-related data structures + */ int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); +/* deletes the kring array of the adapter. The array must have + * been created using netmap_krings_create + */ void netmap_krings_delete(struct netmap_adapter *na); -int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); +/* set the stopped/enabled status of ring + * When stopping, they also wait for all current activity on the ring to + * terminate. The status change is then notified using the na nm_notify + * callback. + */ +void netmap_set_txring(struct netmap_adapter *, u_int ring_id, int stopped); +void netmap_set_rxring(struct netmap_adapter *, u_int ring_id, int stopped); +/* set the stopped/enabled status of all rings of the adapter. */ +void netmap_set_all_rings(struct netmap_adapter *, int stopped); +/* convenience wrappers for netmap_set_all_rings, used in drivers */ +void netmap_disable_all_rings(struct ifnet *); +void netmap_enable_all_rings(struct ifnet *); + +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, @@ -904,10 +1062,18 @@ int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. 
*/ -typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, - uint8_t *ring_nr, struct netmap_vp_adapter *); -u_int netmap_bdg_learning(char *, u_int, uint8_t *, - struct netmap_vp_adapter *); +typedef u_int (*bdg_lookup_fn_t)(struct nm_bdg_fwd *ft, uint8_t *ring_nr, + const struct netmap_vp_adapter *); +typedef int (*bdg_config_fn_t)(struct nm_ifreq *); +typedef void (*bdg_dtor_fn_t)(const struct netmap_vp_adapter *); +struct netmap_bdg_ops { + bdg_lookup_fn_t lookup; + bdg_config_fn_t config; + bdg_dtor_fn_t dtor; +}; + +u_int netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, + const struct netmap_vp_adapter *); #define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS @@ -915,11 +1081,11 @@ u_int netmap_bdg_learning(char *, u_int, uint8_t *, #define NM_NAME "vale" /* prefix for bridge port name */ - /* these are redefined in case of no VALE support */ int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); void netmap_init_bridges(void); -int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); +int netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops); +int netmap_bdg_config(struct nmreq *nmr); #else /* !WITH_VALE */ #define netmap_get_bdg_na(_1, _2, _3) 0 @@ -941,6 +1107,12 @@ int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create #define netmap_get_pipe_na(_1, _2, _3) 0 #endif +#ifdef WITH_MONITOR +int netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +#else +#define netmap_get_monitor_na(_1, _2, _3) 0 +#endif + /* Various prototypes */ int netmap_poll(struct cdev *dev, int events, struct thread *td); int netmap_init(void); @@ -952,7 +1124,6 @@ int netmap_dtor_locked(struct netmap_priv_d *priv); int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); /* netmap_adapter creation/destruction */ -#define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie") // #define NM_DEBUG_PUTGET 1 @@ -965,7 +1136,7 @@ void __netmap_adapter_get(struct netmap_adapter *na); #define netmap_adapter_get(na) \ do { \ struct netmap_adapter *__na = na; \ - D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + D("getting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_get(__na); \ } while (0) @@ -974,7 +1145,7 @@ int __netmap_adapter_put(struct netmap_adapter *na); #define netmap_adapter_put(na) \ ({ \ struct netmap_adapter *__na = na; \ - D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + D("putting %p:%s (%d)", __na, (__na)->name, (__na)->na_refcount); \ __netmap_adapter_put(__na); \ }) @@ -990,12 +1161,10 @@ int netmap_adapter_put(struct netmap_adapter *na); /* * module variables */ -extern u_int netmap_buf_size; -#define NETMAP_BUF_SIZE netmap_buf_size // XXX remove +#define NETMAP_BUF_BASE(na) ((na)->na_lut[0].vaddr) +#define NETMAP_BUF_SIZE(na) ((na)->na_lut_objsize) extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; -extern u_int netmap_total_buffers; // global allocator -extern char *netmap_buffer_base; // global allocator extern int netmap_verbose; // XXX debugging enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ @@ -1055,6 +1224,10 @@ extern int netmap_generic_rings; #ifdef __FreeBSD__ +/* Assigns the device IOMMU domain to an allocator. 
+ * Returns -ENOMEM in case the domain is different */ +#define nm_iommu_group_id(dev) (0) + /* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) @@ -1065,26 +1238,77 @@ static void netmap_dmamap_cb(__unused void *arg, * XXX can we do it without a callback ? */ static inline void -netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) +netmap_load_map(struct netmap_adapter *na, + bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) - bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE, + bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } +static inline void +netmap_unload_map(struct netmap_adapter *na, + bus_dma_tag_t tag, bus_dmamap_t map) +{ + if (map) + bus_dmamap_unload(tag, map); +} + /* update the map when a buffer changes. */ static inline void -netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) +netmap_reload_map(struct netmap_adapter *na, + bus_dma_tag_t tag, bus_dmamap_t map, void *buf) { if (map) { bus_dmamap_unload(tag, map); - bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE, + bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE(na), netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } #else /* linux */ +int nm_iommu_group_id(bus_dma_tag_t dev); +extern size_t netmap_mem_get_bufsize(struct netmap_mem_d *); +#include <linux/dma-mapping.h> + +static inline void +netmap_load_map(struct netmap_adapter *na, + bus_dma_tag_t tag, bus_dmamap_t map, void *buf) +{ + if (map) { + *map = dma_map_single(na->pdev, buf, netmap_mem_get_bufsize(na->nm_mem), + DMA_BIDIRECTIONAL); + } +} + +static inline void +netmap_unload_map(struct netmap_adapter *na, + bus_dma_tag_t tag, bus_dmamap_t map) +{ + u_int sz = netmap_mem_get_bufsize(na->nm_mem); + + if (*map) { + dma_unmap_single(na->pdev, *map, sz, + DMA_BIDIRECTIONAL); + } +} + +static inline void +netmap_reload_map(struct netmap_adapter *na, + bus_dma_tag_t tag, bus_dmamap_t map, void *buf) +{ + u_int sz = netmap_mem_get_bufsize(na->nm_mem); + + if (*map) { + dma_unmap_single(na->pdev, *map, sz, + DMA_BIDIRECTIONAL); + } + + *map = dma_map_single(na->pdev, buf, sz, + DMA_BIDIRECTIONAL); +} + /* * XXX How do we redefine these functions: * @@ -1095,8 +1319,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) * unfortunately the direction is not, so we need to change * something to have a cross API */ -#define netmap_load_map(_t, _m, _b) -#define netmap_reload_map(_t, _m, _b) + #if 0 struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l]; /* set time_stamp *before* dma to help avoid a possible race */ @@ -1165,40 +1388,32 @@ struct lut_entry { }; struct netmap_obj_pool; -extern struct lut_entry *netmap_buffer_lut; -#define NMB_VA(i) (netmap_buffer_lut[i].vaddr) -#define NMB_PA(i) (netmap_buffer_lut[i].paddr) /* * NMB returns the virtual address of a buffer (buffer 0 on bad index) * PNMB also fills the physical address */ static inline void * -NMB(struct netmap_slot *slot) +NMB(struct netmap_adapter *na, struct netmap_slot *slot) { + struct lut_entry *lut = na->na_lut; uint32_t i = slot->buf_idx; - return (unlikely(i >= netmap_total_buffers)) ? NMB_VA(0) : NMB_VA(i); + return (unlikely(i >= na->na_lut_objtotal)) ?
+ lut[0].vaddr : lut[i].vaddr; } static inline void * -PNMB(struct netmap_slot *slot, uint64_t *pp) +PNMB(struct netmap_adapter *na, struct netmap_slot *slot, uint64_t *pp) { uint32_t i = slot->buf_idx; - void *ret = (i >= netmap_total_buffers) ? NMB_VA(0) : NMB_VA(i); + struct lut_entry *lut = na->na_lut; + void *ret = (i >= na->na_lut_objtotal) ? lut[0].vaddr : lut[i].vaddr; - *pp = (i >= netmap_total_buffers) ? NMB_PA(0) : NMB_PA(i); + *pp = (i >= na->na_lut_objtotal) ? lut[0].paddr : lut[i].paddr; return ret; } /* Generic version of NMB, which uses device-specific memory. */ -static inline void * -BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot) -{ - struct lut_entry *lut = na->na_lut; - uint32_t i = slot->buf_idx; - return (unlikely(i >= na->na_lut_objtotal)) ? - lut[0].vaddr : lut[i].vaddr; -} @@ -1251,6 +1466,17 @@ struct netmap_priv_d { struct thread *np_td; /* kqueue, just debugging */ }; +#ifdef WITH_MONITOR + +struct netmap_monitor_adapter { + struct netmap_adapter up; + + struct netmap_priv_d priv; + uint32_t flags; +}; + +#endif /* WITH_MONITOR */ + /* * generic netmap emulation for devices that do not have @@ -1265,12 +1491,20 @@ int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); +//#define RATE_GENERIC /* Enables communication statistics for generic. */ +#ifdef RATE_GENERIC +void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi); +#else +#define generic_rate(txp, txs, txi, rxp, rxs, rxi) +#endif + /* * netmap_mitigation API. This is used by the generic adapter * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. 
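A rough sketch of how the generic adapter is expected to drive this API after the per-ring change below; hedged reconstruction from the declarations that follow, with the mit[] array, ring index r, and the surrounding rx-handler context assumed for illustration:

/* at setup time, one mitigation state per rx ring: */
netmap_mitigation_init(&mit[r], r, na);	/* 'r' is the new ring-index argument */

/* in the rx handler, coalesce wakeups while the timer runs: */
if (!netmap_mitigation_active(&mit[r])) {
	na->nm_notify(na, r, NR_RX, 0);		/* wake clients immediately */
	netmap_mitigation_start(&mit[r]);	/* and arm the mitigation timer */
} else {
	mit[r].mit_pending = 1;			/* deliver when the timer fires */
}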
*/ -void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na); +void netmap_mitigation_init(struct nm_generic_mit *mit, int idx, + struct netmap_adapter *na); void netmap_mitigation_start(struct nm_generic_mit *mit); void netmap_mitigation_restart(struct nm_generic_mit *mit); int netmap_mitigation_active(struct nm_generic_mit *mit); @@ -1378,4 +1612,10 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na, struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, u_int *j, u_int lim, u_int *howmany); + +/* persistent virtual port routines */ +int nm_vi_persist(const char *, struct ifnet **); +void nm_vi_detach(struct ifnet *); +void nm_vi_init_index(void); + #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h index a011c4c..455ca8a 100644 --- a/sys/dev/netmap/netmap_mbq.h +++ b/sys/dev/netmap/netmap_mbq.h @@ -74,6 +74,7 @@ mbq_unlock(struct mbq *q) mtx_unlock_spin(&q->lock); } + void mbq_safe_init(struct mbq *q); void mbq_safe_destroy(struct mbq *q); void mbq_safe_enqueue(struct mbq *q, struct mbuf *m); diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index d237794..fa891ec 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -54,6 +54,112 @@ __FBSDID("$FreeBSD$"); #include <dev/netmap/netmap_kern.h> #include "netmap_mem2.h" +#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ + +#define NETMAP_POOL_MAX_NAMSZ 32 + + +enum { + NETMAP_IF_POOL = 0, + NETMAP_RING_POOL, + NETMAP_BUF_POOL, + NETMAP_POOLS_NR +}; + + +struct netmap_obj_params { + u_int size; + u_int num; +}; +struct netmap_obj_pool { + char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */ + + /* ---------------------------------------------------*/ + /* these are only meaningful if the pool is finalized */ + /* (see 'finalized' field in netmap_mem_d) */ + u_int objtotal; /* actual total number of objects. */ + u_int memtotal; /* actual total memory space */ + u_int numclusters; /* actual number of clusters */ + + u_int objfree; /* number of free objects. 
*/ + + struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ + uint32_t *bitmap; /* one bit per buffer, 1 means free */ + uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ + /* ---------------------------------------------------*/ + + /* limits */ + u_int objminsize; /* minimum object size */ + u_int objmaxsize; /* maximum object size */ + u_int nummin; /* minimum number of objects */ + u_int nummax; /* maximum number of objects */ + + /* these are changed only by config */ + u_int _objtotal; /* total number of objects */ + u_int _objsize; /* object size */ + u_int _clustsize; /* cluster size */ + u_int _clustentries; /* objects per cluster */ + u_int _numclusters; /* number of clusters */ + + /* requested values */ + u_int r_objtotal; + u_int r_objsize; +}; + +#ifdef linux +// XXX a mtx would suffice here 20130415 lr +#define NMA_LOCK_T struct semaphore +#else /* !linux */ +#define NMA_LOCK_T struct mtx +#endif /* linux */ + +typedef int (*netmap_mem_config_t)(struct netmap_mem_d*); +typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*); +typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*); + +typedef uint16_t nm_memid_t; + +struct netmap_mem_d { + NMA_LOCK_T nm_mtx; /* protect the allocator */ + u_int nm_totalsize; /* shorthand */ + + u_int flags; +#define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */ + int lasterr; /* last error for curr config */ + int refcount; /* existing priv structures */ + /* the three allocators */ + struct netmap_obj_pool pools[NETMAP_POOLS_NR]; + + netmap_mem_config_t config; + netmap_mem_finalize_t finalize; + netmap_mem_deref_t deref; + + nm_memid_t nm_id; /* allocator identifier */ + int nm_grp; /* iommu group id */ + + /* list of all existing allocators, sorted by nm_id */ + struct netmap_mem_d *prev, *next; +}; + +/* accessor functions */ +struct lut_entry* +netmap_mem_get_lut(struct netmap_mem_d *nmd) +{ + return nmd->pools[NETMAP_BUF_POOL].lut; +} + +u_int +netmap_mem_get_buftotal(struct netmap_mem_d *nmd) +{ + return nmd->pools[NETMAP_BUF_POOL].objtotal; +} + +size_t +netmap_mem_get_bufsize(struct netmap_mem_d *nmd) +{ + return nmd->pools[NETMAP_BUF_POOL]._objsize; +} + #ifdef linux #define NMA_LOCK_INIT(n) sema_init(&(n)->nm_mtx, 1) #define NMA_LOCK_DESTROY(n) @@ -135,6 +241,7 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. */ .deref = netmap_mem_global_deref, .nm_id = 1, + .nm_grp = -1, .prev = &nm_mem, .next = &nm_mem, @@ -143,9 +250,6 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator.
*/ struct netmap_mem_d *netmap_last_mem_d = &nm_mem; -// XXX logically belongs to nm_mem -struct lut_entry *netmap_buffer_lut; /* exported */ - /* blueprint for the private memory allocators */ static int netmap_mem_private_config(struct netmap_mem_d *nmd); static int netmap_mem_private_finalize(struct netmap_mem_d *nmd); @@ -254,6 +358,25 @@ nm_mem_release_id(struct netmap_mem_d *nmd) NMA_UNLOCK(&nm_mem); } +static int +nm_mem_assign_group(struct netmap_mem_d *nmd, struct device *dev) +{ + int err = 0, id; + id = nm_iommu_group_id(dev); + if (netmap_verbose) + D("iommu_group %d", id); + + NMA_LOCK(nmd); + + if (nmd->nm_grp < 0) + nmd->nm_grp = id; + + if (nmd->nm_grp != id) + nmd->lasterr = err = ENOMEM; + + NMA_UNLOCK(nmd); + return err; +} /* * First, find the allocator that contains the requested offset, @@ -274,7 +397,7 @@ netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) if (offset >= p[i].memtotal) continue; // now lookup the cluster's address - pa = p[i].lut[offset / p[i]._objsize].paddr + + pa = vtophys(p[i].lut[offset / p[i]._objsize].vaddr) + offset % p[i]._objsize; NMA_UNLOCK(nmd); return pa; @@ -300,18 +423,22 @@ netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, error = nmd->config(nmd); if (error) goto out; - if (nmd->flags & NETMAP_MEM_FINALIZED) { - *size = nmd->nm_totalsize; - } else { - int i; - *size = 0; - for (i = 0; i < NETMAP_POOLS_NR; i++) { - struct netmap_obj_pool *p = nmd->pools + i; - *size += (p->_numclusters * p->_clustsize); + if (size) { + if (nmd->flags & NETMAP_MEM_FINALIZED) { + *size = nmd->nm_totalsize; + } else { + int i; + *size = 0; + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p = nmd->pools + i; + *size += (p->_numclusters * p->_clustsize); + } } } - *memflags = nmd->flags; - *id = nmd->nm_id; + if (memflags) + *memflags = nmd->flags; + if (id) + *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; @@ -471,12 +598,15 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) vaddr, p->name); } +#define netmap_mem_bufsize(n) \ + ((n)->pools[NETMAP_BUF_POOL]._objsize) + #define netmap_if_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_IF_POOL], len, NULL, NULL) #define netmap_if_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_IF_POOL], (v)) #define netmap_ring_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_RING_POOL], len, NULL, NULL) #define netmap_ring_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_RING_POOL], (v)) #define netmap_buf_malloc(n, _pos, _index) \ - netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) + netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], netmap_mem_bufsize(n), _pos, _index) #if 0 // XXX unused @@ -675,7 +805,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj p->r_objtotal = objtotal; p->r_objsize = objsize; -#define MAX_CLUSTSIZE (1<<17) +#define MAX_CLUSTSIZE (1<<22) // 4 MB #define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ @@ -713,15 +843,14 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj clustentries = i; break; } - if (delta > ( (clustentries*objsize) % PAGE_SIZE) ) - clustentries = i; } - // D("XXX --- ouch, delta %d (bad for buffers)", delta); - /* compute clustsize and round to the next page */ + /* exact solution not found */ + if (clustentries == 0) { + D("unsupported allocation for %d bytes", objsize); + return EINVAL; + } + /* compute clustsize */ clustsize = 
clustentries * objsize; - i = (clustsize & (PAGE_SIZE - 1)); - if (i) - clustsize += PAGE_SIZE - i; if (netmap_verbose) D("objsize %d clustsize %d objects %d", objsize, clustsize, clustentries); @@ -857,6 +986,47 @@ netmap_mem_reset_all(struct netmap_mem_d *nmd) } static int +netmap_mem_unmap(struct netmap_obj_pool *p, struct netmap_adapter *na) +{ + int i, lim = p->_objtotal; + + if (na->pdev == NULL) + return 0; + +#ifdef __FreeBSD__ + (void)i; + (void)lim; + D("unsupported on FreeBSD"); +#else /* linux */ + for (i = 2; i < lim; i++) { + netmap_unload_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr); + } +#endif /* linux */ + + return 0; +} + +static int +netmap_mem_map(struct netmap_obj_pool *p, struct netmap_adapter *na) +{ +#ifdef __FreeBSD__ + D("unsupported on FreeBSD"); +#else /* linux */ + int i, lim = p->_objtotal; + + if (na->pdev == NULL) + return 0; + + for (i = 2; i < lim; i++) { + netmap_load_map(na, (bus_dma_tag_t) na->pdev, &p->lut[i].paddr, + p->lut[i].vaddr); + } +#endif /* linux */ + + return 0; +} + +static int netmap_mem_finalize_all(struct netmap_mem_d *nmd) { int i; @@ -1091,13 +1261,6 @@ netmap_mem_global_finalize(struct netmap_mem_d *nmd) if (netmap_mem_finalize_all(nmd)) goto out; - /* backward compatibility */ - netmap_buf_size = nmd->pools[NETMAP_BUF_POOL]._objsize; - netmap_total_buffers = nmd->pools[NETMAP_BUF_POOL].objtotal; - - netmap_buffer_lut = nmd->pools[NETMAP_BUF_POOL].lut; - netmap_buffer_base = nmd->pools[NETMAP_BUF_POOL].lut[0].vaddr; - nmd->lasterr = 0; out: @@ -1198,7 +1361,7 @@ netmap_mem_rings_create(struct netmap_adapter *na) ring->cur = kring->rcur; ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = - NETMAP_BDG_BUF_SIZE(na->nm_mem); + netmap_mem_bufsize(na->nm_mem); ND("%s h %d c %d t %d", kring->name, ring->head, ring->cur, ring->tail); ND("initializing slots for txring"); @@ -1241,7 +1404,7 @@ netmap_mem_rings_create(struct netmap_adapter *na) ring->cur = kring->rcur; ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = - NETMAP_BDG_BUF_SIZE(na->nm_mem); + netmap_mem_bufsize(na->nm_mem); ND("%s h %d c %d t %d", kring->name, ring->head, ring->cur, ring->tail); ND("initializing slots for rxring %p", ring); @@ -1290,7 +1453,7 @@ netmap_mem_rings_delete(struct netmap_adapter *na) * the interface is in netmap mode. */ struct netmap_if * -netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +netmap_mem_if_new(struct netmap_adapter *na) { struct netmap_if *nifp; ssize_t base; /* handy for relative offsets between rings and nifp */ @@ -1316,7 +1479,7 @@ netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) /* initialize base fields -- override const */ *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; - strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); + strncpy(nifp->ni_name, na->name, (size_t)IFNAMSIZ); /* * fill the slots for the rx and tx rings. 
They contain the offset @@ -1358,6 +1521,8 @@ netmap_mem_global_deref(struct netmap_mem_d *nmd) NMA_LOCK(nmd); nmd->refcount--; + if (!nmd->refcount) + nmd->nm_grp = -1; if (netmap_verbose) D("refcount = %d", nmd->refcount); @@ -1365,13 +1530,25 @@ netmap_mem_global_deref(struct netmap_mem_d *nmd) } int -netmap_mem_finalize(struct netmap_mem_d *nmd) +netmap_mem_finalize(struct netmap_mem_d *nmd, struct netmap_adapter *na) { - return nmd->finalize(nmd); + if (nm_mem_assign_group(nmd, na->pdev) < 0) { + return ENOMEM; + } else { + nmd->finalize(nmd); + } + + if (!nmd->lasterr && na->pdev) + netmap_mem_map(&nmd->pools[NETMAP_BUF_POOL], na); + + return nmd->lasterr; } void -netmap_mem_deref(struct netmap_mem_d *nmd) +netmap_mem_deref(struct netmap_mem_d *nmd, struct netmap_adapter *na) { + NMA_LOCK(nmd); + netmap_mem_unmap(&nmd->pools[NETMAP_BUF_POOL], na); + NMA_UNLOCK(nmd); return nmd->deref(nmd); } diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index e83616a..4c620bd 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -97,70 +97,6 @@ #define _NET_NETMAP_MEM2_H_ -#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ - -#define NETMAP_POOL_MAX_NAMSZ 32 - - -enum { - NETMAP_IF_POOL = 0, - NETMAP_RING_POOL, - NETMAP_BUF_POOL, - NETMAP_POOLS_NR -}; - - -struct netmap_obj_params { - u_int size; - u_int num; -}; -struct netmap_obj_pool { - char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */ - - /* ---------------------------------------------------*/ - /* these are only meaningful if the pool is finalized */ - /* (see 'finalized' field in netmap_mem_d) */ - u_int objtotal; /* actual total number of objects. */ - u_int memtotal; /* actual total memory space */ - u_int numclusters; /* actual number of clusters */ - - u_int objfree; /* number of free objects. */ - - struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ - uint32_t *bitmap; /* one bit per buffer, 1 means free */ - uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ - /* ---------------------------------------------------*/ - - /* limits */ - u_int objminsize; /* minimum object size */ - u_int objmaxsize; /* maximum object size */ - u_int nummin; /* minimum number of objects */ - u_int nummax; /* maximum number of objects */ - - /* these are changed only by config */ - u_int _objtotal; /* total number of objects */ - u_int _objsize; /* object size */ - u_int _clustsize; /* cluster size */ - u_int _clustentries; /* objects per cluster */ - u_int _numclusters; /* number of clusters */ - - /* requested values */ - u_int r_objtotal; - u_int r_objsize; -}; - -#ifdef linux -// XXX a mtx would suffice here 20130415 lr -#define NMA_LOCK_T struct semaphore -#else /* !linux */ -#define NMA_LOCK_T struct mtx -#endif /* linux */ - -typedef int (*netmap_mem_config_t)(struct netmap_mem_d*); -typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*); -typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*); - -typedef uint16_t nm_memid_t; /* We implement two kinds of netmap_mem_d structures: * @@ -178,40 +114,21 @@ typedef uint16_t nm_memid_t; * are no active users. By 'active user' we mean an existing netmap_priv * structure holding a reference to the allocator. 
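The accessor functions added in netmap_mem2.c above (netmap_mem_get_lut() and friends) become the only way for other subsystems to reach the pool parameters once struct netmap_mem_d leaves this header, as the removals below show. A sketch of the intended wiring on the adapter side, assuming an allocator that has already been finalized (this mirrors what the attach path plausibly does, not a verbatim excerpt):

/* populate the adapter-local buffer lookup state from the allocator */
na->na_lut          = netmap_mem_get_lut(na->nm_mem);
na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
na->na_lut_objsize  = netmap_mem_get_bufsize(na->nm_mem);
/* NMB()/PNMB() and NETMAP_BUF_SIZE() now resolve against this adapter */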
*/ -struct netmap_mem_d { - NMA_LOCK_T nm_mtx; /* protect the allocator */ - u_int nm_totalsize; /* shorthand */ - - u_int flags; -#define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */ -#define NETMAP_MEM_PRIVATE 0x2 /* uses private address space */ - int lasterr; /* last error for curr config */ - int refcount; /* existing priv structures */ - /* the three allocators */ - struct netmap_obj_pool pools[NETMAP_POOLS_NR]; - - netmap_mem_config_t config; - netmap_mem_finalize_t finalize; - netmap_mem_deref_t deref; - - nm_memid_t nm_id; /* allocator identifier */ - - /* list of all existing allocators, sorted by nm_id */ - struct netmap_mem_d *prev, *next; -}; extern struct netmap_mem_d nm_mem; +struct lut_entry* netmap_mem_get_lut(struct netmap_mem_d *); +u_int netmap_mem_get_buftotal(struct netmap_mem_d *); +size_t netmap_mem_get_bufsize(struct netmap_mem_d *); vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); -int netmap_mem_finalize(struct netmap_mem_d *); +int netmap_mem_finalize(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem_init(void); void netmap_mem_fini(void); -struct netmap_if * - netmap_mem_if_new(const char *, struct netmap_adapter *); +struct netmap_if * netmap_mem_if_new(struct netmap_adapter *); void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); -void netmap_mem_deref(struct netmap_mem_d *); +void netmap_mem_deref(struct netmap_mem_d *, struct netmap_adapter *); int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); struct netmap_mem_d* netmap_mem_private_new(const char *name, @@ -219,7 +136,8 @@ struct netmap_mem_d* netmap_mem_private_new(const char *name, int* error); void netmap_mem_private_delete(struct netmap_mem_d *); -#define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) +#define NETMAP_MEM_PRIVATE 0x2 /* allocator uses private address space */ +#define NETMAP_MEM_IO 0x4 /* the underlying memory is mmapped I/O */ uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); diff --git a/sys/dev/netmap/netmap_monitor.c b/sys/dev/netmap/netmap_monitor.c new file mode 100644 index 0000000..485c370 --- /dev/null +++ b/sys/dev/netmap/netmap_monitor.c @@ -0,0 +1,498 @@ +/* + * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * + * Monitors + * + * netmap monitors can be used to do zero-copy monitoring of network traffic + * on another adapter, when the latter adapter is working in netmap mode. + * + * Monitors offer to userspace the same interface as any other netmap port, + * with as many pairs of netmap rings as the monitored adapter. + * However, only the rx rings are actually used. Each monitor rx ring receives + * the traffic transiting on both the tx and rx corresponding rings in the + * monitored adapter. During registration, the user can choose if she wants + * to intercept tx only, rx only, or both tx and rx traffic. + * + * The monitor only sees the frames after they have been consumed in the + * monitored adapter: + * + * - For tx traffic, this is after the slots containing the frames have been + * marked as free. Note that this may happen a considerable delay after + * frame transmission, since freeing of slots is often done lazily. + * + * - For rx traffic, this is after the consumer on the monitored adapter + * has released them. In most cases, the consumer is a userspace + * application which may have modified the frame contents. + * + * If the monitor is not able to cope with the stream of frames, excess traffic + * will be dropped. + * + * Each ring can be monitored by at most one monitor. This may change in the + * future, if we implement monitor chaining. + * + */ + + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/socket.h> /* sockaddrs */ +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/refcount.h> + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#ifdef WITH_MONITOR + +#define NM_MONITOR_MAXSLOTS 4096
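Before the implementation, a note on the userspace view: a monitor is requested by registering the monitored port's own name with the NR_MONITOR_* bits set in nr_flags, which netmap_get_monitor_na() below checks for. A hedged sketch (error handling omitted; "em1" and fd are placeholders):

struct nmreq req;

bzero(&req, sizeof(req));
req.nr_version = NETMAP_API;
strncpy(req.nr_name, "em1", sizeof(req.nr_name));	/* port to observe */
req.nr_flags = NR_REG_ALL_NIC | NR_MONITOR_TX | NR_MONITOR_RX;
ioctl(fd, NIOCREGIF, &req);	/* fd: an open /dev/netmap descriptor */
/* the monitor's rx rings can now be mmap()ed and polled as usual */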
+/* Monitors work by replacing the nm_sync callbacks in the monitored rings. + * The actions to be performed are the same on both tx and rx rings, so we + * have collected them here + */ +static int +netmap_monitor_parent_sync(struct netmap_kring *kring, int flags, u_int* ringptr) +{ + struct netmap_monitor_adapter *mna = kring->monitor; + struct netmap_kring *mkring = &mna->up.rx_rings[kring->ring_id]; + struct netmap_ring *ring = kring->ring, *mring = mkring->ring; + int error; + int rel_slots, free_slots, busy; + u_int beg, end, i; + u_int lim = kring->nkr_num_slots - 1, + mlim = mkring->nkr_num_slots - 1; + + /* get the released slots (rel_slots) */ + beg = *ringptr; + error = kring->save_sync(kring, flags); + if (error) + return error; + end = *ringptr; + rel_slots = end - beg; + if (rel_slots < 0) + rel_slots += kring->nkr_num_slots; + + if (!rel_slots) { + return 0; + } + + /* we need to lock the monitor receive ring, since it + * is the target of both tx and rx traffic from the monitored + * adapter + */ + mtx_lock(&mkring->q_lock); + /* get the free slots available on the monitor ring */ + i = mkring->nr_hwtail; + busy = i - mkring->nr_hwcur; + if (busy < 0) + busy += mkring->nkr_num_slots; + free_slots = mlim - busy; + + if (!free_slots) { + mtx_unlock(&mkring->q_lock); + return 0; + } + + /* swap min(free_slots, rel_slots) slots */ + if (free_slots < rel_slots) { + beg += (rel_slots - free_slots); + if (beg > lim) + beg = 0; + rel_slots = free_slots; + } + + for ( ; rel_slots; rel_slots--) { + struct netmap_slot *s = &ring->slot[beg]; + struct netmap_slot *ms = &mring->slot[i]; + uint32_t tmp; + + tmp = ms->buf_idx; + ms->buf_idx = s->buf_idx; + s->buf_idx = tmp; + + tmp = ms->len; + ms->len = s->len; + s->len = tmp; + + s->flags |= NS_BUF_CHANGED; + + beg = nm_next(beg, lim); + i = nm_next(i, mlim); + + } + wmb(); + mkring->nr_hwtail = i; + + mtx_unlock(&mkring->q_lock); + /* notify the new frames to the monitor */ + mna->up.nm_notify(&mna->up, mkring->ring_id, NR_RX, 0); + return 0; +} + +/* callback used to replace the nm_sync callback in the monitored tx rings */ +static int +netmap_monitor_parent_txsync(struct netmap_kring *kring, int flags) +{ + ND("%s %x", kring->name, flags); + return netmap_monitor_parent_sync(kring, flags, &kring->nr_hwtail); +} + +/* callback used to replace the nm_sync callback in the monitored rx rings */ +static int +netmap_monitor_parent_rxsync(struct netmap_kring *kring, int flags) +{ + ND("%s %x", kring->name, flags); + return netmap_monitor_parent_sync(kring, flags, &kring->rcur); +} + +/* nm_sync callback for the monitor's own tx rings. + * This makes no sense and always returns an error + */ +static int +netmap_monitor_txsync(struct netmap_kring *kring, int flags) +{ + D("%s %x", kring->name, flags); + return EIO; +} + +/* nm_sync callback for the monitor's own rx rings. + * Note that the lock in netmap_monitor_parent_sync only protects + * writers among themselves. Synchronization between writers + * (i.e., netmap_monitor_parent_txsync and netmap_monitor_parent_rxsync) + * and readers (i.e., netmap_monitor_rxsync) relies on memory barriers. + */ +static int +netmap_monitor_rxsync(struct netmap_kring *kring, int flags) +{ + ND("%s %x", kring->name, flags); + kring->nr_hwcur = kring->rcur; + rmb(); + nm_rxsync_finalize(kring); + return 0; +} + +/* nm_krings_create callbacks for monitors. + * We could use the default netmap_hw_krings_create, but + * we don't need the mbq.
+ */ +static int +netmap_monitor_krings_create(struct netmap_adapter *na) +{ + return netmap_krings_create(na, 0); +} + + +/* nm_register callback for monitors. + * + * On registration, replace the nm_sync callbacks in the monitored + * rings with our own, saving the previous ones in the monitored + * rings themselves, where they are used by netmap_monitor_parent_sync. + * + * On de-registration, restore the original callbacks. We need to + * stop traffic while we are doing this, since the monitored adapter may + * have already started executing a netmap_monitor_parent_sync + * and may not like the kring->save_sync pointer to become NULL. + */ +static int +netmap_monitor_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_monitor_adapter *mna = + (struct netmap_monitor_adapter *)na; + struct netmap_priv_d *priv = &mna->priv; + struct netmap_adapter *pna = priv->np_na; + struct netmap_kring *kring; + int i; + + ND("%p: onoff %d", na, onoff); + if (onoff) { + if (!nm_netmap_on(pna)) { + /* parent left netmap mode, fatal */ + return ENXIO; + } + if (mna->flags & NR_MONITOR_TX) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { + kring = &pna->tx_rings[i]; + kring->save_sync = kring->nm_sync; + kring->nm_sync = netmap_monitor_parent_txsync; + } + } + if (mna->flags & NR_MONITOR_RX) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { + kring = &pna->rx_rings[i]; + kring->save_sync = kring->nm_sync; + kring->nm_sync = netmap_monitor_parent_rxsync; + } + } + na->na_flags |= NAF_NETMAP_ON; + } else { + if (!nm_netmap_on(pna)) { + /* parent left netmap mode, nothing to restore */ + return 0; + } + na->na_flags &= ~NAF_NETMAP_ON; + if (mna->flags & NR_MONITOR_TX) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { + netmap_set_txring(pna, i, 1 /* stopped */); + kring = &pna->tx_rings[i]; + kring->nm_sync = kring->save_sync; + kring->save_sync = NULL; + netmap_set_txring(pna, i, 0 /* enabled */); + } + } + if (mna->flags & NR_MONITOR_RX) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { + netmap_set_rxring(pna, i, 1 /* stopped */); + kring = &pna->rx_rings[i]; + kring->nm_sync = kring->save_sync; + kring->save_sync = NULL; + netmap_set_rxring(pna, i, 0 /* enabled */); + } + } + } + return 0; +} +/* nm_krings_delete callback for monitors */ +static void +netmap_monitor_krings_delete(struct netmap_adapter *na) +{ + netmap_krings_delete(na); +} + + +/* nm_dtor callback for monitors */ +static void +netmap_monitor_dtor(struct netmap_adapter *na) +{ + struct netmap_monitor_adapter *mna = + (struct netmap_monitor_adapter *)na; + struct netmap_priv_d *priv = &mna->priv; + struct netmap_adapter *pna = priv->np_na; + int i; + + ND("%p", na); + if (nm_netmap_on(pna)) { + /* parent still in netmap mode, mark its krings as free */ + if (mna->flags & NR_MONITOR_TX) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { + pna->tx_rings[i].monitor = NULL; + } + } + if (mna->flags & NR_MONITOR_RX) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { + pna->rx_rings[i].monitor = NULL; + } + } + } + netmap_adapter_put(pna); +} + + +/* check if nmr is a request for a monitor adapter that we can satisfy */ +int +netmap_get_monitor_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + struct nmreq pnmr; + struct netmap_adapter *pna; /* parent adapter */ + struct netmap_monitor_adapter *mna; + int i, error; + + if ((nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)) == 0) { + ND("not a monitor"); + return 0; + } + /* this is a request for a monitor 
adapter */ + + D("flags %x", nmr->nr_flags); + + mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (mna == NULL) { + D("memory error"); + return ENOMEM; + } + + /* first, try to find the adapter that we want to monitor + * We use the same nmr, after we have turned off the monitor flags. + * In this way we can potentially monitor everything netmap understands, + * except other monitors. + */ + memcpy(&pnmr, nmr, sizeof(pnmr)); + pnmr.nr_flags &= ~(NR_MONITOR_TX | NR_MONITOR_RX); + error = netmap_get_na(&pnmr, &pna, create); + if (error) { + D("parent lookup failed: %d", error); + return error; + } + D("found parent: %s", pna->name); + + if (!nm_netmap_on(pna)) { + /* parent not in netmap mode */ + /* XXX we can wait for the parent to enter netmap mode, + * by intercepting its nm_register callback (2014-03-16) + */ + D("%s not in netmap mode", pna->name); + error = EINVAL; + goto put_out; + } + + /* grab all the rings we need in the parent */ + mna->priv.np_na = pna; + error = netmap_interp_ringid(&mna->priv, nmr->nr_ringid, nmr->nr_flags); + if (error) { + D("ringid error"); + goto put_out; + } + if (nmr->nr_flags & NR_MONITOR_TX) { + for (i = mna->priv.np_txqfirst; i < mna->priv.np_txqlast; i++) { + struct netmap_kring *kring = &pna->tx_rings[i]; + if (kring->monitor) { + error = EBUSY; + D("ring busy"); + goto release_out; + } + kring->monitor = mna; + } + } + if (nmr->nr_flags & NR_MONITOR_RX) { + for (i = mna->priv.np_rxqfirst; i < mna->priv.np_rxqlast; i++) { + struct netmap_kring *kring = &pna->rx_rings[i]; + if (kring->monitor) { + error = EBUSY; + D("ring busy"); + goto release_out; + } + kring->monitor = mna; + } + } + + snprintf(mna->up.name, sizeof(mna->up.name), "mon:%s", pna->name); + + /* the monitor supports the host rings iff the parent does */ + mna->up.na_flags = (pna->na_flags & NAF_HOST_RINGS); + mna->up.nm_txsync = netmap_monitor_txsync; + mna->up.nm_rxsync = netmap_monitor_rxsync; + mna->up.nm_register = netmap_monitor_reg; + mna->up.nm_dtor = netmap_monitor_dtor; + mna->up.nm_krings_create = netmap_monitor_krings_create; + mna->up.nm_krings_delete = netmap_monitor_krings_delete; + mna->up.nm_mem = pna->nm_mem; + mna->up.na_lut = pna->na_lut; + mna->up.na_lut_objtotal = pna->na_lut_objtotal; + mna->up.na_lut_objsize = pna->na_lut_objsize; + + mna->up.num_tx_rings = 1; // XXX we don't need it, but field can't be zero + /* we set the number of our rx_rings to be max(num_tx_rings, num_rx_rings) + * in the parent + */ + mna->up.num_rx_rings = pna->num_rx_rings; + if (pna->num_tx_rings > pna->num_rx_rings) + mna->up.num_rx_rings = pna->num_tx_rings; + /* by default, the number of slots is the same as in + * the parent rings, but the user may ask for a different + * number + */ + mna->up.num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc, + 1, NM_MONITOR_MAXSLOTS, NULL); + mna->up.num_rx_desc = nmr->nr_rx_slots; + nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc, + 1, NM_MONITOR_MAXSLOTS, NULL); + error = netmap_attach_common(&mna->up); + if (error) { + D("attach_common error"); + goto release_out; + } + + /* remember the traffic directions we have to monitor */ + mna->flags = (nmr->nr_flags & (NR_MONITOR_TX | NR_MONITOR_RX)); + + *na = &mna->up; + netmap_adapter_get(*na); + + /* write the configuration back */ + nmr->nr_tx_rings = mna->up.num_tx_rings; + nmr->nr_rx_rings = mna->up.num_rx_rings; + nmr->nr_tx_slots = mna->up.num_tx_desc; + nmr->nr_rx_slots = mna->up.num_rx_desc; + + /* keep the reference to the parent */ +
D("monitor ok"); + + return 0; + +release_out: + D("monitor error"); + for (i = mna->priv.np_txqfirst; i < mna->priv.np_txqlast; i++) { + if (pna->tx_rings[i].monitor == mna) + pna->tx_rings[i].monitor = NULL; + } + for (i = mna->priv.np_rxqfirst; i < mna->priv.np_rxqlast; i++) { + if (pna->rx_rings[i].monitor == mna) + pna->rx_rings[i].monitor = NULL; + } +put_out: + netmap_adapter_put(pna); + free(mna, M_DEVBUF); + return error; +} + + +#endif /* WITH_MONITOR */ diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c index a776a24..34eafab 100644 --- a/sys/dev/netmap/netmap_offloadings.c +++ b/sys/dev/netmap/netmap_offloadings.c @@ -159,7 +159,7 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, src = ft_p->ft_buf; src_len = ft_p->ft_len; slot = &ring->slot[*j]; - dst = BDG_NMB(&dst_na->up, slot); + dst = NMB(&dst_na->up, slot); dst_len = src_len; /* We are processing the first input slot and there is a mismatch @@ -303,7 +303,7 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* Next destination slot. */ *j = nm_next(*j, lim); slot = &ring->slot[*j]; - dst = BDG_NMB(&dst_na->up, slot); + dst = NMB(&dst_na->up, slot); gso_bytes = 0; gso_idx++; @@ -365,7 +365,7 @@ void bdg_mismatch_datapath(struct netmap_vp_adapter *na, /* Next destination slot. */ *j = nm_next(*j, lim); slot = &ring->slot[*j]; - dst = BDG_NMB(&dst_na->up, slot); + dst = NMB(&dst_na->up, slot); /* Next source slot. */ ft_p++; diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c index 9fcc4d2..bc998c0 100644 --- a/sys/dev/netmap/netmap_pipe.c +++ b/sys/dev/netmap/netmap_pipe.c @@ -126,7 +126,7 @@ void netmap_pipe_dealloc(struct netmap_adapter *na) { if (na->na_pipes) { - ND("freeing pipes for %s", NM_IFPNAME(na->ifp)); + ND("freeing pipes for %s", na->name); free(na->na_pipes, M_DEVBUF); na->na_pipes = NULL; na->na_max_pipes = 0; @@ -155,7 +155,7 @@ static int netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) { if (parent->na_next_pipe >= parent->na_max_pipes) { - D("%s: no space left for pipes", NM_IFPNAME(parent->ifp)); + D("%s: no space left for pipes", parent->name); return ENOMEM; } @@ -179,10 +179,9 @@ netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na } static int -netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +netmap_pipe_txsync(struct netmap_kring *txkring, int flags) { - struct netmap_kring *txkring = na->tx_rings + ring_nr, - *rxkring = txkring->pipe; + struct netmap_kring *rxkring = txkring->pipe; u_int limit; /* slots to transfer */ u_int j, k, lim_tx = txkring->nkr_num_slots - 1, lim_rx = rxkring->nkr_num_slots - 1; @@ -245,10 +244,9 @@ netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) } static int -netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +netmap_pipe_rxsync(struct netmap_kring *rxkring, int flags) { - struct netmap_kring *rxkring = na->rx_rings + ring_nr, - *txkring = rxkring->pipe; + struct netmap_kring *txkring = rxkring->pipe; uint32_t oldhwcur = rxkring->nr_hwcur; ND("%s %x <- %s", rxkring->name, flags, txkring->name); @@ -425,12 +423,11 @@ netmap_pipe_reg(struct netmap_adapter *na, int onoff) { struct netmap_pipe_adapter *pna = (struct netmap_pipe_adapter *)na; - struct ifnet *ifp = na->ifp; ND("%p: onoff %d", na, onoff); if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; + na->na_flags |= NAF_NETMAP_ON; } else { - ifp->if_capenable &= ~IFCAP_NETMAP; + na->na_flags &= ~NAF_NETMAP_ON; } if 
(pna->peer_ref) { ND("%p: case 1.a or 2.a, nothing to do", na); @@ -522,8 +519,6 @@ netmap_pipe_dtor(struct netmap_adapter *na) if (pna->role == NR_REG_PIPE_MASTER) netmap_pipe_remove(pna->parent, pna); netmap_adapter_put(pna->parent); - free(na->ifp, M_DEVBUF); - na->ifp = NULL; pna->parent = NULL; } @@ -533,7 +528,6 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) struct nmreq pnmr; struct netmap_adapter *pna; /* parent adapter */ struct netmap_pipe_adapter *mna, *sna, *req; - struct ifnet *ifp, *ifp2; u_int pipe_id; int role = nmr->nr_flags & NR_REG_MASK; int error; @@ -556,7 +550,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) ND("parent lookup failed: %d", error); return error; } - ND("found parent: %s", NM_IFPNAME(pna->ifp)); + ND("found parent: %s", na->name); if (NETMAP_OWNED_BY_KERN(pna)) { ND("parent busy"); @@ -591,19 +585,12 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) * The endpoint we were asked for holds a reference to * the other one. */ - ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ifp) { - error = ENOMEM; - goto put_out; - } - strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp)); - mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); if (mna == NULL) { error = ENOMEM; - goto free_ifp; + goto put_out; } - mna->up.ifp = ifp; + snprintf(mna->up.name, sizeof(mna->up.name), "%s{%d", pna->name, pipe_id); mna->id = pipe_id; mna->role = NR_REG_PIPE_MASTER; @@ -618,6 +605,7 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) mna->up.nm_mem = pna->nm_mem; mna->up.na_lut = pna->na_lut; mna->up.na_lut_objtotal = pna->na_lut_objtotal; + mna->up.na_lut_objsize = pna->na_lut_objsize; mna->up.num_tx_rings = 1; mna->up.num_rx_rings = 1; @@ -629,28 +617,21 @@ netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1, NM_PIPE_MAXSLOTS, NULL); error = netmap_attach_common(&mna->up); if (error) - goto free_ifp; + goto free_mna; /* register the master with the parent */ error = netmap_pipe_add(pna, mna); if (error) goto free_mna; /* create the slave */ - ifp2 = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ifp) { - error = ENOMEM; - goto free_mna; - } - strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp)); - sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); if (sna == NULL) { error = ENOMEM; - goto free_ifp2; + goto free_mna; } /* most fields are the same, copy from master and then fix */ *sna = *mna; - sna->up.ifp = ifp2; + snprintf(sna->up.name, sizeof(sna->up.name), "%s}%d", pna->name, pipe_id); sna->role = NR_REG_PIPE_SLAVE; error = netmap_attach_common(&sna->up); if (error) @@ -696,12 +677,8 @@ found: free_sna: free(sna, M_DEVBUF); -free_ifp2: - free(ifp2, M_DEVBUF); free_mna: free(mna, M_DEVBUF); -free_ifp: - free(ifp, M_DEVBUF); put_out: netmap_adapter_put(pna); return error; diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index 8e309e9..6b1fe1f 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -157,11 +157,9 @@ SYSCTL_DECL(_dev_netmap); SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); -static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp); -static int bdg_netmap_reg(struct netmap_adapter *na, int onoff); -static int netmap_bwrap_attach(struct ifnet *, struct ifnet *); +static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **); +static int netmap_vp_reg(struct netmap_adapter 
*na, int onoff); static int netmap_bwrap_register(struct netmap_adapter *, int onoff); -int kern_netmap_regif(struct nmreq *nmr); /* * For each output interface, nm_bdg_q is used to construct a list. @@ -217,7 +215,7 @@ struct nm_bridge { * different ring index. * This function must be set by netmap_bdgctl(). */ - bdg_lookup_fn_t nm_bdg_lookup; + struct netmap_bdg_ops bdg_ops; /* the forwarding table, MAC+ports. * XXX should be changed to an argument to be passed to @@ -226,6 +224,15 @@ struct nm_bridge { struct nm_hash_ent ht[NM_BDG_HASH]; }; +const char* +netmap_bdg_name(struct netmap_vp_adapter *vp) +{ + struct nm_bridge *b = vp->na_bdg; + if (b == NULL) + return NULL; + return b->bdg_basename; +} + /* * XXX in principle nm_bridges could be created dynamically @@ -321,7 +328,7 @@ nm_find_bridge(const char *name, int create) for (i = 0; i < NM_BDG_MAXPORTS; i++) b->bdg_port_index[i] = i; /* set the default function */ - b->nm_bdg_lookup = netmap_bdg_learning; + b->bdg_ops.lookup = netmap_bdg_learning; /* reset the MAC address table */ bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); } @@ -389,6 +396,9 @@ nm_alloc_bdgfwd(struct netmap_adapter *na) } +/* remove from bridge b the ports in slots hw and sw + * (sw can be -1 if not needed) + */ static void netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) { @@ -434,6 +444,8 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) } BDG_WLOCK(b); + if (b->bdg_ops.dtor) + b->bdg_ops.dtor(b->bdg_ports[s_hw]); b->bdg_ports[s_hw] = NULL; if (s_sw >= 0) { b->bdg_ports[s_sw] = NULL; @@ -445,29 +457,131 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) ND("now %d active ports", lim); if (lim == 0) { ND("marking bridge %s as free", b->bdg_basename); - b->nm_bdg_lookup = NULL; + bzero(&b->bdg_ops, sizeof(b->bdg_ops)); } } +/* nm_bdg_ctl callback for VALE ports */ +static int +netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; + struct nm_bridge *b = vpna->na_bdg; + + if (attach) + return 0; /* nothing to do */ + if (b) { + netmap_set_all_rings(na, 0 /* disable */); + netmap_bdg_detach_common(b, vpna->bdg_port, -1); + vpna->na_bdg = NULL; + netmap_set_all_rings(na, 1 /* enable */); + } + /* I have took reference just for attach */ + netmap_adapter_put(na); + return 0; +} +/* nm_dtor callback for ephemeral VALE ports */ static void -netmap_adapter_vp_dtor(struct netmap_adapter *na) +netmap_vp_dtor(struct netmap_adapter *na) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; struct nm_bridge *b = vpna->na_bdg; - struct ifnet *ifp = na->ifp; - ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount); + ND("%s has %d references", na->name, na->na_refcount); if (b) { netmap_bdg_detach_common(b, vpna->bdg_port, -1); } +} - bzero(ifp, sizeof(*ifp)); - free(ifp, M_DEVBUF); +/* nm_dtor callback for persistent VALE ports */ +static void +netmap_persist_vp_dtor(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + netmap_vp_dtor(na); na->ifp = NULL; + nm_vi_detach(ifp); +} + +/* remove a persistent VALE port from the system */ +static int +nm_vi_destroy(const char *name) +{ + struct ifnet *ifp; + int error; + + ifp = ifunit_ref(name); + if (!ifp) + return ENXIO; + NMG_LOCK(); + /* make sure this is actually a VALE port */ + if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { + error = EINVAL; + goto err; + } + + if (NA(ifp)->na_refcount > 1) { + error = EBUSY; + goto err; + } + 
NMG_UNLOCK(); + + D("destroying a persistent vale interface %s", ifp->if_xname); + /* Linux requires that all references be released + * before unregistering + */ + if_rele(ifp); + netmap_detach(ifp); + return 0; + +err: + NMG_UNLOCK(); + if_rele(ifp); + return error; } +/* + * Create a virtual interface registered to the system. + * The interface will be attached to a bridge later. + */ +static int +nm_vi_create(struct nmreq *nmr) +{ + struct ifnet *ifp; + struct netmap_vp_adapter *vpna; + int error; + + /* don't include VALE prefix */ + if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME))) + return EINVAL; + ifp = ifunit_ref(nmr->nr_name); + if (ifp) { /* already exists, cannot create a new one */ + if_rele(ifp); + return EEXIST; + } + error = nm_vi_persist(nmr->nr_name, &ifp); + if (error) + return error; + + NMG_LOCK(); + /* netmap_vp_create creates a struct netmap_vp_adapter */ + error = netmap_vp_create(nmr, ifp, &vpna); + if (error) { + D("error %d", error); + nm_vi_detach(ifp); + return error; + } + /* persist-specific routines */ + vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; + vpna->up.nm_dtor = netmap_persist_vp_dtor; + netmap_adapter_get(&vpna->up); + NMG_UNLOCK(); + D("created %s", ifp->if_xname); + return 0; +}
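Both entry points above are reached from user space through NIOCREGIF with nr_cmd set to NETMAP_BDG_NEWIF or NETMAP_BDG_DELIF; the vale-ctl changes at the bottom of this patch use exactly this path. A minimal sketch of the request, assuming only the definitions in net/netmap.h, with error handling trimmed (recall that nm_vi_create() rejects names carrying the VALE prefix):

    /* sketch: create (or destroy) a persistent VALE port from user space */
    #include <fcntl.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <net/if.h>
    #include <net/netmap.h>

    static int
    vale_persist_ctl(const char *name, int create)
    {
        struct nmreq nmr;
        int fd, ret;

        fd = open("/dev/netmap", O_RDWR);
        if (fd < 0)
            return -1;
        memset(&nmr, 0, sizeof(nmr));
        nmr.nr_version = NETMAP_API;
        strncpy(nmr.nr_name, name, sizeof(nmr.nr_name));
        /* NEWIF ends up in nm_vi_create(), DELIF in nm_vi_destroy() */
        nmr.nr_cmd = create ? NETMAP_BDG_NEWIF : NETMAP_BDG_DELIF;
        ret = ioctl(fd, NIOCREGIF, &nmr);
        close(fd);
        return ret;
    }

vale-ctl below exposes these two commands as the new -n and -r options.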
/* Try to get a reference to a netmap adapter attached to a VALE switch. * If the adapter is found (or is created), this function returns 0, a @@ -481,11 +595,11 @@ netmap_adapter_vp_dtor(struct netmap_adapter *na) int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { - const char *name = nmr->nr_name; + char *nr_name = nmr->nr_name; + const char *ifname; struct ifnet *ifp; int error = 0; - struct netmap_adapter *ret; - struct netmap_vp_adapter *vpna; + struct netmap_vp_adapter *vpna, *hostna = NULL; struct nm_bridge *b; int i, j, cand = -1, cand2 = -1; int needed; @@ -494,15 +608,17 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) /* first try to see if this is a bridge port. */ NMG_LOCK_ASSERT(); - if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { + if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) { return 0; /* no error, but no VALE prefix */ } - b = nm_find_bridge(name, create); + b = nm_find_bridge(nr_name, create); if (b == NULL) { - D("no bridges available for '%s'", name); + D("no bridges available for '%s'", nr_name); return (create ? ENOMEM : ENXIO); } + if (strlen(nr_name) < b->bdg_namelen) /* impossible */ + panic("x"); /* Now we are sure that name starts with the bridge's name, * lookup the port in the bridge. We need to scan the entire @@ -516,13 +632,11 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) i = b->bdg_port_index[j]; vpna = b->bdg_ports[i]; // KASSERT(na != NULL); - ifp = vpna->up.ifp; - /* XXX make sure the name only contains one : */ - if (!strcmp(NM_IFPNAME(ifp), name)) { + D("checking %s", vpna->up.name); + if (!strcmp(vpna->up.name, nr_name)) { netmap_adapter_get(&vpna->up); - ND("found existing if %s refs %d", name, - vpna->na_bdg_refcount); - *na = (struct netmap_adapter *)vpna; + ND("found existing if %s refs %d", nr_name); + *na = &vpna->up; return 0; } } @@ -539,68 +653,50 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) cand = b->bdg_port_index[b->bdg_active_ports]; cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; ND("+++ bridge %s port %s used %d avail %d %d", - b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2); /* * try see if there is a matching NIC with this name * (after the bridge's name) */ - ifp = ifunit_ref(name + b->bdg_namelen + 1); - if (!ifp) { /* this is a virtual port */ + ifname = nr_name + b->bdg_namelen + 1; + ifp = ifunit_ref(ifname); + if (!ifp) { + /* Create an ephemeral virtual port + * This block contains all the ephemeral-specific logic + */ if (nmr->nr_cmd) { /* nr_cmd must be 0 for a virtual port */ return EINVAL; } - /* create a struct ifnet for the new port. - * need M_NOWAIT as we are under nma_lock - */ - ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ifp) - return ENOMEM; - - strcpy(ifp->if_xname, name); /* bdg_netmap_attach creates a struct netmap_adapter */ - error = bdg_netmap_attach(nmr, ifp); + error = netmap_vp_create(nmr, NULL, &vpna); if (error) { D("error %d", error); free(ifp, M_DEVBUF); return error; } - ret = NA(ifp); - cand2 = -1; /* only need one port */ - } else { /* this is a NIC */ - struct ifnet *fake_ifp; + /* shortcut - we can skip get_hw_na(), + * ownership check and nm_bdg_attach() + */ + } else { + struct netmap_adapter *hw; - error = netmap_get_hw_na(ifp, &ret); - if (error || ret == NULL) + error = netmap_get_hw_na(ifp, &hw); + if (error || hw == NULL) goto out; - /* make sure the NIC is not already in use */ - if (NETMAP_OWNED_BY_ANY(ret)) { - D("NIC %s busy, cannot attach to bridge", - NM_IFPNAME(ifp)); - error = EBUSY; - goto out; - } - /* create a fake interface */ - fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!fake_ifp) { - error = ENOMEM; - goto out; - } - strcpy(fake_ifp->if_xname, name); - error = netmap_bwrap_attach(fake_ifp, ifp); - if (error) { - free(fake_ifp, M_DEVBUF); + /* host adapter might not be created */ + error = hw->nm_bdg_attach(nr_name, hw); + if (error) goto out; - } - ret = NA(fake_ifp); - if (nmr->nr_arg1 != NETMAP_BDG_HOST) - cand2 = -1; /* only need one port */ + vpna = hw->na_vp; + hostna = hw->na_hostvp; if_rele(ifp); + if (nmr->nr_arg1 != NETMAP_BDG_HOST) + hostna = NULL; } - vpna = (struct netmap_vp_adapter *)ret; BDG_WLOCK(b); vpna->bdg_port = cand; @@ -609,8 +705,7 @@ netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) b->bdg_ports[cand] = vpna; vpna->na_bdg = b; b->bdg_active_ports++; - if (cand2 >= 0) { - struct netmap_vp_adapter *hostna = vpna + 1; + if (hostna != NULL) { /* also bind the host stack to the bridge */ b->bdg_ports[cand2] = hostna; hostna->bdg_port = cand2; @@ -618,10 +713,10 @@ 
b->bdg_active_ports++; ND("host %p to bridge port %d", hostna, cand2); } - ND("if %s refs %d", name, vpna->up.na_refcount); + ND("if %s refs %d", ifname, vpna->up.na_refcount); BDG_WUNLOCK(b); - *na = ret; - netmap_adapter_get(ret); + *na = &vpna->up; + netmap_adapter_get(*na); return 0; out: @@ -631,24 +726,17 @@ out: } -/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ +/* Process NETMAP_BDG_ATTACH */ static int -nm_bdg_attach(struct nmreq *nmr) +nm_bdg_ctl_attach(struct nmreq *nmr) { struct netmap_adapter *na; - struct netmap_if *nifp; - struct netmap_priv_d *npriv; - struct netmap_bwrap_adapter *bna; int error; - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); - if (npriv == NULL) - return ENOMEM; - NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */); - if (error) /* no device, or another bridge or user owns the device */ + if (error) /* no device */ goto unlock_exit; if (na == NULL) { /* VALE prefix missing */ @@ -656,39 +744,37 @@ nm_bdg_attach(struct nmreq *nmr) goto unlock_exit; } - if (na->active_fds > 0) { /* already registered */ + if (NETMAP_OWNED_BY_ANY(na)) { error = EBUSY; goto unref_exit; } - nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error); - if (!nifp) { - goto unref_exit; + if (na->nm_bdg_ctl) { + /* nop for VALE ports. The bwrap needs to put the hwna + * in netmap mode (see netmap_bwrap_bdg_ctl) + */ + error = na->nm_bdg_ctl(na, nmr, 1); + if (error) + goto unref_exit; + ND("registered %s to netmap-mode", na->name); } - - bna = (struct netmap_bwrap_adapter*)na; - bna->na_kpriv = npriv; NMG_UNLOCK(); - ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp)); return 0; unref_exit: netmap_adapter_put(na); unlock_exit: NMG_UNLOCK(); - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); return error; } +/* process NETMAP_BDG_DETACH */ static int -nm_bdg_detach(struct nmreq *nmr) +nm_bdg_ctl_detach(struct nmreq *nmr) { struct netmap_adapter *na; int error; - struct netmap_bwrap_adapter *bna; - int last_instance; NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); @@ -701,28 +787,13 @@ nm_bdg_detach(struct nmreq *nmr) goto unlock_exit; } - bna = (struct netmap_bwrap_adapter *)na; - - if (na->active_fds == 0) { /* not registered */ - error = EINVAL; - goto unref_exit; - } - - last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */ - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct netmap_priv_d *npriv = bna->na_kpriv; - - bna->na_kpriv = NULL; - D("deleting priv"); - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); + if (na->nm_bdg_ctl) { + /* remove the port from bridge. The bwrap + * also needs to put the hwna in normal mode + */ + error = na->nm_bdg_ctl(na, nmr, 0); } -unref_exit: netmap_adapter_put(na); unlock_exit: NMG_UNLOCK(); @@ -731,28 +802,39 @@ unlock_exit: } -/* exported to kernel callers, e.g. OVS ? - * Entry point. +/* Called by either user's context (netmap_ioctl()) + * or external kernel modules (e.g., Openvswitch). + * Operation is indicated in nmr->nr_cmd. + * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge + * requires bdg_ops argument; the other commands ignore this argument. + * * Called without NMG_LOCK. 
*/ int -netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) +netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) { struct nm_bridge *b; struct netmap_adapter *na; struct netmap_vp_adapter *vpna; - struct ifnet *iter; char *name = nmr->nr_name; int cmd = nmr->nr_cmd, namelen = strlen(name); int error = 0, i, j; switch (cmd) { + case NETMAP_BDG_NEWIF: + error = nm_vi_create(nmr); + break; + + case NETMAP_BDG_DELIF: + error = nm_vi_destroy(nmr->nr_name); + break; + case NETMAP_BDG_ATTACH: - error = nm_bdg_attach(nmr); + error = nm_bdg_ctl_attach(nmr); break; case NETMAP_BDG_DETACH: - error = nm_bdg_detach(nmr); + error = nm_bdg_ctl_detach(nmr); break; case NETMAP_BDG_LIST: @@ -770,6 +852,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) break; } + name = name + b->bdg_namelen + 1; error = ENOENT; for (j = 0; j < b->bdg_active_ports; j++) { i = b->bdg_port_index[j]; @@ -778,11 +861,10 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) D("---AAAAAAAAARGH-------"); continue; } - iter = vpna->up.ifp; /* the former and the latter identify a * virtual port and a NIC, respectively */ - if (!strcmp(iter->if_xname, name)) { + if (!strcmp(vpna->up.name, name)) { /* bridge index */ nmr->nr_arg1 = b - nm_bridges; nmr->nr_arg2 = i; /* port index */ @@ -813,8 +895,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) nmr->nr_arg2 = j; j = b->bdg_port_index[j]; vpna = b->bdg_ports[j]; - iter = vpna->up.ifp; - strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); + strncpy(name, vpna->up.name, (size_t)IFNAMSIZ); error = 0; break; } @@ -822,12 +903,12 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) } break; - case NETMAP_BDG_LOOKUP_REG: - /* register a lookup function to the given bridge. + case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */ + /* register callbacks to the given bridge. * nmr->nr_name may be just bridge's name (including ':' * if it is not just NM_NAME). */ - if (!func) { + if (!bdg_ops) { error = EINVAL; break; } @@ -836,7 +917,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) if (!b) { error = EINVAL; } else { - b->nm_bdg_lookup = func; + b->bdg_ops = *bdg_ops; } NMG_UNLOCK(); break; @@ -856,7 +937,7 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) vpna = (struct netmap_vp_adapter *)na; vpna->virt_hdr_len = nmr->nr_arg1; if (vpna->virt_hdr_len) - vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem); + vpna->mfs = NETMAP_BUF_SIZE(na); D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); netmap_adapter_put(na); } @@ -871,6 +952,32 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) return error; } +int +netmap_bdg_config(struct nmreq *nmr) +{ + struct nm_bridge *b; + int error = EINVAL; + + NMG_LOCK(); + b = nm_find_bridge(nmr->nr_name, 0); + if (!b) { + NMG_UNLOCK(); + return error; + } + NMG_UNLOCK(); + /* Don't call config() with NMG_LOCK() held */ + BDG_RLOCK(b); + if (b->bdg_ops.config != NULL) + error = b->bdg_ops.config((struct nm_ifreq *)nmr); + BDG_RUNLOCK(b); + return error; +} + + +/* nm_krings_create callback for VALE ports. + * Calls the standard netmap_krings_create, then adds leases on rx + * rings and bdgfwd on tx rings. + */ static int netmap_vp_krings_create(struct netmap_adapter *na) { @@ -905,6 +1012,7 @@ netmap_vp_krings_create(struct netmap_adapter *na) } +/* nm_krings_delete callback for VALE ports. 
*/ static void netmap_vp_krings_delete(struct netmap_adapter *na) { @@ -919,17 +1027,20 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, /* + * main dispatch routine for the bridge. * Grab packets from a kring, move them into the ft structure * associated to the tx (input) port. Max one instance per port, * filtered on input (ioctl, poll or XXX). * Returns the next position in the ring. */ static int -nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, - struct netmap_kring *kring, u_int end) +nm_bdg_preflush(struct netmap_kring *kring, u_int end) { + struct netmap_vp_adapter *na = + (struct netmap_vp_adapter*)kring->na; struct netmap_ring *ring = kring->ring; struct nm_bdg_fwd *ft; + u_int ring_nr = kring->ring_id; u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; u_int ft_i = 0; /* start from 0 */ u_int frags = 1; /* how many frags ? */ @@ -958,12 +1069,12 @@ nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, /* this slot goes into a list so initialize the link field */ ft[ft_i].ft_next = NM_FT_NULL; buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? - (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot); + (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot); if (unlikely(buf == NULL)) { RD(5, "NULL %s buffer pointer from %s slot %d len %d", (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", kring->name, j, ft[ft_i].ft_len); - buf = ft[ft_i].ft_buf = NMB_VA(0); /* the 'null' buffer */ + buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); ft[ft_i].ft_len = 0; ft[ft_i].ft_flags = 0; } @@ -1035,23 +1146,28 @@ nm_bridge_rthash(const uint8_t *addr) #undef mix +/* nm_register callback for VALE ports */ static int -bdg_netmap_reg(struct netmap_adapter *na, int onoff) +netmap_vp_reg(struct netmap_adapter *na, int onoff) { struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; - struct ifnet *ifp = na->ifp; - /* the interface is already attached to the bridge, - * so we only need to toggle IFCAP_NETMAP. 
+ /* persistent ports may be put in netmap mode + * before being attached to a bridge */ - BDG_WLOCK(vpna->na_bdg); + if (vpna->na_bdg) + BDG_WLOCK(vpna->na_bdg); if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; + na->na_flags |= NAF_NETMAP_ON; + /* XXX on FreeBSD, persistent VALE ports should also + * toggle IFCAP_NETMAP in na->ifp (2014-03-16) + */ } else { - ifp->if_capenable &= ~IFCAP_NETMAP; + na->na_flags &= ~NAF_NETMAP_ON; } - BDG_WUNLOCK(vpna->na_bdg); + if (vpna->na_bdg) + BDG_WUNLOCK(vpna->na_bdg); return 0; } @@ -1063,16 +1179,28 @@ bdg_netmap_reg(struct netmap_adapter *na, int onoff) * ring in *dst_ring (at the moment, always use ring 0) */ u_int -netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, - struct netmap_vp_adapter *na) +netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, + const struct netmap_vp_adapter *na) { + uint8_t *buf = ft->ft_buf; + u_int buf_len = ft->ft_len; struct nm_hash_ent *ht = na->na_bdg->ht; uint32_t sh, dh; u_int dst, mysrc = na->bdg_port; uint64_t smac, dmac; - if (buf_len < 14) { - RD(5, "invalid buf length %d", buf_len); + /* safety check, unfortunately we have many cases */ + if (buf_len >= 14 + na->virt_hdr_len) { + /* virthdr + mac_hdr in the same slot */ + buf += na->virt_hdr_len; + buf_len -= na->virt_hdr_len; + } else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { + /* only header in first fragment */ + ft++; + buf = ft->ft_buf; + buf_len = ft->ft_len; + } else { + RD(5, "invalid buf format, length %d", buf_len); return NM_BDG_NOPORT; } dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; @@ -1170,7 +1298,7 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) k->nr_hwtail >= k->nkr_num_slots || k->nkr_lease_idx >= k->nkr_num_slots) { D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", - k->na->ifp->if_xname, + k->na->name, k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, k->nkr_lease_idx, k->nkr_num_slots); } @@ -1178,6 +1306,7 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) } /* + * * This flush routine supports only unicast and broadcast but a large * number of ports, and lets us replace the learn and dispatch functions. */ @@ -1204,22 +1333,13 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, uint8_t dst_ring = ring_nr; /* default, same ring as origin */ uint16_t dst_port, d_i; struct nm_bdg_q *d; - uint8_t *buf = ft[i].ft_buf; - u_int len = ft[i].ft_len; ND("slot %d frags %d", i, ft[i].ft_frags); /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. 
*/ - if (unlikely(na->virt_hdr_len > len)) + if (unlikely(na->virt_hdr_len > ft[i].ft_len)) continue; - if (len == na->virt_hdr_len) { - buf = ft[i+1].ft_buf; - len = ft[i+1].ft_len; - } else { - buf += na->virt_hdr_len; - len -= na->virt_hdr_len; - } - dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na); + dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na); if (netmap_verbose > 255) RD(5, "slot %d port %d -> %d", i, me, dst_port); if (dst_port == NM_BDG_NOPORT) @@ -1270,9 +1390,8 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, } ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); - /* second pass: scan destinations (XXX will be modular somehow) */ + /* second pass: scan destinations */ for (i = 0; i < num_dsts; i++) { - struct ifnet *dst_ifp; struct netmap_vp_adapter *dst_na; struct netmap_kring *kring; struct netmap_ring *ring; @@ -1296,13 +1415,12 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, goto cleanup; if (dst_na->up.na_flags & NAF_SW_ONLY) goto cleanup; - dst_ifp = dst_na->up.ifp; /* * The interface may be in !netmap mode in two cases: * - when na is attached but not activated yet; * - when na is being deactivated but is still attached. */ - if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { + if (unlikely(!nm_netmap_on(&dst_na->up))) { ND("not in netmap mode!"); goto cleanup; } @@ -1320,7 +1438,7 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, needed = d->bq_len + brddst->bq_len; if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { - RD(3, "virt_hdr_mismatch, src %d len %d", na->virt_hdr_len, dst_na->virt_hdr_len); + RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len); /* There is a virtio-net header/offloadings mismatch between * source and destination. The slower mismatch datapath will * be used to cope with all the mismatches. @@ -1358,6 +1476,10 @@ retry: if (dst_na->retry && retry) { /* try to get some free slot from the previous run */ dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + /* actually useful only for bwraps, since there + * the notify will trigger a txsync on the hwna. VALE ports + * have dst_na->retry == 0 + */ } /* reserve the buffers in the queue and an entry * to report completion, and drop lock. @@ -1413,7 +1535,7 @@ retry: size_t copy_len = ft_p->ft_len, dst_len = copy_len; slot = &ring->slot[j]; - dst = BDG_NMB(&dst_na->up, slot); + dst = NMB(&dst_na->up, slot); ND("send [%d] %d(%d) bytes at %s:%d", i, (int)copy_len, (int)dst_len, @@ -1421,8 +1543,8 @@ retry: /* round to a multiple of 64 */ copy_len = (copy_len + 63) & ~63; - if (unlikely(copy_len > NETMAP_BUF_SIZE || - copy_len > NETMAP_BUF_SIZE)) { + if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || + copy_len > NETMAP_BUF_SIZE(&na->up))) { RD(5, "invalid len %d, down to 64", (int)copy_len); copy_len = dst_len = 64; // XXX } @@ -1495,8 +1617,16 @@ retry: still_locked = 0; mtx_unlock(&kring->q_lock); dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); - if (dst_na->retry && retry--) + /* this is netmap_notify for VALE ports and + * netmap_bwrap_notify for bwrap. The latter will + * trigger a txsync on the underlying hwna + */ + if (dst_na->retry && retry--) { + /* XXX this is going to call nm_notify again. 
+ * Only useful for bwrap in virtual machines + */ goto retry; + } } } if (still_locked) @@ -1511,11 +1641,12 @@ cleanup: return 0; } - +/* nm_txsync callback for VALE ports */ static int -netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) +netmap_vp_txsync(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; + struct netmap_vp_adapter *na = + (struct netmap_vp_adapter *)kring->na; u_int done; u_int const lim = kring->nkr_num_slots - 1; u_int const cur = kring->rcur; @@ -1524,10 +1655,14 @@ netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) done = cur; // used all goto done; } + if (!na->na_bdg) { + done = cur; + goto done; + } if (bridge_batch > NM_BDG_BATCH) bridge_batch = NM_BDG_BATCH; - done = nm_bdg_preflush(na, ring_nr, kring, cur); + done = nm_bdg_preflush(kring, cur); done: if (done != cur) D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail); @@ -1538,27 +1673,18 @@ done: kring->nr_hwtail = nm_prev(done, lim); nm_txsync_finalize(kring); if (netmap_verbose) - D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); + D("%s ring %d flags %d", na->up.name, kring->ring_id, flags); return 0; } -/* - * main dispatch routine for the bridge. - * We already know that only one thread is running this. - * we must run nm_bdg_preflush without lock. +/* rxsync code used by VALE ports nm_rxsync callback and also + * internally by the brwap */ static int -bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags) { - struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; - return netmap_vp_txsync(vpna, ring_nr, flags); -} - -static int -netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) -{ - struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; u_int nm_i, lim = kring->nkr_num_slots - 1; u_int head = nm_rxsync_prologue(kring); @@ -1579,9 +1705,9 @@ netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) /* consistency check, but nothing really important here */ for (n = 0; likely(nm_i != head); n++) { struct netmap_slot *slot = &ring->slot[nm_i]; - void *addr = BDG_NMB(na, slot); + void *addr = NMB(na, slot); - if (addr == netmap_buffer_base) { /* bad buf */ + if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */ D("bad buffer index %d, ignore ?", slot->buf_idx); } @@ -1599,26 +1725,45 @@ done: } /* + * nm_rxsync callback for VALE ports * user process reading from a VALE switch. * Already protected against concurrent calls from userspace, * but we must acquire the queue's lock to protect against * writers on the same queue. */ static int -bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +netmap_vp_rxsync(struct netmap_kring *kring, int flags) { - struct netmap_kring *kring = &na->rx_rings[ring_nr]; int n; mtx_lock(&kring->q_lock); - n = netmap_vp_rxsync(na, ring_nr, flags); + n = netmap_vp_rxsync_locked(kring, flags); mtx_unlock(&kring->q_lock); return n; } +/* nm_bdg_attach callback for VALE ports + * The na_vp port is this same netmap_adapter. There is no host port. 
+ */ +static int +netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; + + if (vpna->na_bdg) + return EBUSY; + na->na_vp = vpna; + strncpy(na->name, name, sizeof(na->name)); + na->na_hostvp = NULL; + return 0; +} + +/* create a netmap_vp_adapter that describes a VALE port. + * Only persistent VALE ports have a non-null ifp. + */ static int -bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) +netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret) { struct netmap_vp_adapter *vpna; struct netmap_adapter *na; @@ -1632,6 +1777,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) na = &vpna->up; na->ifp = ifp; + strncpy(na->name, nmr->nr_name, sizeof(na->name)); /* bound checking */ na->num_tx_rings = nmr->nr_tx_rings; @@ -1664,22 +1810,24 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) D("max frame size %u", vpna->mfs); na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; - na->nm_txsync = bdg_netmap_txsync; - na->nm_rxsync = bdg_netmap_rxsync; - na->nm_register = bdg_netmap_reg; - na->nm_dtor = netmap_adapter_vp_dtor; + na->nm_txsync = netmap_vp_txsync; + na->nm_rxsync = netmap_vp_rxsync; + na->nm_register = netmap_vp_reg; na->nm_krings_create = netmap_vp_krings_create; na->nm_krings_delete = netmap_vp_krings_delete; - na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), + na->nm_dtor = netmap_vp_dtor; + na->nm_mem = netmap_mem_private_new(na->name, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc, nmr->nr_arg3, npipes, &error); if (na->nm_mem == NULL) goto err; + na->nm_bdg_attach = netmap_vp_bdg_attach; /* other nmd fields are set in the common routine */ error = netmap_attach_common(na); if (error) goto err; + *ret = vpna; return 0; err: @@ -1689,30 +1837,60 @@ err: return error; } +/* Bridge wrapper code (bwrap). + * This is used to connect a non-VALE-port netmap_adapter (hwna) to a + * VALE switch. + * The main task is to swap the meaning of tx and rx rings to match the + * expectations of the VALE switch code (see nm_bdg_flush). + * + * The bwrap works by interposing a netmap_bwrap_adapter between the + * rest of the system and the hwna. The netmap_bwrap_adapter looks like + * a netmap_vp_adapter to the rest the system, but, internally, it + * translates all callbacks to what the hwna expects. + * + * Note that we have to intercept callbacks coming from two sides: + * + * - callbacks coming from the netmap module are intercepted by + * passing around the netmap_bwrap_adapter instead of the hwna + * + * - callbacks coming from outside of the netmap module only know + * about the hwna. This, however, only happens in interrupt + * handlers, where only the hwna->nm_notify callback is called. + * What the bwrap does is to overwrite the hwna->nm_notify callback + * with its own netmap_bwrap_intr_notify. + * XXX This assumes that the hwna->nm_notify callback was the + * standard netmap_notify(), as it is the case for nic adapters. + * Any additional action performed by hwna->nm_notify will not be + * performed by netmap_bwrap_intr_notify. + * + * Additionally, the bwrap can optionally attach the host rings pair + * of the wrapped adapter to a different port of the switch. 
+ */ + static void netmap_bwrap_dtor(struct netmap_adapter *na) { struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; struct netmap_adapter *hwna = bna->hwna; - struct nm_bridge *b = bna->up.na_bdg, - *bh = bna->host.na_bdg; - struct ifnet *ifp = na->ifp; ND("na %p", na); + /* drop reference to hwna->ifp. + * If we don't do this, netmap_detach_common(na) + * will think it has set NA(na->ifp) to NULL + */ + na->ifp = NULL; + /* for safety, also drop the possible reference + * in the hostna + */ + bna->host.up.ifp = NULL; - if (b) { - netmap_bdg_detach_common(b, bna->up.bdg_port, - (bh ? bna->host.bdg_port : -1)); - } - + hwna->nm_mem = bna->save_nmd; hwna->na_private = NULL; + hwna->na_vp = hwna->na_hostvp = NULL; + hwna->na_flags &= ~NAF_BUSY; netmap_adapter_put(hwna); - bzero(ifp, sizeof(*ifp)); - free(ifp, M_DEVBUF); - na->ifp = NULL; - } @@ -1737,7 +1915,6 @@ netmap_bwrap_dtor(struct netmap_adapter *na) static int netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags) { - struct ifnet *ifp = na->ifp; struct netmap_bwrap_adapter *bna = na->na_private; struct netmap_vp_adapter *hostna = &bna->host; struct netmap_kring *kring, *bkring; @@ -1747,20 +1924,24 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int error = 0; if (netmap_verbose) - D("%s %s%d 0x%x", NM_IFPNAME(ifp), + D("%s %s%d 0x%x", na->name, (tx == NR_TX ? "TX" : "RX"), ring_nr, flags); if (flags & NAF_DISABLE_NOTIFY) { - kring = tx == NR_TX ? na->tx_rings : na->rx_rings; - bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings; - if (kring[ring_nr].nkr_stopped) - netmap_disable_ring(&bkring[ring_nr]); - else - bkring[ring_nr].nkr_stopped = 0; + /* the enabled/disabled state of the ring has changed, + * propagate the info to the wrapper (with tx/rx swapped) + */ + if (tx == NR_TX) { + netmap_set_rxring(&vpna->up, ring_nr, + na->tx_rings[ring_nr].nkr_stopped); + } else { + netmap_set_txring(&vpna->up, ring_nr, + na->rx_rings[ring_nr].nkr_stopped); + } return 0; } - if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) + if (!nm_netmap_on(na)) return 0; /* we only care about receive interrupts */ @@ -1786,7 +1967,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, * the info from the rx kring. */ if (netmap_verbose) - D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp), + D("%s head %d cur %d tail %d (kring %d %d %d)", na->name, ring->head, ring->cur, ring->tail, kring->rhead, kring->rcur, kring->rtail); @@ -1807,7 +1988,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, goto put_out; if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { D("how strange, interrupt with no packets on %s", - NM_IFPNAME(ifp)); + na->name); goto put_out; } @@ -1823,7 +2004,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, /* pass packets to the switch */ nm_txsync_prologue(bkring); // XXX error checking ? - netmap_vp_txsync(vpna, ring_nr, flags); + netmap_vp_txsync(bkring, flags); /* mark all buffers as released on this ring */ ring->head = ring->cur = kring->nr_hwtail; @@ -1845,6 +2026,7 @@ put_out: } +/* nm_register callback for bwrap */ static int netmap_bwrap_register(struct netmap_adapter *na, int onoff) { @@ -1854,22 +2036,35 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) struct netmap_vp_adapter *hostna = &bna->host; int error; - ND("%s %s", NM_IFPNAME(na->ifp), onoff ? 
"on" : "off"); + ND("%s %s", na->name, onoff ? "on" : "off"); if (onoff) { int i; + /* netmap_do_regif has been called on the bwrap na. + * We need to pass the information about the + * memory allocator down to the hwna before + * putting it in netmap mode + */ hwna->na_lut = na->na_lut; hwna->na_lut_objtotal = na->na_lut_objtotal; + hwna->na_lut_objsize = na->na_lut_objsize; if (hostna->na_bdg) { + /* if the host rings have been attached to switch, + * we need to copy the memory allocator information + * in the hostna also + */ hostna->up.na_lut = na->na_lut; hostna->up.na_lut_objtotal = na->na_lut_objtotal; + hostna->up.na_lut_objsize = na->na_lut_objsize; } /* cross-link the netmap rings * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. + * We need to do this now, after the initialization + * of the kring->ring pointers */ for (i = 0; i < na->num_rx_rings + 1; i++) { hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots; @@ -1881,27 +2076,31 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) } } - if (hwna->ifp) { - error = hwna->nm_register(hwna, onoff); - if (error) - return error; - } + /* forward the request to the hwna */ + error = hwna->nm_register(hwna, onoff); + if (error) + return error; - bdg_netmap_reg(na, onoff); + /* impersonate a netmap_vp_adapter */ + netmap_vp_reg(na, onoff); + if (hostna->na_bdg) + netmap_vp_reg(&hostna->up, onoff); if (onoff) { + /* intercept the hwna nm_nofify callback */ bna->save_notify = hwna->nm_notify; hwna->nm_notify = netmap_bwrap_intr_notify; } else { hwna->nm_notify = bna->save_notify; hwna->na_lut = NULL; hwna->na_lut_objtotal = 0; + hwna->na_lut_objsize = 0; } return 0; } - +/* nm_config callback for bwrap */ static int netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd) @@ -1922,6 +2121,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, } +/* nm_krings_create callback for bwrap */ static int netmap_bwrap_krings_create(struct netmap_adapter *na) { @@ -1931,21 +2131,33 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) struct netmap_adapter *hostna = &bna->host.up; int error; - ND("%s", NM_IFPNAME(na->ifp)); + ND("%s", na->name); + /* impersonate a netmap_vp_adapter */ error = netmap_vp_krings_create(na); if (error) return error; + /* also create the hwna krings */ error = hwna->nm_krings_create(hwna); if (error) { netmap_vp_krings_delete(na); return error; } + /* the connection between the bwrap krings and the hwna krings + * will be perfomed later, in the nm_register callback, since + * now the kring->ring pointers have not been initialized yet + */ if (na->na_flags & NAF_HOST_RINGS) { + /* the hostna rings are the host rings of the bwrap. 
 - +/* nm_config callback for bwrap */ static int netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, u_int *rxr, u_int *rxd) @@ -1922,6 +2121,7 @@ netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, } +/* nm_krings_create callback for bwrap */ static int netmap_bwrap_krings_create(struct netmap_adapter *na) { @@ -1931,21 +2131,33 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) struct netmap_adapter *hostna = &bna->host.up; int error; - ND("%s", NM_IFPNAME(na->ifp)); + ND("%s", na->name); + /* impersonate a netmap_vp_adapter */ error = netmap_vp_krings_create(na); if (error) return error; + /* also create the hwna krings */ error = hwna->nm_krings_create(hwna); if (error) { netmap_vp_krings_delete(na); return error; } + /* the connection between the bwrap krings and the hwna krings + * will be performed later, in the nm_register callback, since + * now the kring->ring pointers have not been initialized yet + */ if (na->na_flags & NAF_HOST_RINGS) { + /* the hostna rings are the host rings of the bwrap. + * The corresponding krings must point back to the + * hostna + */ hostna->tx_rings = na->tx_rings + na->num_tx_rings; + hostna->tx_rings[0].na = hostna; hostna->rx_rings = na->rx_rings + na->num_rx_rings; + hostna->rx_rings[0].na = hostna; } return 0; @@ -1959,7 +2171,7 @@ netmap_bwrap_krings_delete(struct netmap_adapter *na) (struct netmap_bwrap_adapter *)na; struct netmap_adapter *hwna = bna->hwna; - ND("%s", NM_IFPNAME(na->ifp)); + ND("%s", na->name); hwna->nm_krings_delete(hwna); netmap_vp_krings_delete(na); @@ -1986,13 +2198,13 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f ring = kring->ring; lim = kring->nkr_num_slots - 1; - if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) + if (!nm_netmap_on(hwna)) return 0; mtx_lock(&kring->q_lock); /* first step: simulate a user wakeup on the rx ring */ - netmap_vp_rxsync(na, ring_n, flags); + netmap_vp_rxsync_locked(kring, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", - NM_IFPNAME(na->ifp), ring_n, + na->name, ring_n, kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); @@ -2013,9 +2225,9 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f ring->tail = kring->rtail; /* restore saved value of tail, for safety */ /* fifth step: the user goes to sleep again, causing another rxsync */ - netmap_vp_rxsync(na, ring_n, flags); + netmap_vp_rxsync_locked(kring, flags); ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", - NM_IFPNAME(na->ifp), ring_n, + na->name, ring_n, kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); @@ -2024,6 +2236,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f } +/* notify method for the bridge-->host-rings path */ static int netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) { @@ -2035,23 +2248,95 @@ netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, } -/* attach a bridge wrapper to the 'real' device */ +/* nm_bdg_ctl callback for the bwrap. + * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd]. + * On attach, it needs to provide a fake netmap_priv_d structure and + * perform a netmap_do_regif() on the bwrap. This will put both the + * bwrap and the hwna in netmap mode, with the netmap rings shared + * and cross-linked. Moreover, it will start intercepting interrupts + * directed to hwna. 
+ */ static int -netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) +netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) +{ + struct netmap_priv_d *npriv; + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; + struct netmap_if *nifp; + int error = 0; + + if (attach) { + if (NETMAP_OWNED_BY_ANY(na)) { + return EBUSY; + } + if (bna->na_kpriv) { + /* nothing to do */ + return 0; + } + npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + if (npriv == NULL) + return ENOMEM; + nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error); + if (!nifp) { + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + return error; + } + bna->na_kpriv = npriv; + na->na_flags |= NAF_BUSY; + } else { + int last_instance; + + if (na->active_fds == 0) /* not registered */ + return EINVAL; + last_instance = netmap_dtor_locked(bna->na_kpriv); + if (!last_instance) { + D("--- error, trying to detach an entry with active mmaps"); + error = EINVAL; + } else { + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + npriv = bna->na_kpriv; + bna->na_kpriv = NULL; + D("deleting priv"); + + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + if (b) { + /* XXX the bwrap dtor should take care + * of this (2014-06-16) + */ + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } + na->na_flags &= ~NAF_BUSY; + } + } + return error; + +} + +/* attach a bridge wrapper to the 'real' device */ +int +netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) { struct netmap_bwrap_adapter *bna; - struct netmap_adapter *na; - struct netmap_adapter *hwna = NA(real); - struct netmap_adapter *hostna; - int error; + struct netmap_adapter *na = NULL; + struct netmap_adapter *hostna = NULL; + int error = 0; + /* make sure the NIC is not already in use */ + if (NETMAP_OWNED_BY_ANY(hwna)) { + D("NIC %s busy, cannot attach to bridge", hwna->name); + return EBUSY; + } bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); - if (bna == NULL) + if (bna == NULL) { return ENOMEM; + } na = &bna->up.up; - na->ifp = fake; + strncpy(na->name, nr_name, sizeof(na->name)); /* fill the ring data for the bwrap adapter with rx/tx meanings * swapped. The real cross-linking will be done during register, * when all the krings will have been created. 
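netmap_bwrap_bdg_ctl() above is triggered by NIOCREGIF with nr_cmd set to NETMAP_BDG_ATTACH or NETMAP_BDG_DETACH. A hedged sketch of the attach request, modelled on the vale-ctl code at the end of this patch ("vale0:em0" is an example name, not a fixed convention of this sketch's making):

    /* sketch: attach a NIC (or persistent vport) to a VALE switch.
     * nr_arg1 = NETMAP_BDG_HOST additionally binds the host rings
     * (the hostna port set up below), which is what vale-ctl -h does.
     */
    #include <fcntl.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <net/if.h>
    #include <net/netmap.h>

    static int
    vale_attach(const char *name /* e.g. "vale0:em0" */, int with_host_rings)
    {
        struct nmreq nmr;
        int fd, ret;

        fd = open("/dev/netmap", O_RDWR);
        if (fd < 0)
            return -1;
        memset(&nmr, 0, sizeof(nmr));
        nmr.nr_version = NETMAP_API;
        strncpy(nmr.nr_name, name, sizeof(nmr.nr_name));
        nmr.nr_cmd = NETMAP_BDG_ATTACH;     /* NETMAP_BDG_DETACH to undo */
        nmr.nr_arg1 = with_host_rings ? NETMAP_BDG_HOST : 0;
        ret = ioctl(fd, NIOCREGIF, &nmr);
        close(fd);
        return ret;
    }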
@@ -2068,17 +2353,28 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) na->nm_krings_create = netmap_bwrap_krings_create; na->nm_krings_delete = netmap_bwrap_krings_delete; na->nm_notify = netmap_bwrap_notify; - na->nm_mem = hwna->nm_mem; - na->na_private = na; /* prevent NIOCREGIF */ + na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; + na->pdev = hwna->pdev; + na->nm_mem = netmap_mem_private_new(na->name, + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc, + 0, 0, &error); + na->na_flags |= NAF_MEM_OWNER; + if (na->nm_mem == NULL) + goto err_put; bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ bna->hwna = hwna; netmap_adapter_get(hwna); hwna->na_private = bna; /* weak reference */ - + hwna->na_vp = &bna->up; + if (hwna->na_flags & NAF_HOST_RINGS) { + if (hwna->na_flags & NAF_SW_ONLY) + na->na_flags |= NAF_SW_ONLY; na->na_flags |= NAF_HOST_RINGS; hostna = &bna->host.up; + snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name); hostna->ifp = hwna->ifp; hostna->num_tx_rings = 1; hostna->num_tx_desc = hwna->num_rx_desc; @@ -2089,20 +2385,44 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) hostna->nm_notify = netmap_bwrap_host_notify; hostna->nm_mem = na->nm_mem; hostna->na_private = bna; + hostna->na_vp = &bna->up; + na->na_hostvp = hwna->na_hostvp = + hostna->na_hostvp = &bna->host; + hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */ } ND("%s<->%s txr %d txd %d rxr %d rxd %d", - fake->if_xname, real->if_xname, + na->name, ifp->if_xname, na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); error = netmap_attach_common(na); if (error) { - netmap_adapter_put(hwna); - free(bna, M_DEVBUF); - return error; + goto err_free; } + /* make bwrap ifp point to the real ifp + * NOTE: netmap_attach_common() interprets a non-NULL na->ifp + * as a request to make the ifp point to the na. Since we + * do not want to change the na already pointed to by hwna->ifp, + * the following assignment has to be delayed until now + */ + na->ifp = hwna->ifp; + hwna->na_flags |= NAF_BUSY; + /* make hwna point to the allocator we are actually using, + * so that monitors will be able to find it + */ + bna->save_nmd = hwna->nm_mem; + hwna->nm_mem = na->nm_mem; return 0; + +err_free: + netmap_mem_private_delete(na->nm_mem); +err_put: + hwna->na_vp = hwna->na_hostvp = NULL; + netmap_adapter_put(hwna); + free(bna, M_DEVBUF); + return error; + } diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index 8e78fa8..c7cd874 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -37,6 +37,8 @@ * */ +// #define TRASH_VHOST_HDR + #define _GNU_SOURCE /* for CPU_SET() */ #include <stdio.h> #define NETMAP_WITH_LIBS @@ -123,12 +125,14 @@ struct virt_header { uint8_t fields[VIRT_HDR_MAX]; }; +#define MAX_BODYSIZE 16384 + struct pkt { struct virt_header vh; struct ether_header eh; struct ip ip; struct udphdr udp; - uint8_t body[2048]; // XXX hardwired + uint8_t body[MAX_BODYSIZE]; // XXX hardwired } __attribute__((__packed__)); struct ip_range { @@ -144,6 +148,15 @@ struct mac_range { /* ifname can be netmap:foo-xxxx */ #define MAX_IFNAMELEN 64 /* our buffer for ifname */ +//#define MAX_PKTSIZE 1536 +#define MAX_PKTSIZE MAX_BODYSIZE /* XXX: + IP_HDR + ETH_HDR */ + +/* compact timestamp to fit into 60 byte packet. 
(enough to obtain RTT) */ +struct tstamp { + uint32_t sec; + uint32_t nsec; +}; + /* * global arguments for all threads */ @@ -168,6 +181,8 @@ struct glob_arg { #define OPT_TS 16 /* add a timestamp */ #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ +#define OPT_MONITOR_TX 128 +#define OPT_MONITOR_RX 256 int dev_type; #ifndef NO_PCAP pcap_t *p; @@ -179,7 +194,6 @@ struct glob_arg { int affinity; int main_fd; struct nm_desc *nmd; - uint64_t nmd_flags; int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); void *mmap_addr; @@ -309,6 +323,7 @@ sigint_h(int sig) int i; (void)sig; /* UNUSED */ + D("received control-C on thread %p", pthread_self()); for (i = 0; i < global_nthreads; i++) { targs[i].cancel = 1; } @@ -642,9 +657,37 @@ initialize_packet(struct targ *targ) eh->ether_type = htons(ETHERTYPE_IP); bzero(&pkt->vh, sizeof(pkt->vh)); +#ifdef TRASH_VHOST_HDR + /* set bogus content */ + pkt->vh.fields[0] = 0xff; + pkt->vh.fields[1] = 0xff; + pkt->vh.fields[2] = 0xff; + pkt->vh.fields[3] = 0xff; + pkt->vh.fields[4] = 0xff; + pkt->vh.fields[5] = 0xff; +#endif /* TRASH_VHOST_HDR */ // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } +static void +set_vnet_hdr_len(struct targ *t) +{ + int err, l = t->g->virt_header; + struct nmreq req; + + if (l == 0) + return; + + memset(&req, 0, sizeof(req)); + bcopy(t->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name)); + req.nr_version = NETMAP_API; + req.nr_cmd = NETMAP_BDG_VNET_HDR; + req.nr_arg1 = l; + err = ioctl(t->fd, NIOCREGIF, &req); + if (err) { + D("Unable to set vnet header length %d", l); + } +} /* @@ -760,10 +803,13 @@ pinger_body(void *data) if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); } else { + struct tstamp *tp; nm_pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); - bcopy(&ts, p+46, sizeof(ts)); + tp = (struct tstamp *)(p+46); + tp->sec = (uint32_t)ts.tv_sec; + tp->nsec = (uint32_t)ts.tv_nsec; sent++; ring->head = ring->cur = nm_ring_next(ring, ring->cur); } @@ -780,12 +826,15 @@ pinger_body(void *data) ring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(ring)) { uint32_t seq; + struct tstamp *tp; slot = &ring->slot[ring->cur]; p = NETMAP_BUF(ring, slot->buf_idx); clock_gettime(CLOCK_REALTIME_PRECISE, &now); bcopy(p+42, &seq, sizeof(seq)); - bcopy(p+46, &ts, sizeof(ts)); + tp = (struct tstamp *)(p+46); + ts.tv_sec = (time_t)tp->sec; + ts.tv_nsec = (long)tp->nsec; ts.tv_sec = now.tv_sec - ts.tv_sec; ts.tv_nsec = now.tv_nsec - ts.tv_nsec; if (ts.tv_nsec < 0) { @@ -978,7 +1027,7 @@ sender_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; - struct netmap_if *nifp = targ->nmd->nifp; + struct netmap_if *nifp; struct netmap_ring *txring; int i, n = targ->g->npackets / targ->g->nthreads; int64_t sent = 0; @@ -993,7 +1042,7 @@ sender_body(void *data) frame += sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; - D("start"); + D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); if (setaffinity(targ->thread, targ->affinity)) goto quit; @@ -1035,6 +1084,7 @@ sender_body(void *data) int tosend = 0; int frags = targ->g->frags; + nifp = targ->nmd->nifp; while (!targ->cancel && (n == 0 || sent < n)) { if (rate_limit && tosend <= 0) { @@ -1088,12 +1138,17 @@ sender_body(void *data) } } /* flush any remaining packets */ + D("flush tail %d head %d on thread %p", + txring->tail, txring->head, + 
pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait all the TX queues to be empty. */ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); while (nm_tx_pending(txring)) { + RD(5, "pending tx tail %d head %d on ring %d", + txring->tail, txring->head, i); ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } @@ -1152,7 +1207,7 @@ receiver_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; - struct netmap_if *nifp = targ->nmd->nifp; + struct netmap_if *nifp; struct netmap_ring *rxring; int i; uint64_t received = 0; @@ -1160,21 +1215,21 @@ receiver_body(void *data) if (setaffinity(targ->thread, targ->affinity)) goto quit; + D("reading from %s fd %d main_fd %d", + targ->g->ifname, targ->fd, targ->g->main_fd); /* unbounded wait for the first packet. */ - for (;;) { + for (;!targ->cancel;) { i = poll(&pfd, 1, 1000); if (i > 0 && !(pfd.revents & POLLERR)) break; RD(1, "waiting for initial packets, poll returns %d %d", i, pfd.revents); } - /* main loop, exit after 1s silence */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (targ->g->dev_type == DEV_TAP) { - D("reading from %s fd %d", targ->g->ifname, targ->g->main_fd); while (!targ->cancel) { - char buf[2048]; + char buf[MAX_BODYSIZE]; /* XXX should we poll ? */ if (read(targ->g->main_fd, buf, sizeof(buf)) > 0) targ->count++; @@ -1183,11 +1238,14 @@ receiver_body(void *data) } else if (targ->g->dev_type == DEV_PCAP) { while (!targ->cancel) { /* XXX should we poll ? */ - pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL); + pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, + (u_char *)&targ->count); } #endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; + + nifp = targ->nmd->nifp; while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ @@ -1333,6 +1391,8 @@ start_threads(struct glob_arg *g) if (g->dev_type == DEV_NETMAP) { struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ + uint64_t nmd_flags = 0; + nmd.self = &nmd; if (g->nthreads > 1) { if (nmd.req.nr_flags != NR_REG_ALL_NIC) { @@ -1344,18 +1404,23 @@ start_threads(struct glob_arg *g) } /* Only touch one of the rings (rx is already ok) */ if (g->td_body == receiver_body) - nmd.req.nr_ringid |= NETMAP_NO_TX_POLL; + nmd_flags |= NETMAP_NO_TX_POLL; /* register interface. Override ifname and ringid etc. 
*/ + if (g->options & OPT_MONITOR_TX) + nmd.req.nr_flags |= NR_MONITOR_TX; + if (g->options & OPT_MONITOR_RX) + nmd.req.nr_flags |= NR_MONITOR_RX; - t->nmd = nm_open(t->g->ifname, NULL, g->nmd_flags | - NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, g->nmd); + t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); if (t->nmd == NULL) { D("Unable to open %s: %s", t->g->ifname, strerror(errno)); continue; } t->fd = t->nmd->fd; + set_vnet_hdr_len(t); } else { targs[i].fd = g->main_fd; @@ -1573,7 +1638,7 @@ main(int arc, char **argv) g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:")) != -1) { + "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:m:")) != -1) { struct sf *fn; switch(ch) { @@ -1707,6 +1772,15 @@ main(int arc, char **argv) case 'e': /* extra bufs */ g.extra_bufs = atoi(optarg); break; + case 'm': + if (strcmp(optarg, "tx") == 0) { + g.options |= OPT_MONITOR_TX; + } else if (strcmp(optarg, "rx") == 0) { + g.options |= OPT_MONITOR_RX; + } else { + D("unrecognized monitor mode %s", optarg); + } + break; } } @@ -1723,8 +1797,8 @@ main(int arc, char **argv) if (g.cpus == 0) g.cpus = i; - if (g.pkt_size < 16 || g.pkt_size > 1536) { - D("bad pktsize %d\n", g.pkt_size); + if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) { + D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE); usage(); } @@ -1766,26 +1840,25 @@ main(int arc, char **argv) } else if (g.dev_type == DEV_PCAP) { char pcap_errbuf[PCAP_ERRBUF_SIZE]; - D("using pcap on %s", g.ifname); pcap_errbuf[0] = '\0'; // init the buffer - g.p = pcap_open_live(g.ifname, 0, 1, 100, pcap_errbuf); + g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf); if (g.p == NULL) { D("cannot open pcap on %s", g.ifname); usage(); } + g.main_fd = pcap_fileno(g.p); + D("using pcap on %s fileno %d", g.ifname, g.main_fd); #endif /* !NO_PCAP */ } else if (g.dummy_send) { /* but DEV_NETMAP */ D("using a dummy send routine"); } else { - struct nm_desc base_nmd; + struct nmreq base_nmd; bzero(&base_nmd, sizeof(base_nmd)); - g.nmd_flags = 0; - g.nmd_flags |= parse_nmr_config(g.nmr_config, &base_nmd.req); + parse_nmr_config(g.nmr_config, &base_nmd); if (g.extra_bufs) { - base_nmd.req.nr_arg3 = g.extra_bufs; - g.nmd_flags |= NM_OPEN_ARG3; + base_nmd.nr_arg3 = g.extra_bufs; } /* @@ -1795,7 +1868,7 @@ main(int arc, char **argv) * which in turn may take some time for the PHY to * reconfigure. We do the open here to have time to reset. */ - g.nmd = nm_open(g.ifname, NULL, g.nmd_flags, &base_nmd); + g.nmd = nm_open(g.ifname, &base_nmd, 0, NULL); if (g.nmd == NULL) { D("Unable to open %s: %s", g.ifname, strerror(errno)); goto out; @@ -1803,7 +1876,11 @@ main(int arc, char **argv) g.main_fd = g.nmd->fd; D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem); - devqueues = g.nmd->req.nr_rx_rings; + /* get num of queues in tx or rx */ + if (g.td_body == sender_body) + devqueues = g.nmd->req.nr_tx_rings; + else + devqueues = g.nmd->req.nr_rx_rings; /* validate provided nthreads. 
*/ if (g.nthreads < 1 || g.nthreads > devqueues) { @@ -1819,12 +1896,14 @@ main(int arc, char **argv) req->nr_offset, req->nr_tx_rings, req->nr_rx_rings, req->nr_arg2); for (i = 0; i <= req->nr_tx_rings; i++) { - D(" TX%d at 0x%lx", i, - (char *)NETMAP_TXRING(nifp, i) - (char *)nifp); + struct netmap_ring *ring = NETMAP_TXRING(nifp, i); + D(" TX%d at 0x%lx slots %d", i, + (char *)ring - (char *)nifp, ring->num_slots); } for (i = 0; i <= req->nr_rx_rings; i++) { - D(" RX%d at 0x%lx", i, - (char *)NETMAP_RXRING(nifp, i) - (char *)nifp); + struct netmap_ring *ring = NETMAP_RXRING(nifp, i); + D(" RX%d at 0x%lx slots %d", i, + (char *)ring - (char *)nifp, ring->num_slots); } } diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c index e1d8da5..c9e5f31 100644 --- a/tools/tools/netmap/vale-ctl.c +++ b/tools/tools/netmap/vale-ctl.c @@ -38,6 +38,7 @@ #include <net/netmap.h> #include <net/netmap_user.h> #include <libgen.h> /* basename */ +#include <stdlib.h> /* atoi, free */ /* debug support */ #define ND(format, ...) do {} while(0) @@ -45,8 +46,47 @@ fprintf(stderr, "%s [%d] " format "\n", \ __FUNCTION__, __LINE__, ##__VA_ARGS__) +/* XXX cut and paste from pkt-gen.c because I'm not sure whether this + * program may include nm_util.h + */ +void parse_nmr_config(const char* conf, struct nmreq *nmr) +{ + char *w, *tok; + int i, v; + + nmr->nr_tx_rings = nmr->nr_rx_rings = 0; + nmr->nr_tx_slots = nmr->nr_rx_slots = 0; + if (conf == NULL || ! *conf) + return; + w = strdup(conf); + for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { + v = atoi(tok); + switch (i) { + case 0: + nmr->nr_tx_slots = nmr->nr_rx_slots = v; + break; + case 1: + nmr->nr_rx_slots = v; + break; + case 2: + nmr->nr_tx_rings = nmr->nr_rx_rings = v; + break; + case 3: + nmr->nr_rx_rings = v; + break; + default: + D("ignored config: %s", tok); + break; + } + } + D("txr %d txd %d rxr %d rxd %d", + nmr->nr_tx_rings, nmr->nr_tx_slots, + nmr->nr_rx_rings, nmr->nr_rx_slots); + free(w); +} + static int -bdg_ctl(const char *name, int nr_cmd, int nr_arg) +bdg_ctl(const char *name, int nr_cmd, int nr_arg, char *nmr_config) { struct nmreq nmr; int error = 0; @@ -62,8 +102,19 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg) if (name != NULL) /* might be NULL */ strncpy(nmr.nr_name, name, sizeof(nmr.nr_name)); nmr.nr_cmd = nr_cmd; + parse_nmr_config(nmr_config, &nmr); switch (nr_cmd) { + case NETMAP_BDG_DELIF: + case NETMAP_BDG_NEWIF: + error = ioctl(fd, NIOCREGIF, &nmr); + if (error == -1) { + ND("Unable to %s %s", nr_cmd == NETMAP_BDG_DELIF ? "delete":"create", name); + perror(name); + } else { + ND("Success to %s %s", nr_cmd == NETMAP_BDG_DELIF ? 
"delete":"create", name); + } + break; case NETMAP_BDG_ATTACH: case NETMAP_BDG_DETACH: if (nr_arg && nr_arg != NETMAP_BDG_HOST) @@ -120,7 +171,7 @@ main(int argc, char *argv[]) { int ch, nr_cmd = 0, nr_arg = 0; const char *command = basename(argv[0]); - char *name = NULL; + char *name = NULL, *nmr_config = NULL; if (argc > 3) { usage: @@ -131,12 +182,15 @@ usage: "\t-d interface interface name to be detached\n" "\t-a interface interface name to be attached\n" "\t-h interface interface name to be attached with the host stack\n" + "\t-n interface interface name to be created\n" + "\t-r interface interface name to be deleted\n" "\t-l list all or specified bridge's interfaces (default)\n" + "\t-C string ring/slot setting of an interface creating by -n\n" "", command); return 0; } - while ((ch = getopt(argc, argv, "d:a:h:g:l")) != -1) { + while ((ch = getopt(argc, argv, "d:a:h:g:l:n:r:C:")) != -1) { name = optarg; /* default */ switch (ch) { default: @@ -152,6 +206,12 @@ usage: nr_cmd = NETMAP_BDG_ATTACH; nr_arg = NETMAP_BDG_HOST; break; + case 'n': + nr_cmd = NETMAP_BDG_NEWIF; + break; + case 'r': + nr_cmd = NETMAP_BDG_DELIF; + break; case 'g': nr_cmd = 0; break; @@ -160,6 +220,9 @@ usage: if (optind < argc && argv[optind][0] == '-') name = NULL; break; + case 'C': + nmr_config = strdup(optarg); + break; } if (optind != argc) { // fprintf(stderr, "optind %d argc %d\n", optind, argc); @@ -168,5 +231,5 @@ usage: } if (argc == 1) nr_cmd = NETMAP_BDG_LIST; - return bdg_ctl(name, nr_cmd, nr_arg) ? 1 : 0; + return bdg_ctl(name, nr_cmd, nr_arg, nmr_config) ? 1 : 0; } |