Diffstat (limited to 'sys')
-rw-r--r--  sys/conf/files                   |    7
-rw-r--r--  sys/dev/e1000/if_em.c            |    9
-rw-r--r--  sys/dev/e1000/if_igb.c           |   11
-rw-r--r--  sys/dev/e1000/if_lem.c           |    8
-rw-r--r--  sys/dev/ixgbe/ixgbe.c            |    9
-rw-r--r--  sys/dev/netmap/if_em_netmap.h    |  297
-rw-r--r--  sys/dev/netmap/if_igb_netmap.h   |  324
-rw-r--r--  sys/dev/netmap/if_lem_netmap.h   |  308
-rw-r--r--  sys/dev/netmap/if_re_netmap.h    |  384
-rw-r--r--  sys/dev/netmap/ixgbe_netmap.h    |  525
-rw-r--r--  sys/dev/netmap/netmap.c          | 4258
-rw-r--r--  sys/dev/netmap/netmap_kern.h     | 1077
-rw-r--r--  sys/dev/netmap/netmap_mem2.c     | 1261
-rw-r--r--  sys/dev/re/if_re.c               |    5
-rw-r--r--  sys/modules/netmap/Makefile      |   10
-rw-r--r--  sys/net/netmap.h                 |  651
-rw-r--r--  sys/net/netmap_user.h            |  658
17 files changed, 5423 insertions, 4379 deletions
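
Every driver touched below is converted to the same new internal netmap API: the nm_register/nm_txsync/nm_rxsync callbacks now take a struct netmap_adapter * instead of an ifnet plus do_lock flag, the per-driver lock wrappers disappear (locking moves into the netmap core), and the nr_hwavail counter is replaced by the nr_hwtail ring pointer. As a reading aid, the sketch below shows the shape every txsync routine takes after the conversion, built only from helpers that appear in this patch (kring->rhead, nm_next, nm_prev, nm_kr_txempty, nm_txsync_finalize, NM_CHECK_ADDR_LEN, PNMB); the hw_program_txd()/hw_kick_tx()/hw_read_tdh() calls are hypothetical placeholders for the device-specific descriptor and register accesses, not real functions. The rxsync side mirrors it: first import newly received packets by advancing nr_hwtail, then release user-consumed slots from nr_hwcur up to the head computed by nm_rxsync_prologue(), and finish with nm_rxsync_finalize().

/*
 * Minimal sketch of the post-conversion txsync layout (not a real driver).
 * hw_program_txd(), hw_kick_tx() and hw_read_tdh() stand in for the
 * device-specific parts that each file below implements differently.
 */
static int
foo_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->tx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i;	/* index into the netmap ring */
	u_int nic_i;	/* index into the NIC ring */
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;	/* snapshot taken by the core */

	/* First part: push slots [nr_hwcur .. head) out to the NIC. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		nic_i = netmap_idx_k2n(kring, nm_i);
		while (nm_i != head) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			u_int len = slot->len;
			uint64_t paddr;
			void *addr = PNMB(slot, &paddr);

			NM_CHECK_ADDR_LEN(addr, len);	/* may goto ring_reset */
			hw_program_txd(na, nic_i, paddr, len);	/* placeholder */
			nm_i = nm_next(nm_i, lim);
			nic_i = nm_next(nic_i, lim);
		}
		kring->nr_hwcur = head;
		hw_kick_tx(na, ring_nr, nic_i);	/* placeholder: write TDT */
	}

	/* Second part: reclaim completed buffers, advancing nr_hwtail. */
	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
		nic_i = hw_read_tdh(na, ring_nr);	/* placeholder: read TDH */
		kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
	}

	nm_txsync_finalize(kring);	/* publish hwcur/hwtail to userspace */
	return 0;

ring_reset:
	return netmap_ring_reinit(kring);
}
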
diff --git a/sys/conf/files b/sys/conf/files index 2cf9028..d5615dd 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1910,6 +1910,13 @@ dev/nand/nfc_if.m optional nand dev/ncv/ncr53c500.c optional ncv dev/ncv/ncr53c500_pccard.c optional ncv pccard dev/netmap/netmap.c optional netmap +dev/netmap/netmap_freebsd.c optional netmap +dev/netmap/netmap_generic.c optional netmap +dev/netmap/netmap_mbq.c optional netmap +dev/netmap/netmap_mem2.c optional netmap +dev/netmap/netmap_offloadings.c optional netmap +dev/netmap/netmap_pipe.c optional netmap +dev/netmap/netmap_vale.c optional netmap dev/nge/if_nge.c optional nge dev/nxge/if_nxge.c optional nxge \ compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}" diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index 516e84b..242d877 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -3835,8 +3835,7 @@ em_txeof(struct tx_ring *txr) EM_TX_LOCK_ASSERT(txr); #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, txr->me | - (NETMAP_LOCKED_ENTER | NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, txr->me)) return; #endif /* DEV_NETMAP */ @@ -4352,7 +4351,7 @@ em_initialize_receive_unit(struct adapter *adapter) * preserve the rx buffers passed to userspace. */ if (ifp->if_capenable & IFCAP_NETMAP) - rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail; + rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rdt); } @@ -4431,8 +4430,10 @@ em_rxeof(struct rx_ring *rxr, int count, int *done) EM_RX_LOCK(rxr); #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed)) + if (netmap_rx_irq(ifp, rxr->me, &processed)) { + EM_RX_UNLOCK(rxr); return (FALSE); + } #endif /* DEV_NETMAP */ for (i = rxr->next_to_check, processed = 0; count != 0;) { diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 0873daa..fe5c6ad 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -3961,8 +3961,7 @@ igb_txeof(struct tx_ring *txr) mtx_assert(&txr->tx_mtx, MA_OWNED); #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, txr->me | - (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, txr->me)) return (FALSE); #endif /* DEV_NETMAP */ @@ -4630,13 +4629,13 @@ igb_initialize_receive_units(struct adapter *adapter) * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to - * somthing different from next_to_refresh + * something different from next_to_refresh * (which is not used in netmap mode). 
*/ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = rxr->next_to_refresh - kring->nr_hwavail; + int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; @@ -4826,8 +4825,10 @@ igb_rxeof(struct igb_queue *que, int count, int *done) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed)) + if (netmap_rx_irq(ifp, rxr->me, &processed)) { + IGB_RX_UNLOCK(rxr); return (FALSE); + } #endif /* DEV_NETMAP */ /* Main clean loop */ diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index 3b63f45..8732838 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -2985,7 +2985,7 @@ lem_txeof(struct adapter *adapter) EM_TX_LOCK_ASSERT(adapter); #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, 0 | (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, 0)) return; #endif /* DEV_NETMAP */ if (adapter->num_tx_desc_avail == adapter->num_tx_desc) @@ -3366,7 +3366,7 @@ lem_initialize_receive_unit(struct adapter *adapter) #ifdef DEV_NETMAP /* preserve buffers already made available to clients */ if (ifp->if_capenable & IFCAP_NETMAP) - rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail; + rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl); @@ -3452,8 +3452,10 @@ lem_rxeof(struct adapter *adapter, int count, int *done) BUS_DMASYNC_POSTREAD); #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, 0 | NETMAP_LOCKED_ENTER, &rx_sent)) + if (netmap_rx_irq(ifp, 0, &rx_sent)) { + EM_RX_UNLOCK(adapter); return (FALSE); + } #endif /* DEV_NETMAP */ if (!((current_desc->status) & E1000_RXD_STAT_DD)) { diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c index 581dcc6..5b9fa01 100644 --- a/sys/dev/ixgbe/ixgbe.c +++ b/sys/dev/ixgbe/ixgbe.c @@ -1245,7 +1245,7 @@ ixgbe_init_locked(struct adapter *adapter) if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = na->num_rx_desc - 1 - kring->nr_hwavail; + int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t); } else @@ -3629,8 +3629,7 @@ ixgbe_txeof(struct tx_ring *txr) if (!netmap_mitigate || (kring->nr_kflags < kring->nkr_num_slots && txd[kring->nr_kflags].wb.status & IXGBE_TXD_STAT_DD)) { - netmap_tx_irq(ifp, txr->me | - (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT)); + netmap_tx_irq(ifp, txr->me); } return; } @@ -4422,8 +4421,10 @@ ixgbe_rxeof(struct ix_queue *que) #ifdef DEV_NETMAP /* Same as the txeof routine: wakeup clients on intr. */ - if (netmap_rx_irq(ifp, rxr->me | NETMAP_LOCKED_ENTER, &processed)) + if (netmap_rx_irq(ifp, rxr->me, &processed)) { + IXGBE_RX_UNLOCK(rxr); return (FALSE); + } #endif /* DEV_NETMAP */ for (i = rxr->next_to_check; count != 0;) { diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index 5bfbd3d..17b4c4f 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,7 +26,7 @@ /* * $FreeBSD$ * - * netmap support for em. + * netmap support for: em. 
* * For more details on netmap support please see ixgbe_netmap.h */ @@ -39,39 +39,6 @@ #include <dev/netmap/netmap_kern.h> -static void em_netmap_block_tasks(struct adapter *); -static void em_netmap_unblock_tasks(struct adapter *); - - -static void -em_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid) -{ - struct adapter *adapter = ifp->if_softc; - - ASSERT(queueid < adapter->num_queues); - switch (what) { - case NETMAP_CORE_LOCK: - EM_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - EM_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - EM_TX_LOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_TX_UNLOCK: - EM_TX_UNLOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_RX_LOCK: - EM_RX_LOCK(&adapter->rx_rings[queueid]); - break; - case NETMAP_RX_UNLOCK: - EM_RX_UNLOCK(&adapter->rx_rings[queueid]); - break; - } -} - - // XXX do we need to block/unblock the tasks ? static void em_netmap_block_tasks(struct adapter *adapter) @@ -114,45 +81,31 @@ em_netmap_unblock_tasks(struct adapter *adapter) /* - * Register/unregister routine + * Register/unregister. We are already under netmap lock. */ static int -em_netmap_reg(struct ifnet *ifp, int onoff) +em_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ + EM_CORE_LOCK(adapter); em_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); em_netmap_block_tasks(adapter); - + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; - - em_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - em_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + em_init_locked(adapter); /* also enable intr */ em_netmap_unblock_tasks(adapter); - return (error); + EM_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -160,100 +113,93 @@ fail: * Reconcile kernel and user view of the transmit ring. 
*/ static int -em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - if (do_lock) - EM_TX_LOCK(txr); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &txr->tx_base[l]; - struct em_buffer *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - EM_TX_UNLOCK(txr); - return netmap_ring_reinit(kring); - } + /* device-specific */ + struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; + struct em_buffer *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; + if (nic_i != txr->next_to_clean) { + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; - if (do_lock) - EM_TX_UNLOCK(txr); + nm_txsync_finalize(kring); + return 0; } @@ -262,111 +208,108 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. */ static int -em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - k = ring->cur; - if (k > lim) + if (head > lim) return netmap_ring_reinit(kring); - if (do_lock) - EM_RX_LOCK(rxr); - /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. 
*/ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; - for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &rxr->rx_base[l]; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + + for (n = 0; ; n++) { // XXX no need to count + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->status); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->length); - ring->slot[j].flags = slot_flags; - bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[l].map, + ring->slot[nm_i].len = le16toh(curr->length); + ring->slot[nm_i].flags = slot_flags; + bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[nic_i].map, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; + nm_i = nm_next(nm_i, lim); /* make sure next_to_refresh follows next_to_check */ - rxr->next_to_refresh = l; // XXX - l = (l == lim) ? 0 : l + 1; + rxr->next_to_refresh = nic_i; // XXX + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &rxr->rx_base[l]; - struct em_buffer *rxbuf = &rxr->rx_buffers[l]; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - EM_RX_UNLOCK(rxr); - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; + struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { - curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ + curr->buffer_addr = htole64(paddr); netmap_reload_map(rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? 
lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - EM_RX_UNLOCK(rxr); + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -378,14 +321,14 @@ em_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = em_netmap_txsync; na.nm_rxsync = em_netmap_rxsync; - na.nm_lock = em_netmap_lock_wrapper; na.nm_register = em_netmap_reg; - netmap_attach(&na, adapter->num_queues); + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); } /* end of file */ diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index d4e5dfe..e1929f0 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,76 +37,43 @@ #include <vm/pmap.h> /* vtophys ? */ #include <dev/netmap/netmap_kern.h> - /* - * wrapper to export locks to the generic code + * Adaptation to different versions of the driver. */ -static void -igb_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid) -{ - struct adapter *adapter = ifp->if_softc; - ASSERT(queueid < adapter->num_queues); - switch (what) { - case NETMAP_CORE_LOCK: - IGB_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - IGB_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - IGB_TX_LOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_TX_UNLOCK: - IGB_TX_UNLOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_RX_LOCK: - IGB_RX_LOCK(&adapter->rx_rings[queueid]); - break; - case NETMAP_RX_UNLOCK: - IGB_RX_UNLOCK(&adapter->rx_rings[queueid]); - break; - } -} +#ifndef IGB_MEDIA_RESET +/* at the same time as IGB_MEDIA_RESET was defined, the + * tx buffer descriptor was renamed, so use this to revert + * back to the old name. + */ +#define igb_tx_buf igb_tx_buffer +#endif /* - * register-unregister routine + * Register/unregister. We are already under netmap lock. 
*/ static int -igb_netmap_reg(struct ifnet *ifp, int onoff) +igb_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ + IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; - - igb_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - igb_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } - return (error); + igb_init_locked(adapter); /* also enable intr */ + IGB_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -114,65 +81,59 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + /* 82575 needs the queue index added */ + u32 olinfo_status = + (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - if (do_lock) - IGB_TX_LOCK(txr); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_POSTREAD); + BUS_DMASYNC_POSTREAD); - /* check for new packets to send. - * j indexes the netmap ring, l indexes the nic ring, and - * j = kring->nr_hwcur, l = E1000_TDT (not tracked), - * j == (l + kring->nkr_hwofs) % ring_size + /* + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - /* 82575 needs the queue index added */ - u32 olinfo_status = - (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - union e1000_adv_tx_desc *curr = - (union e1000_adv_tx_desc *)&txr->tx_base[l]; - struct igb_tx_buf *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? 
- E1000_ADVTXD_DCMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - IGB_TX_UNLOCK(txr); - return netmap_ring_reinit(kring); - } + /* device-specific */ + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[nic_i]; + struct igb_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_ADVTXD_DCMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->read.buffer_addr = htole64(paddr); // XXX check olinfo and cmd_type_len curr->read.olinfo_status = @@ -180,51 +141,47 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) (len<< E1000_ADVTXD_PAYLEN_SHIFT)); curr->read.cmd_type_len = htole32(len | E1000_ADVTXD_DTYP_DATA | - E1000_ADVTXD_DCMD_IFCS | - E1000_ADVTXD_DCMD_DEXT | - E1000_ADVTXD_DCMD_EOP | flags); + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + /* make sure changes to the buffer are synced */ bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; /* Set the watchdog XXX ? */ txr->queue_status = IGB_QUEUE_WORKING; txr->watchdog_time = ticks; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; - } - delta = l - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; - if (do_lock) - IGB_TX_UNLOCK(txr); + nm_txsync_finalize(kring); + return 0; } @@ -233,108 +190,106 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. 
*/ static int -igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - k = ring->cur; - if (k > lim) + if (head > lim) return netmap_ring_reinit(kring); - if (do_lock) - IGB_RX_LOCK(rxr); - /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length); - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length); + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; - struct igb_rx_buf *rxbuf = rxr->rx_buffers + l; + /* + * Second part: skip past packets that userspace has released. 
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - IGB_RX_UNLOCK(rxr); - return netmap_ring_reinit(kring); - } + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } - curr->read.pkt_addr = htole64(paddr); curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, - BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - IGB_RX_UNLOCK(rxr); + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -346,13 +301,14 @@ igb_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = igb_netmap_txsync; na.nm_rxsync = igb_netmap_rxsync; - na.nm_lock = igb_netmap_lock_wrapper; na.nm_register = igb_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} + /* end of file */ diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index acef45f..4fce5c9 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,11 +27,12 @@ /* * $FreeBSD$ * - * netmap support for "lem" + * netmap support for: lem * * For details on netmap support please see ixgbe_netmap.h */ + #include <net/netmap.h> #include <sys/selinfo.h> #include <vm/vm.h> @@ -39,47 +40,16 @@ #include <dev/netmap/netmap_kern.h> -static void -lem_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int ringid) -{ - struct adapter *adapter = ifp->if_softc; - - /* only one ring here so ignore the ringid */ - switch (what) { - case NETMAP_CORE_LOCK: - EM_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - EM_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - EM_TX_LOCK(adapter); - break; - case NETMAP_TX_UNLOCK: - EM_TX_UNLOCK(adapter); - break; - case NETMAP_RX_LOCK: - EM_RX_LOCK(adapter); - break; - case NETMAP_RX_UNLOCK: - EM_RX_UNLOCK(adapter); - break; - } -} - - /* - * Register/unregister + * Register/unregister. We are already under netmap lock. */ static int -lem_netmap_reg(struct ifnet *ifp, int onoff) +lem_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - if (na == NULL) - return EINVAL; + EM_CORE_LOCK(adapter); lem_disable_intr(adapter); @@ -91,30 +61,22 @@ lem_netmap_reg(struct ifnet *ifp, int onoff) taskqueue_drain(adapter->tq, &adapter->rxtx_task); taskqueue_drain(adapter->tq, &adapter->link_task); #endif /* !EM_LEGCY_IRQ */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; - lem_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - lem_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + lem_init_locked(adapter); /* also enable intr */ #ifndef EM_LEGACY_IRQ taskqueue_unblock(adapter->tq); // XXX do we need this ? #endif /* !EM_LEGCY_IRQ */ - return (error); + EM_CORE_UNLOCK(adapter); + + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -122,101 +84,90 @@ fail: * Reconcile kernel and user view of the transmit ring. 
*/ static int -lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ - int report_frequency = kring->nkr_num_slots >> 1; + u_int report_frequency = kring->nkr_num_slots >> 1; - /* take a copy of ring->cur now, and never read it again */ - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; - if (do_lock) - EM_TX_LOCK(adapter); bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); + /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &adapter->tx_desc_base[l]; - struct em_buffer *txbuf = &adapter->tx_buffer_area[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - EM_TX_UNLOCK(adapter); - return netmap_ring_reinit(kring); - } + /* device-specific */ + struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->txtag, txbuf->map, addr); curr->buffer_addr = htole64(paddr); - slot->flags &= ~NS_BUF_CHANGED; + netmap_reload_map(adapter->txtag, txbuf->map, addr); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; - curr->lower.data = - htole32( adapter->txd_cmd | len | + curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); - bus_dmamap_sync(adapter->txtag, txbuf->map, - BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + kring->last_reclaim = ticks; /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("bad TDH %d", l); - l -= kring->nkr_num_slots; - } - delta = l - adapter->next_tx_to_clean; - if (delta) { - /* some tx completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - adapter->next_tx_to_clean = l; - kring->nr_hwavail += delta; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } + adapter->next_tx_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; - if (do_lock) - EM_TX_UNLOCK(adapter); + nm_txsync_finalize(kring); + return 0; } @@ -225,37 +176,39 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. */ static int -lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + /* device-specific */ + struct adapter *adapter = ifp->if_softc; - if (k > lim) + if (head > lim) return netmap_ring_reinit(kring); - if (do_lock) - EM_RX_LOCK(adapter); - /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. 
*/ - l = adapter->next_rx_desc_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = adapter->next_rx_desc_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; uint32_t staterr = le32toh(curr->status); int len; @@ -263,78 +216,77 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) break; len = le16toh(curr->length) - 4; // CRC if (len < 0) { - D("bogus pkt size at %d", j); + D("bogus pkt size %d nic idx %d", len, nic_i); len = 0; } - ring->slot[j].len = len; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(adapter->rxtag, - adapter->rx_buffer_area[l].map, - BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + adapter->rx_buffer_area[nic_i].map, + BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - adapter->next_rx_desc_to_check = l; - kring->nr_hwavail += n; + ND("%d new packets at nic %d nm %d tail %d", + n, + adapter->next_rx_desc_to_check, + netmap_idx_n2k(kring, adapter->next_rx_desc_to_check), + kring->nr_hwtail); + adapter->next_rx_desc_to_check = nic_i; + // ifp->if_ipackets += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; - struct em_buffer *rxbuf = &adapter->rx_buffer_area[l]; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - EM_RX_UNLOCK(adapter); - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->rxtag, rxbuf->map, addr); curr->buffer_addr = htole64(paddr); + netmap_reload_map(adapter->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; - bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? 
lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - EM_RX_UNLOCK(adapter); + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -346,14 +298,14 @@ lem_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = lem_netmap_txsync; na.nm_rxsync = lem_netmap_rxsync; - na.nm_lock = lem_netmap_lock_wrapper; na.nm_register = lem_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } /* end of file */ diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index f0f1f19..10abe4f 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,8 +26,9 @@ /* * $FreeBSD$ * - * netmap support for "re" - * For details on netmap support please see ixgbe_netmap.h + * netmap support for: re + * + * For more details on netmap support please see ixgbe_netmap.h */ @@ -39,71 +40,24 @@ /* - * wrapper to export locks to the generic code - * We should not use the tx/rx locks - */ -static void -re_netmap_lock_wrapper(struct ifnet *ifp, int what, u_int queueid) -{ - struct rl_softc *adapter = ifp->if_softc; - - switch (what) { - case NETMAP_CORE_LOCK: - RL_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - RL_UNLOCK(adapter); - break; - - case NETMAP_TX_LOCK: - case NETMAP_RX_LOCK: - case NETMAP_TX_UNLOCK: - case NETMAP_RX_UNLOCK: - D("invalid lock call %d, no tx/rx locks here", what); - break; - } -} - - -/* - * support for netmap register/unregisted. We are already under core lock. - * only called on the first register or the last unregister. + * Register/unregister. We are already under netmap lock. */ static int -re_netmap_reg(struct ifnet *ifp, int onoff) +re_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct rl_softc *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; - /* Tell the stack that the interface is no longer active */ - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); - - re_stop(adapter); + RL_LOCK(adapter); + re_stop(adapter); /* also clears IFF_DRV_RUNNING */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit to restore it later */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; - - re_init_locked(adapter); - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - re_init_locked(adapter); /* also enables intr */ + nm_clear_native_flags(na); } - return (error); + re_init_locked(adapter); /* also enables intr */ + RL_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 
0 : 1); } @@ -111,97 +65,102 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, k, l, n, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; - if (do_lock) - RL_LOCK(sc); + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - /* Sync the TX descriptor list */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); - - /* XXX move after the transmissions */ - /* record completed transmissions */ - for (n = 0, l = sc->rl_ldata.rl_tx_considx; - l != sc->rl_ldata.rl_tx_prodidx; - n++, l = RL_TX_DESC_NXT(sc, l)) { - uint32_t cmdstat = - le32toh(sc->rl_ldata.rl_tx_list[l].rl_cmdstat); - if (cmdstat & RL_TDESC_STAT_OWN) - break; - } - if (n > 0) { - sc->rl_ldata.rl_tx_considx = l; - sc->rl_ldata.rl_tx_free += n; - kring->nr_hwavail += n; - } + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ? - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + /* + * First part: process new packets to send. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = sc->rl_ldata.rl_tx_prodidx; + // XXX or netmap_idx_k2n(kring, nm_i); + + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + uint64_t paddr; + void *addr = PNMB(slot, &paddr); - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = sc->rl_ldata.rl_tx_prodidx; - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[l]; + /* device-specific */ + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[nic_i]; int cmd = slot->len | RL_TDESC_CMD_EOF | RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; - uint64_t paddr; - void *addr = PNMB(slot, &paddr); - int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - if (do_lock) - RL_UNLOCK(sc); - // XXX what about prodidx ? - return netmap_ring_reinit(kring); - } + NM_CHECK_ADDR_LEN(addr, len); - if (l == lim) /* mark end of ring */ + if (nic_i == lim) /* mark end of ring */ cmd |= RL_TDESC_CMD_EOR; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - /* buffer has changed, unload and reload map */ netmap_reload_map(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, addr); - slot->flags &= ~NS_BUF_CHANGED; + txd[nic_i].tx_dmamap, addr); } - slot->flags &= ~NS_REPORT; + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ desc->rl_cmdstat = htole32(cmd); + + /* make sure changes to the buffer are synced */ bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 
0 : j + 1; - l = (l == lim) ? 0 : l + 1; + txd[nic_i].tx_dmamap, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - sc->rl_ldata.rl_tx_prodidx = l; - kring->nr_hwcur = k; /* the saved ring->cur */ - ring->avail -= n; // XXX see others - kring->nr_hwavail = ring->avail; + sc->rl_ldata.rl_tx_prodidx = nic_i; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); /* start ? */ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); } - if (do_lock) - RL_UNLOCK(sc); + + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + nic_i = sc->rl_ldata.rl_tx_considx; + for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx; + n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[nic_i].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = nic_i; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); + } + } + + nm_txsync_finalize(kring); + return 0; } @@ -210,44 +169,46 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) * Reconcile kernel and user view of the receive ring. */ static int -re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - k = ring->cur; - if (k > lim) + if (head > lim) return netmap_ring_reinit(kring); - if (do_lock) - RL_LOCK(sc); - /* XXX check sync modes */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, - sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. * - * The device uses all the buffers in the ring, so we need + * This device uses all the buffers in the ring, so we need * another termination condition in addition to RL_RDESC_STAT_OWN - * cleared (all buffers could have it cleared. The easiest one - * is to limit the amount of data reported up to 'lim' + * cleared (all buffers could have it cleared). The easiest one + * is to stop right before nm_hwcur. 
*/ - l = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ - j = netmap_idx_n2k(kring, l); /* the kring index */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + uint32_t stop_i = nm_prev(kring->nr_hwcur, lim); - for (n = kring->nr_hwavail; n < lim ; n++) { - struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[l]; + nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ + nm_i = netmap_idx_n2k(kring, nic_i); + + while (nm_i != stop_i) { + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); uint32_t total_len; @@ -256,78 +217,72 @@ re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) total_len = rxstat & sc->rl_rxlenmask; /* XXX subtract crc */ total_len = (total_len < 4) ? 0 : total_len - 4; - kring->ring->slot[j].len = total_len; - kring->ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = total_len; + ring->slot[nm_i].flags = slot_flags; /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; - } - if (n != kring->nr_hwavail) { - sc->rl_ldata.rl_rx_prodidx = l; - sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; - kring->nr_hwavail = n; + rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD); + // sc->rl_ifp->if_ipackets++; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } + sc->rl_ldata.rl_rx_prodidx = nic_i; + kring->nr_hwtail = nm_i; kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* the NIC index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[l]; - int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - RL_UNLOCK(sc); - return netmap_ring_reinit(kring); - } + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[nic_i]; + int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; - if (l == lim) /* mark end of ring */ + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; + + if (nic_i == lim) /* mark end of ring */ cmd |= RL_RDESC_CMD_EOR; - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { - netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, addr); + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[nic_i].rx_dmamap, addr); slot->flags &= ~NS_BUF_CHANGED; } desc->rl_cmdstat = htole32(cmd); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + rxd[nic_i].rx_dmamap, + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - /* Flush the RX DMA ring */ + kring->nr_hwcur = head; bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - RL_UNLOCK(sc); + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } + /* * Additional routines to init the tx and rx rings. * In other drivers we do that inline in the main code. @@ -339,11 +294,16 @@ re_netmap_tx_init(struct rl_softc *sc) struct rl_desc *desc; int i, n; struct netmap_adapter *na = NA(sc->rl_ifp); - struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + struct netmap_slot *slot; + if (!na || !(na->na_flags & NAF_NATIVE_ON)) { + return; + } + + slot = netmap_reset(na, NR_TX, 0, 0); /* slot is NULL if we are not in netmap mode */ if (!slot) - return; + return; // XXX cannot happen /* in netmap mode, overwrite addresses and maps */ txd = sc->rl_ldata.rl_tx_desc; desc = sc->rl_ldata.rl_tx_list; @@ -369,36 +329,35 @@ re_netmap_rx_init(struct rl_softc *sc) struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); struct rl_desc *desc = sc->rl_ldata.rl_rx_list; uint32_t cmdstat; - int i, n, max_avail; + uint32_t nic_i, max_avail; + uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt; if (!slot) return; - n = sc->rl_ldata.rl_rx_desc_cnt; /* - * Userspace owned hwavail packets before the reset, - * so the NIC that last hwavail descriptors of the ring - * are still owned by the driver (and keep one empty). + * Do not release the slots owned by userspace, + * and also keep one empty. 
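 *
 * "Slots owned by userspace" is exactly what nm_kr_rxspace() returns:
 * the distance from nr_hwcur to nr_hwtail. A sketch, assuming it
 * matches the netmap_kern.h helper also used by the e1000/ixgbe
 * RDT adjustments above:
 *
 *	static inline uint32_t
 *	nm_kr_rxspace(struct netmap_kring *k)
 *	{
 *		int space = k->nr_hwtail - k->nr_hwcur;
 *
 *		if (space < 0)			// wrapped around
 *			space += k->nkr_num_slots;
 *		return space;	// rx slots not yet returned by userspace
 *	}
 *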
*/ - max_avail = n - 1 - na->rx_rings[0].nr_hwavail; - for (i = 0; i < n; i++) { + max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]); + for (nic_i = 0; nic_i < n; nic_i++) { void *addr; uint64_t paddr; - int l = netmap_idx_n2k(&na->rx_rings[0], i); + uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i); - addr = PNMB(slot + l, &paddr); + addr = PNMB(slot + nm_i, &paddr); netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD); - desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); - desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD); + desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); cmdstat = NETMAP_BUF_SIZE; - if (i == n - 1) /* mark the end of ring */ + if (nic_i == n - 1) /* mark the end of ring */ cmdstat |= RL_RDESC_CMD_EOR; - if (i < max_avail) + if (nic_i < max_avail) cmdstat |= RL_RDESC_CMD_OWN; - desc[i].rl_cmdstat = htole32(cmdstat); + desc[nic_i].rl_cmdstat = htole32(cmdstat); } } @@ -411,13 +370,14 @@ re_netmap_attach(struct rl_softc *sc) bzero(&na, sizeof(na)); na.ifp = sc->rl_ifp; - na.separate_locks = 0; + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = sc->rl_ldata.rl_tx_desc_cnt; na.num_rx_desc = sc->rl_ldata.rl_rx_desc_cnt; na.nm_txsync = re_netmap_txsync; na.nm_rxsync = re_netmap_rxsync; - na.nm_lock = re_netmap_lock_wrapper; na.nm_register = re_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } + /* end of file */ diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index be79050..a617cc4 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,16 +26,16 @@ /* * $FreeBSD$ * - * netmap modifications for ixgbe + * netmap support for: ixgbe * * This file is meant to be a reference on how to implement * netmap support for a network driver. - * This file contains code but only static or inline functions - * that are used by a single driver. To avoid replication of - * code we just #include it near the beginning of the - * standard driver. + * This file contains code but only static or inline functions used + * by a single driver. To avoid replication of code we just #include + * it near the beginning of the standard driver. */ + #include <net/netmap.h> #include <sys/selinfo.h> /* @@ -48,7 +48,10 @@ */ #include <dev/netmap/netmap_kern.h> + /* + * device-specific sysctl variables: + * * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, @@ -56,53 +59,16 @@ * * ix_rx_miss, ix_rx_miss_bufs: * count packets that might be missed due to lost interrupts. - * - * ix_use_dd - * use the dd bit for completed tx transmissions. - * This is tricky, much better to use TDH for now. 
*/ SYSCTL_DECL(_dev_netmap); -static int ix_rx_miss, ix_rx_miss_bufs, ix_use_dd, ix_crcstrip; +static int ix_rx_miss, ix_rx_miss_bufs, ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); -SYSCTL_INT(_dev_netmap, OID_AUTO, ix_use_dd, - CTLFLAG_RW, &ix_use_dd, 0, "use dd instead of tdh to detect tx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs"); -/* - * wrapper to export locks to the generic netmap code. - */ -static void -ixgbe_netmap_lock_wrapper(struct ifnet *_a, int what, u_int queueid) -{ - struct adapter *adapter = _a->if_softc; - - ASSERT(queueid < adapter->num_queues); - switch (what) { - case NETMAP_CORE_LOCK: - IXGBE_CORE_LOCK(adapter); - break; - case NETMAP_CORE_UNLOCK: - IXGBE_CORE_UNLOCK(adapter); - break; - case NETMAP_TX_LOCK: - IXGBE_TX_LOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_TX_UNLOCK: - IXGBE_TX_UNLOCK(&adapter->tx_rings[queueid]); - break; - case NETMAP_RX_LOCK: - IXGBE_RX_LOCK(&adapter->rx_rings[queueid]); - break; - case NETMAP_RX_UNLOCK: - IXGBE_RX_UNLOCK(&adapter->rx_rings[queueid]); - break; - } -} - static void set_crcstrip(struct ixgbe_hw *hw, int onoff) @@ -141,342 +107,275 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } + /* - * Register/unregister. We are already under core lock. + * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int -ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ - ixgbe_disable_intr(adapter); + IXGBE_CORE_LOCK(adapter); + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); set_crcstrip(&adapter->hw, onoff); - if (onoff) { /* enable netmap mode */ - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit and replace with our routine */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_start; - - /* - * reinitialize the adapter, now with netmap flag set, - * so the rings will be set accordingly. - */ - ixgbe_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } - } else { /* reset normal mode (explicit request or netmap failed) */ -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - /* initialize the card, this time in standard mode */ - ixgbe_init_locked(adapter); /* also enables intr */ + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); + } else { + nm_clear_native_flags(na); } - set_crcstrip(&adapter->hw, onoff); - return (error); + ixgbe_init_locked(adapter); /* also enables intr */ + set_crcstrip(&adapter->hw, onoff); // XXX why twice ? + IXGBE_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. - * This routine might be called frequently so it must be efficient. 
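 *
 * (Note on the reg routine above: the if_transmit and IFCAP_NETMAP
 * switching that each driver used to open-code is now centralized in
 * nm_set_native_flags()/nm_clear_native_flags(). A simplified sketch
 * of the FreeBSD side, assuming it matches netmap_kern.h:
 *
 *	static inline void
 *	nm_set_native_flags(struct netmap_adapter *na)
 *	{
 *		struct ifnet *ifp = na->ifp;
 *
 *		na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON);
 *		ifp->if_capenable |= IFCAP_NETMAP;
 *		na->if_transmit = ifp->if_transmit;	// save
 *		ifp->if_transmit = netmap_transmit;	// intercept
 *	}
 *
 * nm_clear_native_flags() undoes the same three steps.)
 *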
- * - * ring->cur holds the userspace view of the current ring index. Userspace - * has filled the tx slots from the previous call's ring->cur up to but not - * including ring->cur for this call. In this function the kernel updates - * kring->nr_hwcur to ring->cur, thus slots [kring->nr_hwcur, ring->cur) are - * now ready to transmit. At the last interrupt kring->nr_hwavail slots were - * available. * - * This function runs under lock (acquired from the caller or internally). - * It must first update ring->avail to what the kernel knows, - * subtract the newly used slots (ring->cur - kring->nr_hwcur) - * from both avail and nr_hwavail, and set ring->nr_hwcur = ring->cur - * issuing a dmamap_sync on all slots. + * All information is in the kring. + * Userspace wants to send packets up to the one before kring->rhead, + * kernel knows kring->nr_hwcur is the first unsent packet. * - * Since ring comes from userspace, its content must be read only once, - * and validated before being used to update the kernel's structures. - * (this is also true for every use of ring in the kernel). + * Here we push packets out (as many as possible), and possibly + * reclaim buffers from previously completed transmission. * - * ring->avail is never used, only checked for bogus values. - * - * do_lock is set iff the function is called from the ioctl handler. - * In this case, grab a lock around the body, and also reclaim transmitted - * buffers irrespective of interrupt mitigation. + * The caller (netmap) guarantees that there is only one instance + * running at any time. Any interference with other driver + * methods should be handled by the individual drivers. */ static int -ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n = 0; - u_int const k = ring->cur, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* - * ixgbe can generate an interrupt on every tx packet, but it - * seems very expensive, so we interrupt once every half ring, - * or when requested with NS_REPORT + * interrupts on every tx packet are expensive so request + * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; - if (k > lim) - return netmap_ring_reinit(kring); - if (do_lock) - IXGBE_TX_LOCK(txr); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + int reclaim_tx; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. + * nm_i is the current index in the netmap ring, + * nic_i is the corresponding index in the NIC ring. * The two numbers differ because upon a *_init() we reset * the NIC ring but leave the netmap ring unchanged. 
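 *
 * The translation between the two index spaces is done by
 * netmap_idx_n2k() and netmap_idx_k2n(), which add or subtract
 * nkr_hwofs modulo the ring size; a sketch of the first, assuming it
 * matches netmap_kern.h:
 *
 *	static inline int
 *	netmap_idx_n2k(struct netmap_kring *kr, int idx)
 *	{
 *		int n = kr->nkr_num_slots;
 *
 *		idx += kr->nkr_hwofs;	// NIC index -> kring index
 *		if (idx < 0)
 *			return idx + n;
 *		else if (idx < n)
 *			return idx;
 *		else
 *			return idx - n;
 *	}
 *
 * netmap_idx_k2n() is identical except that it subtracts nkr_hwofs.
 *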
* For the transmit ring, we have * - * j = kring->nr_hwcur - * l = IXGBE_TDT (not tracked in the driver) + * nm_i = kring->nr_hwcur + * nic_i = IXGBE_TDT (not tracked in the driver) * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * In this driver kring->nkr_hwofs >= 0, but for other * drivers it might be negative as well. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - prefetch(&ring->slot[j]); - l = netmap_idx_k2n(kring, j); /* NIC index */ - prefetch(&txr->tx_buffers[l]); - for (n = 0; j != k; n++) { - /* - * Collect per-slot info. - * Note that txbuf and curr are indexed by l. - * - * In this driver we collect the buffer address - * (using the PNMB() macro) because we always - * need to rewrite it into the NIC ring. - * Many other drivers preserve the address, so - * we only need to access it if NS_BUF_CHANGED - * is set. - * XXX note, on this device the dmamap* calls are - * not necessary because tag is 0, however just accessing - * the per-packet tag kills 1Mpps at 900 MHz. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_tx_desc *curr = &txr->tx_base[l]; - struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[l]; - uint64_t paddr; - // XXX type for flags and len ? - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - IXGBE_TXD_CMD_RS : 0; + + /* + * If we have packets to send (kring->nr_hwcur != kring->rhead) + * iterate over the netmap ring, fetch length and update + * the corresponding slot in the NIC ring. Some drivers also + * need to update the buffer's physical address in the NIC slot + * even NS_BUF_CHANGED is not set (PNMB computes the addresses). + * + * The netmap_reload_map() calls is especially expensive, + * even when (as in this case) the tag is 0, so do only + * when the buffer has actually changed. + * + * If possible do not set the report/intr bit on all slots, + * but only a few times per ring or when NS_REPORT is set. + * + * Finally, on 10G and faster drivers, it might be useful + * to prefetch the next slot and txr entry. + */ + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + + __builtin_prefetch(&ring->slot[nm_i]); + __builtin_prefetch(&txr->tx_buffers[nic_i]); + + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; + uint64_t paddr; void *addr = PNMB(slot, &paddr); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; - prefetch(&ring->slot[j]); - prefetch(&txr->tx_buffers[l]); - - /* - * Quick check for valid addr and len. - * NMB() returns netmap_buffer_base for invalid - * buffer indexes (but the address is still a - * valid one to be used in a ring). slot->len is - * unsigned so no need to check for negative values. - */ - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { -ring_reset: - if (do_lock) - IXGBE_TX_UNLOCK(txr); - return netmap_ring_reinit(kring); - } + /* device-specific */ + union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i]; + struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? 
+ IXGBE_TXD_CMD_RS : 0; + + /* prefetch for next round */ + __builtin_prefetch(&ring->slot[nm_i + 1]); + __builtin_prefetch(&txr->tx_buffers[nic_i + 1]); + + NM_CHECK_ADDR_LEN(addr, len); if (slot->flags & NS_BUF_CHANGED) { - /* buffer has changed, unload and reload map */ + /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } - slot->flags &= ~NS_REPORT; - /* - * Fill the slot in the NIC ring. - * In this driver we need to rewrite the buffer - * address in the NIC ring. Other drivers do not - * need this. - * Use legacy descriptor, it is faster. - */ + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ + /* Use legacy descriptor, they are faster? */ curr->read.buffer_addr = htole64(paddr); curr->read.olinfo_status = 0; curr->read.cmd_type_len = htole32(len | flags | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP); /* make sure changes to the buffer are synced */ - bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - /* decrease avail by number of packets sent */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* (re)start the transmitter up to slot l (excluded) */ - IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), l); + + /* (re)start the tx unit up to slot nic_i (excluded) */ + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), nic_i); } /* - * Reclaim buffers for completed transmissions. + * Second part: reclaim buffers for completed transmissions. * Because this is expensive (we read a NIC register etc.) * we only do it in specific cases (see below). - * In all cases kring->nr_kflags indicates which slot will be - * checked upon a tx interrupt (nkr_num_slots means none). */ - if (do_lock) { - j = 1; /* forced reclaim, ignore interrupts */ - kring->nr_kflags = kring->nkr_num_slots; - } else if (kring->nr_hwavail > 0) { - j = 0; /* buffers still available: no reclaim, ignore intr. */ - kring->nr_kflags = kring->nkr_num_slots; + if (flags & NAF_FORCE_RECLAIM) { + reclaim_tx = 1; /* forced reclaim */ + } else if (!nm_kr_txempty(kring)) { + reclaim_tx = 0; /* have buffers, no reclaim */ } else { /* - * no buffers available, locate a slot for which we request - * ReportStatus (approximately half ring after next_to_clean) - * and record it in kring->nr_kflags. - * If the slot has DD set, do the reclaim looking at TDH, - * otherwise we go to sleep (in netmap_poll()) and will be - * woken up when slot nr_kflags will be ready. + * No buffers available. Locate previous slot with + * REPORT_STATUS set. + * If the slot has DD set, we can reclaim space, + * otherwise wait for the next interrupt. + * This enables interrupt moderation on the tx + * side though it might reduce throughput. */ struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base; - j = txr->next_to_clean + kring->nkr_num_slots/2; - if (j >= kring->nkr_num_slots) - j -= kring->nkr_num_slots; + nic_i = txr->next_to_clean + report_frequency; + if (nic_i > lim) + nic_i -= lim + 1; // round to the closest with dd set - j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ? + nic_i = (nic_i < kring->nkr_num_slots / 4 || + nic_i >= kring->nkr_num_slots*3/4) ? 
0 : report_frequency; - kring->nr_kflags = j; /* the slot to check */ - j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? + reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? } - if (j) { - int delta; - + if (reclaim_tx) { /* * Record completed transmissions. * We (re)use the driver's txr->next_to_clean to keep * track of the most recently completed transmission. * - * The datasheet discourages the use of TDH to find out the - * number of sent packets. We should rather check the DD - * status bit in a packet descriptor. However, we only set - * the "report status" bit for some descriptors (a kind of - * interrupt mitigation), so we can only check on those. - * For the time being we use TDH, as we do it infrequently - * enough not to pose performance problems. + * The datasheet discourages the use of TDH to find + * out the number of sent packets, but we only set + * REPORT_STATUS in a few slots so TDH is the only + * good way. */ - if (ix_use_dd) { - struct ixgbe_legacy_tx_desc *txd = - (struct ixgbe_legacy_tx_desc *)txr->tx_base; - u_int k1 = netmap_idx_k2n(kring, kring->nr_hwcur); - l = txr->next_to_clean; - delta = 0; - while (l != k1 && - txd[l].upper.fields.status & IXGBE_TXD_STAT_DD) { - delta++; - l = (l == lim) ? 0 : l + 1; - } - } else { - l = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can happen */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; - } - if (delta) { + if (nic_i != txr->next_to_clean) { /* some tx completed, increment avail */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; - if (kring->nr_hwavail > lim) - goto ring_reset; + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; - if (do_lock) - IXGBE_TX_UNLOCK(txr); + nm_txsync_finalize(kring); + return 0; } /* * Reconcile kernel and user view of the receive ring. - * Same as for the txsync, this routine must be efficient and - * avoid races in accessing the shared regions. + * Same as for the txsync, this routine must be efficient. + * The caller guarantees a single invocations, but races against + * the rest of the driver should be handled here. * - * When called, userspace has read data from slots kring->nr_hwcur - * up to ring->cur (excluded). + * On call, kring->rhead is the first packet that userspace wants + * to keep, and kring->rcur is the wakeup point. + * The kernel has previously reported packets up to kring->rtail. * - * The last interrupt reported kring->nr_hwavail slots available - * after kring->nr_hwcur. - * We must subtract the newly consumed slots (cur - nr_hwcur) - * from nr_hwavail, make the descriptors available for the next reads, - * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail. - * - * do_lock has a special meaning: please refer to txsync. + * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective + * of whether or not we received an interrupt. 
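 *
 * Note that ring->avail and ring->reserved are gone from this
 * protocol: once the kernel publishes a new tail, userspace owns the
 * slots in [head, tail) and derives the count itself, e.g. with the
 * helper assumed to ship in the updated netmap_user.h:
 *
 *	static inline uint32_t
 *	nm_ring_space(struct netmap_ring *ring)
 *	{
 *		int ret = ring->tail - ring->cur;
 *
 *		if (ret < 0)
 *			ret += ring->num_slots;
 *		return ret;	// slots usable by the application
 *	}
 *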
*/ static int -ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; - int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - if (k > lim) + if (head > lim) return netmap_ring_reinit(kring); - if (do_lock) - IXGBE_RX_LOCK(rxr); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * First part, import newly received packets into the netmap ring. + * First part: import newly received packets. * - * j is the index of the next free slot in the netmap ring, - * and l is the index of the next received packet in the NIC ring, + * nm_i is the index of the next free slot in the netmap ring, + * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * - * j = (kring->nr_hwcur + kring->nr_hwavail) % ring_size - * l = rxr->next_to_check; + * nic_i = rxr->next_to_check; + * nm_i = kring->nr_hwtail (previous) * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_to_check is set to 0 on a ring reinit */ @@ -484,21 +383,21 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) int crclen = ix_crcstrip ? 0 : 4; uint16_t slot_flags = kring->nkr_slot_flags; - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); + nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail) + nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & IXGBE_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length) - crclen; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { @@ -506,48 +405,36 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) ix_rx_miss ++; ix_rx_miss_bufs += n; } - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } /* - * Skip past packets that userspace has released - * (from kring->nr_hwcur to ring->cur - ring->reserved excluded), + * Second part: skip past packets that userspace has released. + * (kring->nr_hwcur to kring->rhead excluded), * and make the buffers available for reception. - * As usual j is the index in the netmap ring, l is the index - * in the NIC ring, and j == (l + kring->nkr_hwofs) % ring_size + * As usual nm_i is the index in the netmap ring, + * nic_i is the index in the NIC ring, and + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* collect per-slot info, with similar validations - * and flag handling as in the txsync code. - * - * NOTE curr and rxbuf are indexed by l. - * Also, this driver needs to update the physical - * address in the NIC ring, but other drivers - * may not have this requirement. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; - struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[l]; + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } @@ -555,29 +442,27 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move nic_i back by one unit */ - l = (l == 0) ? 
lim : l - 1; - IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - if (do_lock) - IXGBE_RX_UNLOCK(rxr); + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; ring_reset: - if (do_lock) - IXGBE_RX_UNLOCK(rxr); return netmap_ring_reinit(kring); } @@ -597,14 +482,14 @@ ixgbe_netmap_attach(struct adapter *adapter) bzero(&na, sizeof(na)); na.ifp = adapter->ifp; - na.separate_locks = 1; /* this card has separate rx/tx locks */ + na.na_flags = NAF_BDG_MAYSLEEP; na.num_tx_desc = adapter->num_tx_desc; na.num_rx_desc = adapter->num_rx_desc; na.nm_txsync = ixgbe_netmap_txsync; na.nm_rxsync = ixgbe_netmap_rxsync; - na.nm_lock = ixgbe_netmap_lock_wrapper; na.nm_register = ixgbe_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} /* end of file */ diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 139eb89..6fd8028 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -8,7 +8,7 @@ * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -23,9 +23,10 @@ * SUCH DAMAGE. */ -#define NM_BRIDGE /* + * $FreeBSD$ + * * This module supports memory mapped access to network devices, * see netmap(4). * @@ -52,57 +53,151 @@ * packets on the output interface. * 6. select() or poll() can be used to wait for events on individual * transmit or receive queues (or all queues for a given interface). - */ + * -#ifdef linux -#include "bsd_glue.h" -static netdev_tx_t linux_netmap_start(struct sk_buff *skb, struct net_device *dev); -#endif /* linux */ + SYNCHRONIZATION (USER) + +The netmap rings and data structures may be shared among multiple +user threads or even independent processes. +Any synchronization among those threads/processes is delegated +to the threads themselves. Only one thread at a time can be in +a system call on the same netmap ring. The OS does not enforce +this and only guarantees against system crashes in case of +invalid usage. + + LOCKING (INTERNAL) + +Within the kernel, access to the netmap rings is protected as follows: + +- a spinlock on each ring, to handle producer/consumer races on + RX rings attached to the host stack (against multiple host + threads writing from the host stack to the same ring), + and on 'destination' rings attached to a VALE switch + (i.e. RX rings in VALE ports, and TX rings in NIC/host ports) + protecting multiple active senders for the same destination) + +- an atomic variable to guarantee that there is at most one + instance of *_*xsync() on the ring at any time. 
+ For rings connected to user file + descriptors, an atomic_test_and_set() protects this, and the + lock on the ring is not actually used. + For NIC RX rings connected to a VALE switch, an atomic_test_and_set() + is also used to prevent multiple executions (the driver might indeed + already guarantee this). + For NIC TX rings connected to a VALE switch, the lock arbitrates + access to the queue (both when allocating buffers and when pushing + them out). + +- *xsync() should be protected against initializations of the card. + On FreeBSD most devices have the reset routine protected by + a RING lock (ixgbe, igb, em) or core lock (re). lem is missing + the RING protection on rx_reset(), this should be added. + + On linux there is an external lock on the tx path, which probably + also arbitrates access to the reset routine. XXX to be revised + +- a per-interface core_lock protecting access from the host stack + while interfaces may be detached from netmap mode. + XXX there should be no need for this lock if we detach the interfaces + only while they are down. + + +--- VALE SWITCH --- + +NMG_LOCK() serializes all modifications to switches and ports. +A switch cannot be deleted until all ports are gone. + +For each switch, an SX lock (RWlock on linux) protects +deletion of ports. When configuring or deleting a new port, the +lock is acquired in exclusive mode (after holding NMG_LOCK). +When forwarding, the lock is acquired in shared mode (without NMG_LOCK). +The lock is held throughout the entire forwarding cycle, +during which the thread may incur in a page fault. +Hence it is important that sleepable shared locks are used. + +On the rx ring, the per-port lock is grabbed initially to reserve +a number of slot in the ring, then the lock is released, +packets are copied from source to destination, and then +the lock is acquired again and the receive ring is updated. +(A similar thing is done on the tx ring for NIC and host stack +ports attached to the switch) -#ifdef __APPLE__ -#include "osx_glue.h" -#endif /* __APPLE__ */ + */ -#ifdef __FreeBSD__ -#include <sys/cdefs.h> /* prerequisite */ -__FBSDID("$FreeBSD$"); +/* + * OS-specific code that is used only within this file. 
+ * Other OS-specific code that must be accessed by drivers + * is present in netmap_kern.h + */ +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ #include <sys/types.h> -#include <sys/module.h> #include <sys/errno.h> #include <sys/param.h> /* defines used in kernel.h */ -#include <sys/jail.h> #include <sys/kernel.h> /* types used in module initialization */ -#include <sys/conf.h> /* cdevsw struct */ -#include <sys/uio.h> /* uio struct */ +#include <sys/conf.h> /* cdevsw struct, UID, GID */ +#include <sys/filio.h> /* FIONBIO */ #include <sys/sockio.h> #include <sys/socketvar.h> /* struct socket */ #include <sys/malloc.h> -#include <sys/mman.h> /* PROT_EXEC */ #include <sys/poll.h> -#include <sys/proc.h> #include <sys/rwlock.h> -#include <vm/vm.h> /* vtophys */ -#include <vm/pmap.h> /* vtophys */ #include <sys/socket.h> /* sockaddrs */ -#include <machine/bus.h> #include <sys/selinfo.h> #include <sys/sysctl.h> +#include <sys/jail.h> +#include <net/vnet.h> #include <net/if.h> +#include <net/if_var.h> #include <net/bpf.h> /* BIOCIMMEDIATE */ -#include <net/vnet.h> #include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/endian.h> +#include <sys/refcount.h> -MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); -#endif /* __FreeBSD__ */ +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +#define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) + +void freebsd_selwakeup(struct selinfo *si, int pri); +#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) + +#elif defined(linux) + +#include "bsd_glue.h" + + + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ #include <net/netmap.h> #include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + + +MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); -/* XXX the following variables must be deprecated and included in nm_mem */ +/* + * The following variables are used by the drivers and replicate + * fields in the global memory pool. They only refer to buffers + * used by physical interfaces. + */ u_int netmap_total_buffers; u_int netmap_buf_size; -char *netmap_buffer_base; /* address of an invalid buffer */ +char *netmap_buffer_base; /* also address of an invalid buffer */ /* user-controlled variables */ int netmap_verbose; @@ -123,322 +218,182 @@ int netmap_txsync_retry = 2; SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); -int netmap_drop = 0; /* debugging */ int netmap_flags = 0; /* debug flags */ int netmap_fwd = 0; /* force transparent mode */ - -SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); -SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); -SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); - -#ifdef NM_BRIDGE /* support for netmap virtual switch, called VALE */ +int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ /* - * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" - * NM_MAXPORTS number of ports - * NM_BRIDGES max number of switches in the system. - * XXX should become a sysctl or tunable - * - * Switch ports are named valeX:Y where X is the switch name and Y - * is the port. If Y matches a physical interface name, the port is - * connected to a physical device. 
- * - * Unlike physical interfaces, switch ports use their own memory region - * for rings and buffers. - * The virtual interfaces use per-queue lock instead of core lock. - * In the tx loop, we aggregate traffic in batches to make all operations - * faster. The batch size is NM_BDG_BATCH + * netmap_admode selects the netmap mode to use. + * Invalid values are reset to NETMAP_ADMODE_BEST */ -#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ -#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ -#define NM_BDG_HASH 1024 /* forwarding table entries */ -#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ -#define NM_BRIDGES 8 /* number of bridges */ - - -int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */ -SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , ""); - -#ifdef linux +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ + NETMAP_ADMODE_NATIVE, /* either native or none */ + NETMAP_ADMODE_GENERIC, /* force generic */ + NETMAP_ADMODE_LAST }; +static int netmap_admode = NETMAP_ADMODE_BEST; -#define refcount_acquire(_a) atomic_add(1, (atomic_t *)_a) -#define refcount_release(_a) atomic_dec_and_test((atomic_t *)_a) +int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ +int netmap_generic_ringsize = 1024; /* Generic ringsize. */ +int netmap_generic_rings = 1; /* number of queues in generic. */ -#else /* !linux */ - -#ifdef __FreeBSD__ -#include <sys/endian.h> -#include <sys/refcount.h> -#endif /* __FreeBSD__ */ +SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); -#define prefetch(x) __builtin_prefetch(x) +NMG_LOCK_T netmap_global_lock; -#endif /* !linux */ -/* - * These are used to handle reference counters for bridge ports. - */ -#define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) -#define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) - -static void bdg_netmap_attach(struct netmap_adapter *); -static int bdg_netmap_reg(struct ifnet *ifp, int onoff); -static int kern_netmap_regif(struct nmreq *nmr); - -/* per-tx-queue entry */ -struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; - uint16_t _ft_dst; /* dst port, unused */ - uint16_t ft_flags; /* flags, e.g. indirect */ - uint16_t ft_len; /* src len */ - uint16_t ft_next; /* next packet to same destination */ -}; - -/* We need to build a list of buffers going to each destination. - * Each buffer is in one entry of struct nm_bdg_fwd, we use ft_next - * to build the list, and struct nm_bdg_q below for the queue. - * The structure should compact because potentially we have a lot - * of destinations. - */ -struct nm_bdg_q { - uint16_t bq_head; - uint16_t bq_tail; -}; +static void +nm_kr_get(struct netmap_kring *kr) +{ + while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) + tsleep(kr, 0, "NM_KR_GET", 4); +} -struct nm_hash_ent { - uint64_t mac; /* the top 2 bytes are the epoch */ - uint64_t ports; -}; /* - * Interfaces for a bridge are all in bdg_ports[]. 
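 *
 * (nm_kr_get() above sleeps on the kring's nr_busy bit to enforce the
 * "one sync instance per ring" rule; its counterpart is assumed to
 * match netmap_kern.h and simply clears the bit:
 *
 *	static inline void
 *	nm_kr_put(struct netmap_kring *kr)
 *	{
 *		NM_ATOMIC_CLEAR(&kr->nr_busy);
 *	}
 *
 * netmap_disable_ring() below pairs them with a q_lock lock/unlock to
 * flush out any thread still inside a sync routine.)
 *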
- * The array has fixed size, an empty entry does not terminate - * the search. But lookups only occur on attach/detach so we - * don't mind if they are slow. - * - * The bridge is non blocking on the transmit ports. - * - * bdg_lock protects accesses to the bdg_ports array. - * This is a rw lock (or equivalent). + * mark the ring as stopped, and run through the locks + * to make sure other users get to see it. */ -struct nm_bridge { - int namelen; /* 0 means free */ +void +netmap_disable_ring(struct netmap_kring *kr) +{ + kr->nkr_stopped = 1; + nm_kr_get(kr); + mtx_lock(&kr->q_lock); + mtx_unlock(&kr->q_lock); + nm_kr_put(kr); +} - /* XXX what is the proper alignment/layout ? */ - NM_RWLOCK_T bdg_lock; /* protects bdg_ports */ - struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; - char basename[IFNAMSIZ]; - /* - * The function to decide the destination port. - * It returns either of an index of the destination port, - * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to - * forward this packet. ring_nr is the source ring index, and the - * function may overwrite this value to forward this packet to a - * different ring index. - * This function must be set by netmap_bdgctl(). - */ - bdg_lookup_fn_t nm_bdg_lookup; +static void +netmap_set_all_rings(struct ifnet *ifp, int stopped) +{ + struct netmap_adapter *na; + int i; + u_int ntx, nrx; - /* the forwarding table, MAC+ports */ - struct nm_hash_ent ht[NM_BDG_HASH]; -}; + if (!(ifp->if_capenable & IFCAP_NETMAP)) + return; -struct nm_bridge nm_bridges[NM_BRIDGES]; -NM_LOCK_T netmap_bridge_mutex; + na = NA(ifp); -/* other OS will have these macros defined in their own glue code. */ + ntx = netmap_real_tx_rings(na); + nrx = netmap_real_rx_rings(na); -#ifdef __FreeBSD__ -#define BDG_LOCK() mtx_lock(&netmap_bridge_mutex) -#define BDG_UNLOCK() mtx_unlock(&netmap_bridge_mutex) -#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) -#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) -#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) -#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) - -/* set/get variables. OS-specific macros may wrap these - * assignments into read/write lock or similar - */ -#define BDG_SET_VAR(lval, p) (lval = p) -#define BDG_GET_VAR(lval) (lval) -#define BDG_FREE(p) free(p, M_DEVBUF) -#endif /* __FreeBSD__ */ + for (i = 0; i < ntx; i++) { + if (stopped) + netmap_disable_ring(na->tx_rings + i); + else + na->tx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY); + } -static __inline int -nma_is_vp(struct netmap_adapter *na) -{ - return na->nm_register == bdg_netmap_reg; -} -static __inline int -nma_is_host(struct netmap_adapter *na) -{ - return na->nm_register == NULL; -} -static __inline int -nma_is_hw(struct netmap_adapter *na) -{ - /* In case of sw adapter, nm_register is NULL */ - return !nma_is_vp(na) && !nma_is_host(na); + for (i = 0; i < nrx; i++) { + if (stopped) + netmap_disable_ring(na->rx_rings + i); + else + na->rx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY); + } } -/* - * Regarding holding a NIC, if the NIC is owned by the kernel - * (i.e., bridge), neither another bridge nor user can use it; - * if the NIC is owned by a user, only users can share it. - * Evaluation must be done under NMA_LOCK(). 
- */ -#define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg) -#define NETMAP_OWNED_BY_ANY(ifp) \ - (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0)) -/* - * NA(ifp)->bdg_port port index - */ - -// XXX only for multiples of 64 bytes, non overlapped. -static inline void -pkt_copy(void *_src, void *_dst, int l) +void +netmap_disable_all_rings(struct ifnet *ifp) { - uint64_t *src = _src; - uint64_t *dst = _dst; - if (unlikely(l >= 1024)) { - bcopy(src, dst, l); - return; - } - for (; likely(l > 0); l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } + netmap_set_all_rings(ifp, 1 /* stopped */); } -/* - * locate a bridge among the existing ones. - * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. - * We assume that this is called with a name of at least NM_NAME chars. - */ -static struct nm_bridge * -nm_find_bridge(const char *name, int create) +void +netmap_enable_all_rings(struct ifnet *ifp) { - int i, l, namelen; - struct nm_bridge *b = NULL; - - namelen = strlen(NM_NAME); /* base length */ - l = strlen(name); /* actual length */ - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; - } - } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); - - BDG_LOCK(); - /* lookup the name, remember empty slot if there is one */ - for (i = 0; i < NM_BRIDGES; i++) { - struct nm_bridge *x = nm_bridges + i; - - if (x->namelen == 0) { - if (create && b == NULL) - b = x; /* record empty slot */ - } else if (x->namelen != namelen) { - continue; - } else if (strncmp(name, x->basename, namelen) == 0) { - ND("found '%.*s' at %d", namelen, name, i); - b = x; - break; - } - } - if (i == NM_BRIDGES && b) { /* name not found, can create entry */ - strncpy(b->basename, name, namelen); - b->namelen = namelen; - /* set the default function */ - b->nm_bdg_lookup = netmap_bdg_learning; - /* reset the MAC address table */ - bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); - } - BDG_UNLOCK(); - return b; + netmap_set_all_rings(ifp, 0 /* enabled */); } /* - * Free the forwarding tables for rings attached to switch ports. + * generic bound_checking function */ -static void -nm_free_bdgfwd(struct netmap_adapter *na) +u_int +nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) { - int nrings, i; - struct netmap_kring *kring; - - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - if (kring[i].nkr_ft) { - free(kring[i].nkr_ft, M_DEVBUF); - kring[i].nkr_ft = NULL; /* protect from freeing twice */ - } - } - if (nma_is_hw(na)) - nm_free_bdgfwd(SWNA(na->ifp)); + u_int oldv = *v; + const char *op = NULL; + + if (dflt < lo) + dflt = lo; + if (dflt > hi) + dflt = hi; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; } /* - * Allocate the forwarding tables for the rings attached to the bridge ports. + * packet-dump function, user-supplied or static buffer. 
+ * The destination buffer must be at least 30+4*len */ -static int -nm_alloc_bdgfwd(struct netmap_adapter *na) +const char * +nm_dump_buf(char *p, int len, int lim, char *dst) { - int nrings, l, i, num_dstq; - struct netmap_kring *kring; - - /* all port:rings + broadcast */ - num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; - l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH; - l += sizeof(struct nm_bdg_q) * num_dstq; - l += sizeof(uint16_t) * NM_BDG_BATCH; - - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - struct nm_bdg_fwd *ft; - struct nm_bdg_q *dstq; - int j; - - ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ft) { - nm_free_bdgfwd(na); - return ENOMEM; + static char _dst[8192]; + int i, j, i0; + static char hex[] ="0123456789abcdef"; + char *o; /* output position */ + +#define P_HI(x) hex[((x) & 0xf0)>>4] +#define P_LO(x) hex[((x) & 0xf)] +#define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') + if (!dst) + dst = _dst; + if (lim <= 0 || lim > len) + lim = len; + o = dst; + sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); + o += strlen(o); + /* hexdump routine */ + for (i = 0; i < lim; ) { + sprintf(o, "%5d: ", i); + o += strlen(o); + memset(o, ' ', 48); + i0 = i; + for (j=0; j < 16 && i < lim; i++, j++) { + o[j*3] = P_HI(p[i]); + o[j*3+1] = P_LO(p[i]); } - dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH); - for (j = 0; j < num_dstq; j++) - dstq[j].bq_head = dstq[j].bq_tail = NM_BDG_BATCH; - kring[i].nkr_ft = ft; - } - if (nma_is_hw(na)) - nm_alloc_bdgfwd(SWNA(na->ifp)); - return 0; + i = i0; + for (j=0; j < 16 && i < lim; i++, j++) + o[j + 48] = P_C(p[i]); + o[j+48] = '\n'; + o += j+49; + } + *o = '\0'; +#undef P_HI +#undef P_LO +#undef P_C + return dst; } -#endif /* NM_BRIDGE */ - /* * Fetch configuration from the device, to cope with dynamic * reconfigurations after loading the module. */ -static int +int netmap_update_config(struct netmap_adapter *na) { struct ifnet *ifp = na->ifp; @@ -446,7 +401,7 @@ netmap_update_config(struct netmap_adapter *na) txr = txd = rxr = rxd = 0; if (na->nm_config) { - na->nm_config(ifp, &txr, &txd, &rxr, &rxd); + na->nm_config(na, &txr, &txd, &rxr, &rxd); } else { /* take whatever we had at init time */ txr = na->num_tx_rings; @@ -458,15 +413,15 @@ netmap_update_config(struct netmap_adapter *na) if (na->num_tx_rings == txr && na->num_tx_desc == txd && na->num_rx_rings == rxr && na->num_rx_desc == rxd) return 0; /* nothing changed */ - if (netmap_verbose || na->refcount > 0) { + if (netmap_verbose || na->active_fds > 0) { D("stored config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, + NM_IFPNAME(ifp), na->num_tx_rings, na->num_tx_desc, na->num_rx_rings, na->num_rx_desc); D("new config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, txr, txd, rxr, rxd); + NM_IFPNAME(ifp), txr, txd, rxr, rxd); } - if (na->refcount == 0) { + if (na->active_fds == 0) { D("configuration changed (but fine)"); na->num_tx_rings = txr; na->num_tx_desc = txd; @@ -478,369 +433,394 @@ netmap_update_config(struct netmap_adapter *na) return 1; } -/*------------- memory allocator -----------------*/ -#include "netmap_mem2.c" -/*------------ end of memory allocator ----------*/ - - -/* Structure associated to each thread which registered an interface. - * - * The first 4 fields of this structure are written by NIOCREGIF and - * read by poll() and NIOC?XSYNC. 
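 *
 * (That protocol survives in the new code: the fast paths still read
 * np_nifp once, lock free, and bail out if it is NULL. Sketched, as
 * in the ioctl/poll entry points:
 *
 *	nifp = priv->np_nifp;
 *	if (nifp == NULL)	// not yet registered
 *		return ENXIO;
 *	rmb();			// ensure following reads are ordered
 * )
 *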
- * There is low contention among writers (actually, a correct user program - * should have no contention among writers) and among writers and readers, - * so we use a single global lock to protect the structure initialization. - * Since initialization involves the allocation of memory, we reuse the memory - * allocator lock. - * Read access to the structure is lock free. Readers must check that - * np_nifp is not NULL before using the other fields. - * If np_nifp is NULL initialization has not been performed, so they should - * return an error to userlevel. - * - * The ref_done field is used to regulate access to the refcount in the - * memory allocator. The refcount must be incremented at most once for - * each open("/dev/netmap"). The increment is performed by the first - * function that calls netmap_get_memory() (currently called by - * mmap(), NIOCGINFO and NIOCREGIF). - * If the refcount is incremented, it is then decremented when the - * private structure is destroyed. - */ -struct netmap_priv_d { - struct netmap_if * volatile np_nifp; /* netmap interface descriptor. */ - - struct ifnet *np_ifp; /* device for which we hold a reference */ - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; +static int +netmap_txsync_compat(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + return na->nm_txsync(na, kring->ring_id, flags); +} - unsigned long ref_done; /* use with NMA_LOCK held */ -}; +static int +netmap_rxsync_compat(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + return na->nm_rxsync(na, kring->ring_id, flags); +} +static int +netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) +{ + (void)flags; + netmap_txsync_to_host(kring->na); + return 0; +} static int -netmap_get_memory(struct netmap_priv_d* p) +netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) { - int error = 0; - NMA_LOCK(); - if (!p->ref_done) { - error = netmap_memory_finalize(); - if (!error) - p->ref_done = 1; - } - NMA_UNLOCK(); - return error; + (void)flags; + netmap_rxsync_from_host(kring->na, NULL, NULL); + return 0; } -/* - * File descriptor's private data destructor. + + +/* create the krings array and initialize the fields common to all adapters. + * The array layout is this: * - * Call nm_register(ifp,0) to stop netmap mode on the interface and - * revert to normal operation. We expect that np_ifp has not gone. + * +----------+ + * na->tx_rings ----->| | \ + * | | } na->num_tx_ring + * | | / + * +----------+ + * | | host tx kring + * na->rx_rings ----> +----------+ + * | | \ + * | | } na->num_rx_rings + * | | / + * +----------+ + * | | host rx kring + * +----------+ + * na->tailroom ----->| | \ + * | | } tailroom bytes + * | | / + * +----------+ + * + * Note: for compatibility, host krings are created even when not needed. + * The tailroom space is currently used by vale ports for allocating leases. 
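 *
 * With this layout the host rings are simply the entries one past the
 * hardware rings, e.g. (illustrative accessors, not spelled out in
 * this diff):
 *
 *	struct netmap_kring *host_tx = &na->tx_rings[na->num_tx_rings];
 *	struct netmap_kring *host_rx = &na->rx_rings[na->num_rx_rings];
 *
 * and na->rx_rings itself is just na->tx_rings + ntx, as computed in
 * netmap_krings_create() below.
 *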
*/ -/* call with NMA_LOCK held */ -static void -netmap_dtor_locked(void *data) +int +netmap_krings_create(struct netmap_adapter *na, u_int tailroom) { - struct netmap_priv_d *priv = data; - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); - struct netmap_if *nifp = priv->np_nifp; + u_int i, len, ndesc; + struct netmap_kring *kring; + u_int ntx, nrx; - na->refcount--; - if (na->refcount <= 0) { /* last instance */ - u_int i, j, lim; + /* account for the (possibly fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; - if (netmap_verbose) - D("deleting last instance for %s", ifp->if_xname); + len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; + + na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->tx_rings == NULL) { + D("Cannot allocate krings"); + return ENOMEM; + } + na->rx_rings = na->tx_rings + ntx; + + /* + * All fields in krings are 0 except the one initialized below. + * but better be explicit on important kring fields. + */ + ndesc = na->num_tx_desc; + for (i = 0; i < ntx; i++) { /* Transmit rings */ + kring = &na->tx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->ring_id = i; + kring->nkr_num_slots = ndesc; + if (i < na->num_tx_rings) { + kring->nm_sync = netmap_txsync_compat; // XXX + } else if (i == na->num_tx_rings) { + kring->nm_sync = netmap_txsync_to_host_compat; + } /* - * (TO CHECK) This function is only called - * when the last reference to this file descriptor goes - * away. This means we cannot have any pending poll() - * or interrupt routine operating on the structure. + * IMPORTANT: Always keep one slot empty. */ - na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ - /* Wake up any sleeping threads. netmap_poll will - * then return POLLERR - */ - for (i = 0; i < na->num_tx_rings + 1; i++) - selwakeuppri(&na->tx_rings[i].si, PI_NET); - for (i = 0; i < na->num_rx_rings + 1; i++) - selwakeuppri(&na->rx_rings[i].si, PI_NET); - selwakeuppri(&na->tx_si, PI_NET); - selwakeuppri(&na->rx_si, PI_NET); -#ifdef NM_BRIDGE - nm_free_bdgfwd(na); -#endif /* NM_BRIDGE */ - /* release all buffers */ - for (i = 0; i < na->num_tx_rings + 1; i++) { - struct netmap_ring *ring = na->tx_rings[i].ring; - lim = na->tx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(nifp, ring->slot[j].buf_idx); - /* knlist_destroy(&na->tx_rings[i].si.si_note); */ - mtx_destroy(&na->tx_rings[i].q_lock); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - struct netmap_ring *ring = na->rx_rings[i].ring; - lim = na->rx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(nifp, ring->slot[j].buf_idx); - /* knlist_destroy(&na->rx_rings[i].si.si_note); */ - mtx_destroy(&na->rx_rings[i].q_lock); + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = ndesc - 1; + snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); + ND("ktx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); + mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); + } + + ndesc = na->num_rx_desc; + for (i = 0; i < nrx; i++) { /* Receive rings */ + kring = &na->rx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->ring_id = i; + kring->nkr_num_slots = ndesc; + if (i < na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_compat; // XXX + } else if (i == na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_from_host_compat; } - /* XXX kqueue(9) needed; these will mirror 
knlist_init. */ - /* knlist_destroy(&na->tx_si.si_note); */ - /* knlist_destroy(&na->rx_si.si_note); */ - netmap_free_rings(na); - if (nma_is_hw(na)) - SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = 0; + snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); + ND("krx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); + mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); } - netmap_if_free(nifp); + init_waitqueue_head(&na->tx_si); + init_waitqueue_head(&na->rx_si); + + na->tailroom = na->rx_rings + nrx; + + return 0; } -/* we assume netmap adapter exists */ -static void -nm_if_rele(struct ifnet *ifp) +/* undo the actions performed by netmap_krings_create */ +void +netmap_krings_delete(struct netmap_adapter *na) { -#ifndef NM_BRIDGE - if_rele(ifp); -#else /* NM_BRIDGE */ - int i, full = 0, is_hw; - struct nm_bridge *b; - struct netmap_adapter *na; + struct netmap_kring *kring = na->tx_rings; - /* I can be called not only for get_ifp()-ed references where netmap's - * capability is guaranteed, but also for non-netmap-capable NICs. - */ - if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { - if_rele(ifp); - return; + /* we rely on the krings layout described above */ + for ( ; kring != na->tailroom; kring++) { + mtx_destroy(&kring->q_lock); } - if (!DROP_BDG_REF(ifp)) - return; - - na = NA(ifp); - b = na->na_bdg; - is_hw = nma_is_hw(na); - - BDG_WLOCK(b); - ND("want to disconnect %s from the bridge", ifp->if_xname); - full = 0; - /* remove the entry from the bridge, also check - * if there are any leftover interfaces - * XXX we should optimize this code, e.g. going directly - * to na->bdg_port, and having a counter of ports that - * are connected. But it is not in a critical path. - * In NIC's case, index of sw na is always higher than hw na - */ - for (i = 0; i < NM_BDG_MAXPORTS; i++) { - struct netmap_adapter *tmp = BDG_GET_VAR(b->bdg_ports[i]); - - if (tmp == na) { - /* disconnect from bridge */ - BDG_SET_VAR(b->bdg_ports[i], NULL); - na->na_bdg = NULL; - if (is_hw && SWNA(ifp)->na_bdg) { - /* disconnect sw adapter too */ - int j = SWNA(ifp)->bdg_port; - BDG_SET_VAR(b->bdg_ports[j], NULL); - SWNA(ifp)->na_bdg = NULL; - } - } else if (tmp != NULL) { - full = 1; - } - } - BDG_WUNLOCK(b); - if (full == 0) { - ND("marking bridge %d as free", b - nm_bridges); - b->namelen = 0; - b->nm_bdg_lookup = NULL; - } - if (na->na_bdg) { /* still attached to the bridge */ - D("ouch, cannot find ifp to remove"); - } else if (is_hw) { - if_rele(ifp); - } else { - bzero(na, sizeof(*na)); - free(na, M_DEVBUF); - bzero(ifp, sizeof(*ifp)); - free(ifp, M_DEVBUF); - } -#endif /* NM_BRIDGE */ + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = na->tailroom = NULL; } + +/* + * Destructor for NIC ports. They also have an mbuf queue + * on the rings connected to the host so we need to purge + * them first. 
+ */ static void -netmap_dtor(void *data) +netmap_hw_krings_delete(struct netmap_adapter *na) { - struct netmap_priv_d *priv = data; - struct ifnet *ifp = priv->np_ifp; + struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; - NMA_LOCK(); - if (ifp) { - struct netmap_adapter *na = NA(ifp); + ND("destroy sw mbq with len %d", mbq_len(q)); + mbq_purge(q); + mbq_safe_destroy(q); + netmap_krings_delete(na); +} - if (na->na_bdg) - BDG_WLOCK(na->na_bdg); - na->nm_lock(ifp, NETMAP_REG_LOCK, 0); - netmap_dtor_locked(data); - na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if (na->na_bdg) - BDG_WUNLOCK(na->na_bdg); - nm_if_rele(ifp); /* might also destroy *na */ - } - if (priv->ref_done) { - netmap_memory_deref(); +static struct netmap_if* +netmap_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + + if (netmap_update_config(na)) { + /* configuration mismatch, report and fail */ + return NULL; } - NMA_UNLOCK(); - bzero(priv, sizeof(*priv)); /* XXX for safety */ - free(priv, M_DEVBUF); -} + if (na->active_fds) + goto final; -#ifdef __FreeBSD__ -#include <vm/vm.h> -#include <vm/vm_param.h> -#include <vm/vm_object.h> -#include <vm/vm_page.h> -#include <vm/vm_pager.h> -#include <vm/uma.h> + if (na->nm_krings_create(na)) + goto cleanup; -/* - * In order to track whether pages are still mapped, we hook into - * the standard cdev_pager and intercept the constructor and - * destructor. - * XXX but then ? Do we really use the information ? - * Need to investigate. - */ -static struct cdev_pager_ops saved_cdev_pager_ops; + if (netmap_mem_rings_create(na)) + goto cleanup; + +final: + + nifp = netmap_mem_if_new(ifname, na); + if (nifp == NULL) + goto cleanup; + + return (nifp); +cleanup: + + if (na->active_fds == 0) { + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); + } + + return NULL; +} + +/* grab a reference to the memory allocator, if we don't have one already. The + * reference is taken from the netmap_adapter registered with the priv. + * + */ static int -netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, - vm_ooffset_t foff, struct ucred *cred, u_short *color) +netmap_get_memory_locked(struct netmap_priv_d* p) { - if (netmap_verbose) - D("first mmap for %p", handle); - return saved_cdev_pager_ops.cdev_pg_ctor(handle, - size, prot, foff, cred, color); + struct netmap_mem_d *nmd; + int error = 0; + + if (p->np_na == NULL) { + if (!netmap_mmap_unreg) + return ENODEV; + /* for compatibility with older versions of the API + * we use the global allocator when no interface has been + * registered + */ + nmd = &nm_mem; + } else { + nmd = p->np_na->nm_mem; + } + if (p->np_mref == NULL) { + error = netmap_mem_finalize(nmd); + if (!error) + p->np_mref = nmd; + } else if (p->np_mref != nmd) { + /* a virtual port has been registered, but previous + * syscalls already used the global allocator. 
+ * We cannot continue + */ + error = ENODEV; + } + return error; } -static void -netmap_dev_pager_dtor(void *handle) +int +netmap_get_memory(struct netmap_priv_d* p) { - saved_cdev_pager_ops.cdev_pg_dtor(handle); - ND("ready to release memory for %p", handle); + int error; + NMG_LOCK(); + error = netmap_get_memory_locked(p); + NMG_UNLOCK(); + return error; } -static struct cdev_pager_ops netmap_cdev_pager_ops = { - .cdev_pg_ctor = netmap_dev_pager_ctor, - .cdev_pg_dtor = netmap_dev_pager_dtor, - .cdev_pg_fault = NULL, -}; +static int +netmap_have_memory_locked(struct netmap_priv_d* p) +{ + return p->np_mref != NULL; +} -// XXX check whether we need netmap_mmap_single _and_ netmap_mmap -static int -netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, - vm_size_t objsize, vm_object_t *objp, int prot) +static void +netmap_drop_memory_locked(struct netmap_priv_d* p) { - vm_object_t obj; - - ND("cdev %p foff %jd size %jd objp %p prot %d", cdev, - (intmax_t )*foff, (intmax_t )objsize, objp, prot); - obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, - curthread->td_ucred); - ND("returns obj %p", obj); - if (obj == NULL) - return EINVAL; - if (saved_cdev_pager_ops.cdev_pg_fault == NULL) { - ND("initialize cdev_pager_ops"); - saved_cdev_pager_ops = *(obj->un_pager.devp.ops); - netmap_cdev_pager_ops.cdev_pg_fault = - saved_cdev_pager_ops.cdev_pg_fault; - }; - obj->un_pager.devp.ops = &netmap_cdev_pager_ops; - *objp = obj; - return 0; + if (p->np_mref) { + netmap_mem_deref(p->np_mref); + p->np_mref = NULL; + } } -#endif /* __FreeBSD__ */ /* - * mmap(2) support for the "netmap" device. - * - * Expose all the memory previously allocated by our custom memory - * allocator: this way the user has only to issue a single mmap(2), and - * can work on all the data structures flawlessly. + * File descriptor's private data destructor. * - * Return 0 on success, -1 otherwise. + * Call nm_register(ifp,0) to stop netmap mode on the interface and + * revert to normal operation. We expect that np_na->ifp has not gone. + * The second argument is the nifp to work on. In some cases it is + * not attached yet to the netmap_priv_d so we need to pass it as + * a separate argument. */ - -#ifdef __FreeBSD__ -static int -netmap_mmap(__unused struct cdev *dev, -#if __FreeBSD_version < 900000 - vm_offset_t offset, vm_paddr_t *paddr, int nprot -#else - vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, - __unused vm_memattr_t *memattr -#endif - ) +/* call with NMG_LOCK held */ +static void +netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) { - int error = 0; - struct netmap_priv_d *priv; + struct netmap_adapter *na = priv->np_na; + struct ifnet *ifp = na->ifp; - if (nprot & PROT_EXEC) - return (-1); // XXX -1 or EINVAL ? + NMG_LOCK_ASSERT(); + na->active_fds--; + if (na->active_fds <= 0) { /* last instance */ - error = devfs_get_cdevpriv((void **)&priv); - if (error == EBADF) { /* called on fault, memory is initialized */ - ND(5, "handling fault at ofs 0x%x", offset); - error = 0; - } else if (error == 0) /* make sure memory is set */ - error = netmap_get_memory(priv); - if (error) - return (error); - - ND("request for offset 0x%x", (uint32_t)offset); - *paddr = netmap_ofstophys(offset); + if (netmap_verbose) + D("deleting last instance for %s", NM_IFPNAME(ifp)); + /* + * (TO CHECK) This function is only called + * when the last reference to this file descriptor goes + * away. This means we cannot have any pending poll() + * or interrupt routine operating on the structure. 
+ * XXX The file may be closed in a thread while + * another thread is using it. + * Linux keeps the file opened until the last reference + * by any outstanding ioctl/poll or mmap is gone. + * FreeBSD does not track mmap()s (but we do) and + * wakes up any sleeping poll(). Need to check what + * happens if the close() occurs while a concurrent + * syscall is running. + */ + if (ifp) + na->nm_register(na, 0); /* off, clear flags */ + /* Wake up any sleeping threads. netmap_poll will + * then return POLLERR + * XXX The wake up now must happen during *_down(), when + * we order all activities to stop. -gl + */ + /* XXX kqueue(9) needed; these will mirror knlist_init. */ + /* knlist_destroy(&na->tx_si.si_note); */ + /* knlist_destroy(&na->rx_si.si_note); */ - return (*paddr ? 0 : ENOMEM); + /* delete rings and buffers */ + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); + } + /* delete the nifp */ + netmap_mem_if_delete(na, nifp); } +static __inline int +nm_tx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_txqlast - priv->np_txqfirst > 1)); +} -static int -netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +static __inline int +nm_rx_si_user(struct netmap_priv_d *priv) { - if (netmap_verbose) - D("dev %p fflag 0x%x devtype %d td %p", - dev, fflag, devtype, td); - return 0; + return (priv->np_na != NULL && + (priv->np_rxqlast - priv->np_rxqfirst > 1)); } -static int -netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +/* + * returns 1 if this is the last instance and we can free priv + */ +int +netmap_dtor_locked(struct netmap_priv_d *priv) { - struct netmap_priv_d *priv; - int error; + struct netmap_adapter *na = priv->np_na; - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; +#ifdef __FreeBSD__ + /* + * np_refcount is the number of active mmaps on + * this file descriptor + */ + if (--priv->np_refcount > 0) { + return 0; + } +#endif /* __FreeBSD__ */ + if (!na) { + return 1; //XXX is it correct? + } + netmap_do_unregif(priv, priv->np_nifp); + priv->np_nifp = NULL; + netmap_drop_memory_locked(priv); + if (priv->np_na) { + if (nm_tx_si_user(priv)) + na->tx_si_users--; + if (nm_rx_si_user(priv)) + na->rx_si_users--; + netmap_adapter_put(na); + priv->np_na = NULL; + } + return 1; +} - error = devfs_set_cdevpriv(priv, netmap_dtor); - if (error) - return error; - return 0; +void +netmap_dtor(void *data) +{ + struct netmap_priv_d *priv = data; + int last_instance; + + NMG_LOCK(); + last_instance = netmap_dtor_locked(priv); + NMG_UNLOCK(); + if (last_instance) { + bzero(priv, sizeof(*priv)); /* for safety */ + free(priv, M_DEVBUF); + } } -#endif /* __FreeBSD__ */ + + /* @@ -864,386 +844,551 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) /* * pass a chain of buffers to the host stack as coming from 'dst' + * We do not need to lock because the queue is private. */ static void -netmap_send_up(struct ifnet *dst, struct mbuf *head) +netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; /* send packets up, outside the lock */ - while ((m = head) != NULL) { - head = head->m_nextpkt; - m->m_nextpkt = NULL; + while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); NM_SEND_UP(dst, m); } + mbq_destroy(q); } -struct mbq { - struct mbuf *head; - struct mbuf *tail; - int count; -}; - /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 
- * Run from hwcur to cur - reserved + * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) + * and pass them up. Drop remaining packets in the unlikely event + * of an mbuf shortage. */ static void netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) { - /* Take packets from hwcur to cur-reserved and pass them up. - * In case of no buffers we give up. At the end of the loop, - * the queue is drained in all cases. - * XXX handle reserved - */ - int k = kring->ring->cur - kring->ring->reserved; - u_int n, lim = kring->nkr_num_slots - 1; - struct mbuf *m, *tail = q->tail; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->ring->head; + u_int n; + struct netmap_adapter *na = kring->na; - if (k < 0) - k = k + kring->nkr_num_slots; - for (n = kring->nr_hwcur; n != k;) { + for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { + struct mbuf *m; struct netmap_slot *slot = &kring->ring->slot[n]; - n = (n == lim) ? 0 : n + 1; if ((slot->flags & NS_FORWARD) == 0 && !force) continue; - if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) { - D("bad pkt at %d len %d", n, slot->len); + if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { + RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? - m = m_devget(NMB(slot), slot->len, 0, kring->na->ifp, NULL); + /* XXX TODO: adapt to the case of a multisegment packet */ + m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) break; - if (tail) - tail->m_nextpkt = m; - else - q->head = m; - tail = m; - q->count++; - m->m_nextpkt = NULL; + mbq_enqueue(q, m); } - q->tail = tail; } /* - * called under main lock to send packets from the host to the NIC - * The host ring has packets from nr_hwcur to (cur - reserved) - * to be sent down. We scan the tx rings, which have just been - * flushed so nr_hwcur == cur. Pushing packets down means - * increment cur and decrement avail. - * XXX to be verified + * Send to the NIC rings packets marked NS_FORWARD between + * kring->nr_hwcur and kring->rhead + * Called under kring->rx_queue.lock on the sw rx ring, */ -static void +static u_int netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; - struct netmap_kring *k1 = &na->tx_rings[0]; - int i, howmany, src_lim, dst_lim; + struct netmap_slot *rxslot = kring->ring->slot; + u_int i, rxcur = kring->nr_hwcur; + u_int const head = kring->rhead; + u_int const src_lim = kring->nkr_num_slots - 1; + u_int sent = 0; + + /* scan rings to find space, then fill as much as possible */ + for (i = 0; i < na->num_tx_rings; i++) { + struct netmap_kring *kdst = &na->tx_rings[i]; + struct netmap_ring *rdst = kdst->ring; + u_int const dst_lim = kdst->nkr_num_slots - 1; + + /* XXX do we trust ring or kring->rcur,rtail ? 
*/ + for (; rxcur != head && !nm_ring_empty(rdst); + rxcur = nm_next(rxcur, src_lim) ) { + struct netmap_slot *src, *dst, tmp; + u_int dst_cur = rdst->cur; - howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ + src = &rxslot[rxcur]; + if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) + continue; + + sent++; + + dst = &rdst->slot[dst_cur]; - src_lim = kring->nkr_num_slots; - for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { - ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); - dst_lim = k1->nkr_num_slots; - while (howmany > 0 && k1->ring->avail > 0) { - struct netmap_slot *src, *dst, tmp; - src = &kring->ring->slot[kring->nr_hwcur]; - dst = &k1->ring->slot[k1->ring->cur]; tmp = *src; + src->buf_idx = dst->buf_idx; src->flags = NS_BUF_CHANGED; dst->buf_idx = tmp.buf_idx; dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - ND("out len %d buf %d from %d to %d", - dst->len, dst->buf_idx, - kring->nr_hwcur, k1->ring->cur); - - if (++kring->nr_hwcur >= src_lim) - kring->nr_hwcur = 0; - howmany--; - kring->nr_hwavail--; - if (++k1->ring->cur >= dst_lim) - k1->ring->cur = 0; - k1->ring->avail--; + + rdst->cur = nm_next(dst_cur, dst_lim); } - kring->ring->cur = kring->nr_hwcur; // XXX - k1++; + /* if (sent) XXX txsync ? */ } + return sent; } /* - * netmap_sync_to_host() passes packets up. We are called from a + * netmap_txsync_to_host() passes packets up. We are called from a * system call in user process context, and the only contention * can be among multiple user threads erroneously calling * this routine concurrently. */ -static void -netmap_sync_to_host(struct netmap_adapter *na) +void +netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; - u_int k, lim = kring->nkr_num_slots - 1; - struct mbq q = { NULL, NULL }; - - k = ring->cur; - if (k > lim) { - netmap_ring_reinit(kring); - return; - } - // na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + struct mbq q; - /* Take packets from hwcur to cur and pass them up. + /* Take packets from hwcur to head and pass them up. + * force head = cur since netmap_grab_packets() stops at head * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. */ - netmap_grab_packets(kring, &q, 1); - kring->nr_hwcur = k; - kring->nr_hwavail = ring->avail = lim; - // na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); - - netmap_send_up(na->ifp, q.head); -} - - -/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ -static int -netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock) -{ - (void)ring_nr; - (void)do_lock; - netmap_sync_to_host(NA(ifp)); - return 0; + mbq_init(&q); + ring->cur = head; + netmap_grab_packets(kring, &q, 1 /* force */); + ND("have %d pkts in queue", mbq_len(&q)); + kring->nr_hwcur = head; + kring->nr_hwtail = head + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + nm_txsync_finalize(kring); + + netmap_send_up(na->ifp, &q); } /* * rxsync backend for packets coming from the host stack. - * They have been put in the queue by netmap_start() so we - * need to protect access to the kring using a lock. + * They have been put in kring->rx_queue by netmap_transmit(). + * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler * (we know because td != NULL). 
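 *
 * Illustrative sketch (not part of this patch; lim stands for
 * ring->num_slots - 1): a client that wants selected host-ring packets
 * forwarded to the NIC enables NR_FORWARD once on the ring and marks
 * the slots before releasing them:
 *
 *	ring->flags |= NR_FORWARD;	// once, after NIOCREGIF
 *	...
 *	slot->flags |= NS_FORWARD;	// per packet to forward
 *	ring->head = ring->cur = nm_next(ring->cur, lim);
 *
 * netmap_sw_to_nic() above then picks up the marked slots on the next
 * rxsync on the host ring.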
* * NOTE: on linux, selrecord() is defined as a macro and uses pwait * as an additional hidden argument. + * returns the number of packets delivered to tx queues in + * transparent mode, or a negative value if error */ -static void -netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) +int +netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + int ret = 0; + struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ - na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0); - if (k >= lim) { - netmap_ring_reinit(kring); - return; - } - /* new packets are already set in nr_hwavail */ - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... + (void)td; + + mtx_lock(&q->lock); + + /* First part: import newly received packets */ + n = mbq_len(q); + if (n) { /* grab packets from the queue */ + struct mbuf *m; + uint32_t stop_i; + + nm_i = kring->nr_hwtail; + stop_i = nm_prev(nm_i, lim); + while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { + int len = MBUF_LEN(m); + struct netmap_slot *slot = &ring->slot[nm_i]; + + m_copydata(m, 0, len, BDG_NMB(na, slot)); + ND("nm %d len %d", nm_i, len); + if (netmap_verbose) + D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); + + slot->len = len; + slot->flags = kring->nkr_slot_flags; + nm_i = nm_next(nm_i, lim); } - k = (k >= resvd) ? k - resvd : k + lim - resvd; - } - if (j != k) { - n = k >= j ? k - j : k + lim - j; - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - k = ring->avail = kring->nr_hwavail - resvd; - if (k == 0 && td) + kring->nr_hwtail = nm_i; + } + + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* something was released */ + if (netmap_fwd || kring->ring->flags & NR_FORWARD) + ret = netmap_sw_to_nic(na); + kring->nr_hwcur = head; + } + + nm_rxsync_finalize(kring); + + /* access copies of cur,tail in the kring */ + if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); - if (k && (netmap_verbose & NM_VERB_HOST)) - D("%d pkts from stack", k); - na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0); + + mtx_unlock(&q->lock); + return ret; } -/* - * get a refcounted reference to an interface. - * Return ENXIO if the interface does not exist, EINVAL if netmap - * is not supported by the interface. - * If successful, hold a reference. +/* Get a netmap adapter for the port. + * + * If it is possible to satisfy the request, return 0 + * with *na containing the netmap adapter found. + * Otherwise return an error code, with *na containing NULL. * - * During the NIC is attached to a bridge, reference is managed - * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as - * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC - * is detached from the bridge, then ifp's refcount is dropped (this - * is equivalent to that ifp is destroyed in case of virtual ports. + * When the port is attached to a bridge, we always return + * EBUSY. 
+ * Otherwise, if the port is already bound to a file descriptor, + * then we unconditionally return the existing adapter into *na. + * In all the other cases, we return (into *na) either native, + * generic or NULL, according to the following table: + * + * native_support + * active_fds dev.netmap.admode YES NO + * ------------------------------------------------------- + * >0 * NA(ifp) NA(ifp) + * + * 0 NETMAP_ADMODE_BEST NATIVE GENERIC + * 0 NETMAP_ADMODE_NATIVE NATIVE NULL + * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC * - * This function uses if_rele() when we want to prevent the NIC from - * being detached from the bridge in error handling. But once refcount - * is acquired by this function, it must be released using nm_if_rele(). */ -static int -get_ifp(struct nmreq *nmr, struct ifnet **ifp) + +int +netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) { - const char *name = nmr->nr_name; - int namelen = strlen(name); -#ifdef NM_BRIDGE - struct ifnet *iter = NULL; - int no_prefix = 0; - - do { - struct nm_bridge *b; - struct netmap_adapter *na; - int i, cand = -1, cand2 = -1; - - if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { - no_prefix = 1; - break; - } - b = nm_find_bridge(name, 1 /* create a new one if no exist */ ); - if (b == NULL) { - D("no bridges available for '%s'", name); - return (ENXIO); - } - /* Now we are sure that name starts with the bridge's name */ - BDG_WLOCK(b); - /* lookup in the local list of ports */ - for (i = 0; i < NM_BDG_MAXPORTS; i++) { - na = BDG_GET_VAR(b->bdg_ports[i]); - if (na == NULL) { - if (cand == -1) - cand = i; /* potential insert point */ - else if (cand2 == -1) - cand2 = i; /* for host stack */ - continue; - } - iter = na->ifp; - /* XXX make sure the name only contains one : */ - if (!strcmp(iter->if_xname, name) /* virtual port */ || - (namelen > b->namelen && !strcmp(iter->if_xname, - name + b->namelen + 1)) /* NIC */) { - ADD_BDG_REF(iter); - ND("found existing interface"); - BDG_WUNLOCK(b); - break; - } - } - if (i < NM_BDG_MAXPORTS) /* already unlocked */ - break; - if (cand == -1) { - D("bridge full, cannot create new port"); -no_port: - BDG_WUNLOCK(b); - *ifp = NULL; - return EINVAL; - } - ND("create new bridge port %s", name); - /* - * create a struct ifnet for the new port. - * The forwarding table is attached to the kring(s). + /* generic support */ + int i = netmap_admode; /* Take a snapshot. */ + int error = 0; + struct netmap_adapter *prev_na; + struct netmap_generic_adapter *gna; + + *na = NULL; /* default */ + + /* reset in case of invalid value */ + if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) + i = netmap_admode = NETMAP_ADMODE_BEST; + + if (NETMAP_CAPABLE(ifp)) { + /* If an adapter already exists, but is + * attached to a vale port, we report that the + * port is busy. */ - /* - * try see if there is a matching NIC with this name - * (after the bridge's name) + if (NETMAP_OWNED_BY_KERN(NA(ifp))) + return EBUSY; + + /* If an adapter already exists, return it if + * there are active file descriptors or if + * netmap is not forced to use generic + * adapters. 
*/ - iter = ifunit_ref(name + b->namelen + 1); - if (!iter) { /* this is a virtual port */ - /* Create a temporary NA with arguments, then - * bdg_netmap_attach() will allocate the real one - * and attach it to the ifp - */ - struct netmap_adapter tmp_na; - - if (nmr->nr_cmd) /* nr_cmd must be for a NIC */ - goto no_port; - bzero(&tmp_na, sizeof(tmp_na)); - /* bound checking */ - if (nmr->nr_tx_rings < 1) - nmr->nr_tx_rings = 1; - if (nmr->nr_tx_rings > NM_BDG_MAXRINGS) - nmr->nr_tx_rings = NM_BDG_MAXRINGS; - tmp_na.num_tx_rings = nmr->nr_tx_rings; - if (nmr->nr_rx_rings < 1) - nmr->nr_rx_rings = 1; - if (nmr->nr_rx_rings > NM_BDG_MAXRINGS) - nmr->nr_rx_rings = NM_BDG_MAXRINGS; - tmp_na.num_rx_rings = nmr->nr_rx_rings; - - iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!iter) - goto no_port; - strcpy(iter->if_xname, name); - tmp_na.ifp = iter; - /* bdg_netmap_attach creates a struct netmap_adapter */ - bdg_netmap_attach(&tmp_na); - } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ - /* cannot attach the NIC that any user or another - * bridge already holds. - */ - if (NETMAP_OWNED_BY_ANY(iter) || cand2 == -1) { -ifunit_rele: - if_rele(iter); /* don't detach from bridge */ - goto no_port; - } - /* bind the host stack to the bridge */ - if (nmr->nr_arg1 == NETMAP_BDG_HOST) { - BDG_SET_VAR(b->bdg_ports[cand2], SWNA(iter)); - SWNA(iter)->bdg_port = cand2; - SWNA(iter)->na_bdg = b; - } - } else /* not a netmap-capable NIC */ - goto ifunit_rele; - na = NA(iter); - na->bdg_port = cand; - /* bind the port to the bridge (virtual ports are not active) */ - BDG_SET_VAR(b->bdg_ports[cand], na); - na->na_bdg = b; - ADD_BDG_REF(iter); - BDG_WUNLOCK(b); - ND("attaching virtual bridge %p", b); - } while (0); - *ifp = iter; - if (! *ifp) -#endif /* NM_BRIDGE */ - *ifp = ifunit_ref(name); - if (*ifp == NULL) - return (ENXIO); - /* can do this if the capability exists and if_pspare[0] - * points to the netmap descriptor. + if (NA(ifp)->active_fds > 0 || + i != NETMAP_ADMODE_GENERIC) { + *na = NA(ifp); + return 0; + } + } + + /* If there isn't native support and netmap is not allowed + * to use generic adapters, we cannot satisfy the request. */ - if (NETMAP_CAPABLE(*ifp)) { -#ifdef NM_BRIDGE - /* Users cannot use the NIC attached to a bridge directly */ - if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { - if_rele(*ifp); /* don't detach from bridge */ - return EINVAL; - } else -#endif /* NM_BRIDGE */ - return 0; /* valid pointer, we hold the refcount */ + if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + return EOPNOTSUPP; + + /* Otherwise, create a generic adapter and return it, + * saving the previously used netmap adapter, if any. + * + * Note that here 'prev_na', if not NULL, MUST be a + * native adapter, and CANNOT be a generic one. This is + * true because generic adapters are created on demand, and + * destroyed when not used anymore. Therefore, if the adapter + * currently attached to an interface 'ifp' is generic, it + * must be that + * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). + * Consequently, if NA(ifp) is generic, we will enter one of + * the branches above. This ensures that we never override + * a generic adapter with another generic adapter. + */ + prev_na = NA(ifp); + error = generic_netmap_attach(ifp); + if (error) + return error; + + *na = NA(ifp); + gna = (struct netmap_generic_adapter*)NA(ifp); + gna->prev = prev_na; /* save old na */ + if (prev_na != NULL) { + ifunit_ref(ifp->if_xname); + // XXX add a refcount ? 
+ netmap_adapter_get(prev_na); + } + ND("Created generic NA %p (prev %p)", gna, gna->prev); + + return 0; +} + + +/* + * MUST BE CALLED UNDER NMG_LOCK() + * + * Get a refcounted reference to a netmap adapter attached + * to the interface specified by nmr. + * This is always called in the execution of an ioctl(). + * + * Return ENXIO if the interface specified by the request does + * not exist, ENOTSUP if netmap is not supported by the interface, + * EBUSY if the interface is already attached to a bridge, + * EINVAL if parameters are invalid, ENOMEM if needed resources + * could not be allocated. + * If successful, hold a reference to the netmap adapter. + * + * No reference is kept on the real interface, which may then + * disappear at any time. + */ +int +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + struct ifnet *ifp = NULL; + int error = 0; + struct netmap_adapter *ret = NULL; + + *na = NULL; /* default return value */ + + /* first try to see if this is a bridge port. */ + NMG_LOCK_ASSERT(); + + error = netmap_get_pipe_na(nmr, na, create); + if (error || *na != NULL) + return error; + + error = netmap_get_bdg_na(nmr, na, create); + if (error) + return error; + + if (*na != NULL) /* valid match in netmap_get_bdg_na() */ + goto pipes; + + ifp = ifunit_ref(nmr->nr_name); + if (ifp == NULL) { + return ENXIO; } - nm_if_rele(*ifp); - return EINVAL; // not NETMAP capable + + error = netmap_get_hw_na(ifp, &ret); + if (error) + goto out; + + /* Users cannot use the NIC attached to a bridge directly */ + if (NETMAP_OWNED_BY_KERN(ret)) { + error = EBUSY; + goto out; + } + *na = ret; + netmap_adapter_get(ret); + +pipes: + error = netmap_pipe_alloc(*na, nmr); + +out: + if (error && ret != NULL) + netmap_adapter_put(ret); + + if (ifp) + if_rele(ifp); + + return error; +} + + +/* + * validate parameters on entry for *_txsync() + * Returns ring->cur if ok, or something >= kring->nkr_num_slots + * in case of error. + * + * rhead, rcur and rtail=hwtail are stored from previous round. + * hwcur is the next packet to send to the ring. + * + * We want + * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail + * + * hwcur, rhead, rtail and hwtail are reliable + */ +u_int +nm_txsync_prologue(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int head = ring->head; /* read only once */ + u_int cur = ring->cur; /* read only once */ + u_int n = kring->nkr_num_slots; + + ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); +#if 1 /* kernel sanity checks; but we can trust the kring. */ + if (kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n) + goto error; +#endif /* kernel sanity checks */ + /* + * user sanity checks. We only use 'cur', + * A, B, ... are possible positions for cur: + * + * 0 A cur B tail C n-1 + * 0 D tail E cur F n-1 + * + * B, F, D are valid. A, C, E are wrong + */ + if (kring->rtail >= kring->rhead) { + /* want rhead <= head <= rtail */ + if (head < kring->rhead || head > kring->rtail) + goto error; + /* and also head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* here rtail < rhead */ + /* we need head outside rtail .. 
rhead */ + if (head > kring->rtail && head < kring->rhead) + goto error; + + /* two cases now: head <= rtail or head >= rhead */ + if (head <= kring->rtail) { + /* want head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* head >= rhead */ + /* cur must be outside rtail..head */ + if (cur > kring->rtail && cur < head) + goto error; + } + } + if (ring->tail != kring->rtail) { + RD(5, "tail overwritten was %d need %d", + ring->tail, kring->rtail); + ring->tail = kring->rtail; + } + kring->rhead = head; + kring->rcur = cur; + return head; + +error: + RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", + kring->name, + kring->nr_hwcur, + kring->rcur, kring->nr_hwtail, + cur, ring->tail); + return n; +} + + +/* + * validate parameters on entry for *_rxsync() + * Returns ring->head if ok, kring->nkr_num_slots on error. + * + * For a valid configuration, + * hwcur <= head <= cur <= tail <= hwtail + * + * We only consider head and cur. + * hwcur and hwtail are reliable. + * + */ +u_int +nm_rxsync_prologue(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + uint32_t const n = kring->nkr_num_slots; + uint32_t head, cur; + + ND("%s kc %d kt %d h %d c %d t %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); + /* + * Before storing the new values, we should check they do not + * move backwards. However: + * - head is not an issue because the previous value is hwcur; + * - cur could in principle go back, however it does not matter + * because we are processing a brand new rxsync() + */ + cur = kring->rcur = ring->cur; /* read only once */ + head = kring->rhead = ring->head; /* read only once */ +#if 1 /* kernel sanity checks */ + if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) + goto error; +#endif /* kernel sanity checks */ + /* user sanity checks */ + if (kring->nr_hwtail >= kring->nr_hwcur) { + /* want hwcur <= rhead <= hwtail */ + if (head < kring->nr_hwcur || head > kring->nr_hwtail) + goto error; + /* and also rhead <= rcur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* we need rhead outside hwtail..hwcur */ + if (head < kring->nr_hwcur && head > kring->nr_hwtail) + goto error; + /* two cases now: head <= hwtail or head >= hwcur */ + if (head <= kring->nr_hwtail) { + /* want head <= cur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* cur must be outside hwtail..head */ + if (cur < head && cur > kring->nr_hwtail) + goto error; + } + } + if (ring->tail != kring->rtail) { + RD(5, "%s tail overwritten was %d need %d", + kring->name, + ring->tail, kring->rtail); + ring->tail = kring->rtail; + } + return head; + +error: + RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", + kring->nr_hwcur, + kring->rcur, kring->nr_hwtail, + kring->rhead, kring->rcur, ring->tail); + return n; } /* * Error routine called when txsync/rxsync detects an error. - * Can't do much more than resetting cur = hwcur, avail = hwavail. + * Can't do much more than resetting head =cur = hwcur, tail = hwtail * Return 1 on reinit. * * This routine is only called by the upper half of the kernel. * It only reads hwcur (which is changed only by the upper half, too) - * and hwavail (which may be changed by the lower half, but only on + * and hwtail (which may be changed by the lower half, but only on * a tx ring and only to increase it, so any error will be recovered * on the next call). 
For the above, we don't strictly need to call * it under lock. @@ -1255,37 +1400,40 @@ netmap_ring_reinit(struct netmap_kring *kring) u_int i, lim = kring->nkr_num_slots - 1; int errors = 0; - RD(10, "called for %s", kring->na->ifp->if_xname); + // XXX KASSERT nm_kr_tryget + RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); + // XXX probably wrong to trust userspace + kring->rhead = ring->head; + kring->rcur = ring->cur; + kring->rtail = ring->tail; + if (ring->cur > lim) errors++; + if (ring->head > lim) + errors++; + if (ring->tail > lim) + errors++; for (i = 0; i <= lim; i++) { u_int idx = ring->slot[i].buf_idx; u_int len = ring->slot[i].len; if (idx < 2 || idx >= netmap_total_buffers) { - if (!errors++) - D("bad buffer at slot %d idx %d len %d ", i, idx, len); + RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); ring->slot[i].buf_idx = 0; ring->slot[i].len = 0; - } else if (len > NETMAP_BUF_SIZE) { + } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { ring->slot[i].len = 0; - if (!errors++) - D("bad len %d at slot %d idx %d", - len, i, idx); + RD(5, "bad len at slot %d idx %d len %d", i, idx, len); } } if (errors) { - int pos = kring - kring->na->tx_rings; - int n = kring->na->num_tx_rings + 1; - RD(10, "total %d errors", errors); - errors++; - RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", - kring->na->ifp->if_xname, - pos < n ? "TX" : "RX", pos < n ? pos : pos - n, + RD(10, "%s reinit, cur %d -> %d tail %d -> %d", + kring->name, ring->cur, kring->nr_hwcur, - ring->avail, kring->nr_hwavail); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + ring->tail, kring->nr_hwtail); + ring->head = kring->rhead = kring->nr_hwcur; + ring->cur = kring->rcur = kring->nr_hwcur; + ring->tail = kring->rtail = kring->nr_hwtail; } return (errors ? 1 : 0); } @@ -1296,338 +1444,163 @@ netmap_ring_reinit(struct netmap_kring *kring) * for all rings is the same as a single ring. 
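 *
 * Illustrative userspace sketch (not part of this patch; fd is a
 * descriptor from open("/dev/netmap", O_RDWR), error handling omitted):
 * requesting a single hardware ring pair with the new nr_flags scheme
 * decoded below.
 *
 *	struct nmreq req;
 *
 *	bzero(&req, sizeof(req));
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	req.nr_flags = NR_REG_ONE_NIC;	// just one TX/RX ring pair
 *	req.nr_ringid = 2;		// ring index, see NETMAP_RING_MASK
 *	ioctl(fd, NIOCREGIF, &req);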
*/ static int -netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); - u_int i = ringid & NETMAP_RING_MASK; - /* initially (np_qfirst == np_qlast) we don't want to lock */ - int need_lock = (priv->np_qfirst != priv->np_qlast); - int lim = na->num_rx_rings; - - if (na->num_tx_rings > lim) - lim = na->num_tx_rings; - if ( (ringid & NETMAP_HW_RING) && i >= lim) { - D("invalid ring id %d", i); - return (EINVAL); - } - if (need_lock) - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - priv->np_ringid = ringid; - if (ringid & NETMAP_SW_RING) { - priv->np_qfirst = NETMAP_SW_RING; - priv->np_qlast = 0; - } else if (ringid & NETMAP_HW_RING) { - priv->np_qfirst = i; - priv->np_qlast = i + 1; - } else { - priv->np_qfirst = 0; - priv->np_qlast = NETMAP_HW_RING ; + struct netmap_adapter *na = priv->np_na; + u_int j, i = ringid & NETMAP_RING_MASK; + u_int reg = flags & NR_REG_MASK; + + if (reg == NR_REG_DEFAULT) { + /* convert from old ringid to flags */ + if (ringid & NETMAP_SW_RING) { + reg = NR_REG_SW; + } else if (ringid & NETMAP_HW_RING) { + reg = NR_REG_ONE_NIC; + } else { + reg = NR_REG_ALL_NIC; + } + D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); + } + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: + priv->np_txqfirst = 0; + priv->np_txqlast = na->num_tx_rings; + priv->np_rxqfirst = 0; + priv->np_rxqlast = na->num_rx_rings; + ND("%s %d %d", "ALL/PIPE", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } + priv->np_txqfirst = (reg == NR_REG_SW ? + na->num_tx_rings : 0); + priv->np_txqlast = na->num_tx_rings + 1; + priv->np_rxqfirst = (reg == NR_REG_SW ? + na->num_rx_rings : 0); + priv->np_rxqlast = na->num_rx_rings + 1; + ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } + /* if not enough rings, use the first one */ + j = i; + if (j >= na->num_tx_rings) + j = 0; + priv->np_txqfirst = j; + priv->np_txqlast = j + 1; + j = i; + if (j >= na->num_rx_rings) + j = 0; + priv->np_rxqfirst = j; + priv->np_rxqlast = j + 1; + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; - if (need_lock) - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); - if (netmap_verbose) { - if (ringid & NETMAP_SW_RING) - D("ringid %s set to SW RING", ifp->if_xname); - else if (ringid & NETMAP_HW_RING) - D("ringid %s set to HW RING %d", ifp->if_xname, - priv->np_qfirst); - else - D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); - } + priv->np_flags = (flags & ~NR_REG_MASK) | reg; + if (nm_tx_si_user(priv)) + na->tx_si_users++; + if (nm_rx_si_user(priv)) + na->rx_si_users++; + if (netmap_verbose) { + D("%s: tx [%d,%d) rx [%d,%d) id %d", + NM_IFPNAME(na->ifp), + priv->np_txqfirst, + priv->np_txqlast, + priv->np_rxqfirst, + priv->np_rxqlast, + i); + } return 0; } - /* * possibly move the interface to netmap-mode. * If success it returns a pointer to netmap_if, otherwise NULL. - * This must be called with NMA_LOCK held. + * This must be called with NMG_LOCK held. 
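 *
 * A rough sketch of the sequence performed below, under NMG_LOCK
 * (names as in this file):
 *
 *	netmap_update_config(na);	// refresh ring counts and sizes
 *	netmap_set_ringid(priv, ...);	// compute the tx/rx ring intervals
 *	nifp = netmap_if_new(...);	// krings, shared rings, netmap_if
 *	na->nm_register(na, 1);		// flip the adapter into netmap mode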
*/ -static struct netmap_if * -netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, - uint16_t ringid, int *err) +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, + uint16_t ringid, uint32_t flags, int *err) { - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_if *nifp = NULL; - int i, error; - - if (na->na_bdg) - BDG_WLOCK(na->na_bdg); - na->nm_lock(ifp, NETMAP_REG_LOCK, 0); + int error, need_mem = 0; + NMG_LOCK_ASSERT(); /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); - priv->np_ifp = ifp; /* store the reference */ - error = netmap_set_ringid(priv, ringid); + priv->np_na = na; /* store the reference */ + error = netmap_set_ringid(priv, ringid, flags); if (error) goto out; - nifp = netmap_if_new(ifp->if_xname, na); + /* ensure allocators are ready */ + need_mem = !netmap_have_memory_locked(priv); + if (need_mem) { + error = netmap_get_memory_locked(priv); + ND("get_memory returned %d", error); + if (error) + goto out; + } + nifp = netmap_if_new(NM_IFPNAME(ifp), na); if (nifp == NULL) { /* allocation failed */ + /* we should drop the allocator, but only + * if we were the ones who grabbed it + */ error = ENOMEM; - } else if (ifp->if_capenable & IFCAP_NETMAP) { + goto out; + } + na->active_fds++; + if (ifp->if_capenable & IFCAP_NETMAP) { /* was already set */ } else { /* Otherwise set the card in netmap mode * and make it use the shared buffers. + * + * do not core lock because the race is harmless here, + * there cannot be any traffic to netmap_transmit() */ - for (i = 0 ; i < na->num_tx_rings + 1; i++) - mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", - MTX_NETWORK_LOCK, MTX_DEF); - for (i = 0 ; i < na->num_rx_rings + 1; i++) { - mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", - MTX_NETWORK_LOCK, MTX_DEF); - } - if (nma_is_hw(na)) { - SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; - SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; - } - error = na->nm_register(ifp, 1); /* mode on */ -#ifdef NM_BRIDGE - if (!error) - error = nm_alloc_bdgfwd(na); -#endif /* NM_BRIDGE */ + na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; + ND("%p->na_lut == %p", na, na->na_lut); + na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; + error = na->nm_register(na, 1); /* mode on */ if (error) { - netmap_dtor_locked(priv); - /* nifp is not yet in priv, so free it separately */ - netmap_if_free(nifp); + netmap_do_unregif(priv, nifp); nifp = NULL; } - } out: *err = error; - na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if (na->na_bdg) - BDG_WUNLOCK(na->na_bdg); - return nifp; -} - - -/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ -static int -kern_netmap_regif(struct nmreq *nmr) -{ - struct ifnet *ifp; - struct netmap_if *nifp; - struct netmap_priv_d *npriv; - int error; - - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); - if (npriv == NULL) - return ENOMEM; - error = netmap_get_memory(npriv); if (error) { -free_exit: - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - return error; + priv->np_na = NULL; + if (need_mem) + netmap_drop_memory_locked(priv); } - - NMA_LOCK(); - error = get_ifp(nmr, &ifp); - if (error) { /* no device, or another bridge or user owns the device */ - NMA_UNLOCK(); - goto free_exit; - } else if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. 
-		 * perhaps specified no bridge's prefix or wrong NIC's name
+	if (nifp != NULL) {
+		/*
+		 * advertise that the interface is ready by setting np_nifp.
+		 * The barrier is needed because readers (poll and *SYNC)
+		 * check for priv->np_nifp != NULL without locking
 		 */
-		error = EINVAL;
-unref_exit:
-		nm_if_rele(ifp);
-		NMA_UNLOCK();
-		goto free_exit;
-	}
-
-	if (nmr->nr_cmd == NETMAP_BDG_DETACH) {
-		if (NA(ifp)->refcount == 0) { /* not registered */
-			error = EINVAL;
-			goto unref_exit;
-		}
-		NMA_UNLOCK();
-
-		netmap_dtor(NA(ifp)->na_kpriv); /* unregister */
-		NA(ifp)->na_kpriv = NULL;
-		nm_if_rele(ifp); /* detach from the bridge */
-		goto free_exit;
-	} else if (NA(ifp)->refcount > 0) { /* already registered */
-		error = EINVAL;
-		goto unref_exit;
-	}
-
-	nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
-	if (!nifp)
-		goto unref_exit;
-	wmb(); // XXX do we need it ?
-	npriv->np_nifp = nifp;
-	NA(ifp)->na_kpriv = npriv;
-	NMA_UNLOCK();
-	D("registered %s to netmap-mode", ifp->if_xname);
-	return 0;
-}
-
-
-/* CORE_LOCK is not necessary */
-static void
-netmap_swlock_wrapper(struct ifnet *dev, int what, u_int queueid)
-{
-	struct netmap_adapter *na = SWNA(dev);
-
-	switch (what) {
-	case NETMAP_TX_LOCK:
-		mtx_lock(&na->tx_rings[queueid].q_lock);
-		break;
-
-	case NETMAP_TX_UNLOCK:
-		mtx_unlock(&na->tx_rings[queueid].q_lock);
-		break;
-
-	case NETMAP_RX_LOCK:
-		mtx_lock(&na->rx_rings[queueid].q_lock);
-		break;
-
-	case NETMAP_RX_UNLOCK:
-		mtx_unlock(&na->rx_rings[queueid].q_lock);
-		break;
+		wmb(); /* make sure previous writes are visible to all CPUs */
+		priv->np_nifp = nifp;
 	}
+	return nifp;
 }
 
-/* Initialize necessary fields of sw adapter located in right after hw's
- * one.  sw adapter attaches a pair of sw rings of the netmap-mode NIC.
- * It is always activated and deactivated at the same tie with the hw's one.
- * Thus we don't need refcounting on the sw adapter.
- * Regardless of NIC's feature we use separate lock so that anybody can lock
- * me independently from the hw adapter.
- * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw - */ -static void -netmap_attach_sw(struct ifnet *ifp) -{ - struct netmap_adapter *hw_na = NA(ifp); - struct netmap_adapter *na = SWNA(ifp); - - na->ifp = ifp; - na->separate_locks = 1; - na->nm_lock = netmap_swlock_wrapper; - na->num_rx_rings = na->num_tx_rings = 1; - na->num_tx_desc = hw_na->num_tx_desc; - na->num_rx_desc = hw_na->num_rx_desc; - na->nm_txsync = netmap_bdg_to_host; -} - - -/* exported to kernel callers */ -int -netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) -{ - struct nm_bridge *b; - struct netmap_adapter *na; - struct ifnet *iter; - char *name = nmr->nr_name; - int cmd = nmr->nr_cmd, namelen = strlen(name); - int error = 0, i, j; - - switch (cmd) { - case NETMAP_BDG_ATTACH: - case NETMAP_BDG_DETACH: - error = kern_netmap_regif(nmr); - break; - - case NETMAP_BDG_LIST: - /* this is used to enumerate bridges and ports */ - if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { - error = EINVAL; - break; - } - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = ENOENT; - break; - } - - BDG_RLOCK(b); - error = ENOENT; - for (i = 0; i < NM_BDG_MAXPORTS; i++) { - na = BDG_GET_VAR(b->bdg_ports[i]); - if (na == NULL) - continue; - iter = na->ifp; - /* the former and the latter identify a - * virtual port and a NIC, respectively - */ - if (!strcmp(iter->if_xname, name) || - (namelen > b->namelen && - !strcmp(iter->if_xname, - name + b->namelen + 1))) { - /* bridge index */ - nmr->nr_arg1 = b - nm_bridges; - nmr->nr_arg2 = i; /* port index */ - error = 0; - break; - } - } - BDG_RUNLOCK(b); - } else { - /* return the first non-empty entry starting from - * bridge nr_arg1 and port nr_arg2. - * - * Users can detect the end of the same bridge by - * seeing the new and old value of nr_arg1, and can - * detect the end of all the bridge by error != 0 - */ - i = nmr->nr_arg1; - j = nmr->nr_arg2; - - for (error = ENOENT; error && i < NM_BRIDGES; i++) { - b = nm_bridges + i; - BDG_RLOCK(b); - for (; j < NM_BDG_MAXPORTS; j++) { - na = BDG_GET_VAR(b->bdg_ports[j]); - if (na == NULL) - continue; - iter = na->ifp; - nmr->nr_arg1 = i; - nmr->nr_arg2 = j; - strncpy(name, iter->if_xname, IFNAMSIZ); - error = 0; - break; - } - BDG_RUNLOCK(b); - j = 0; /* following bridges scan from 0 */ - } - } - break; - - case NETMAP_BDG_LOOKUP_REG: - /* register a lookup function to the given bridge. - * nmr->nr_name may be just bridge's name (including ':' - * if it is not just NM_NAME). - */ - if (!func) { - error = EINVAL; - break; - } - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = EINVAL; - break; - } - BDG_WLOCK(b); - b->nm_bdg_lookup = func; - BDG_WUNLOCK(b); - break; - default: - D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); - error = EINVAL; - break; - } - return error; -} - /* * ioctl(2) support for the "netmap" device. @@ -1636,41 +1609,41 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) * - NIOCGINFO * - SIOCGIFADDR just for convenience * - NIOCREGIF - * - NIOCUNREGIF * - NIOCTXSYNC * - NIOCRXSYNC * * Return 0 on success, errno otherwise. 
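 *
 * Illustrative sketch (not part of this patch): after filling TX slots
 * and advancing ring->head and ring->cur, a client flushes them with
 *
 *	ioctl(fd, NIOCTXSYNC, NULL);
 *
 * which runs the nm_sync callback, guarded by nm_txsync_prologue(),
 * on every ring in [np_txqfirst, np_txqlast).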
*/ -static int +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct netmap_priv_d *priv = NULL; - struct ifnet *ifp; + struct ifnet *ifp = NULL; struct nmreq *nmr = (struct nmreq *) data; - struct netmap_adapter *na; + struct netmap_adapter *na = NULL; int error; - u_int i, lim; + u_int i, qfirst, qlast; struct netmap_if *nifp; + struct netmap_kring *krings; (void)dev; /* UNUSED */ (void)fflag; /* UNUSED */ -#ifdef linux -#define devfs_get_cdevpriv(pp) \ - ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ - (*pp ? 0 : ENOENT); }) - -/* devfs_set_cdevpriv cannot fail on linux */ -#define devfs_set_cdevpriv(p, fn) \ - ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) - - -#define devfs_clear_cdevpriv() do { \ - netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ - } while (0) -#endif /* linux */ + if (cmd == NIOCGINFO || cmd == NIOCREGIF) { + /* truncate name */ + nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; + if (nmr->nr_version != NETMAP_API) { + D("API mismatch for %s got %d need %d", + nmr->nr_name, + nmr->nr_version, NETMAP_API); + nmr->nr_version = NETMAP_API; + } + if (nmr->nr_version < NETMAP_MIN_API || + nmr->nr_version > NETMAP_MAX_API) { + return EINVAL; + } + } CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); @@ -1681,57 +1654,50 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return (error == ENOENT ? ENXIO : error); } - nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ - if (nmr->nr_version != NETMAP_API) { - D("API mismatch got %d have %d", - nmr->nr_version, NETMAP_API); - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } if (nmr->nr_cmd == NETMAP_BDG_LIST) { error = netmap_bdg_ctl(nmr, NULL); break; } - /* update configuration */ - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - break; - /* memsize is always valid */ - nmr->nr_memsize = nm_mem.nm_totalsize; - nmr->nr_offset = 0; - nmr->nr_rx_slots = nmr->nr_tx_slots = 0; - if (nmr->nr_name[0] == '\0') /* just get memory info */ - break; - /* lock because get_ifp and update_config see na->refcount */ - NMA_LOCK(); - error = get_ifp(nmr, &ifp); /* get a refcount */ - if (error) { - NMA_UNLOCK(); - break; - } - na = NA(ifp); /* retrieve netmap_adapter */ - netmap_update_config(na); - NMA_UNLOCK(); - nmr->nr_rx_rings = na->num_rx_rings; - nmr->nr_tx_rings = na->num_tx_rings; - nmr->nr_rx_slots = na->num_rx_desc; - nmr->nr_tx_slots = na->num_tx_desc; - nm_if_rele(ifp); /* return the refcount */ + + NMG_LOCK(); + do { + /* memsize is always valid */ + struct netmap_mem_d *nmd = &nm_mem; + u_int memflags; + + if (nmr->nr_name[0] != '\0') { + /* get a refcount */ + error = netmap_get_na(nmr, &na, 1 /* create */); + if (error) + break; + nmd = na->nm_mem; /* get memory allocator */ + } + + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); + if (error) + break; + if (na == NULL) /* only memory info */ + break; + nmr->nr_offset = 0; + nmr->nr_rx_slots = nmr->nr_tx_slots = 0; + netmap_update_config(na); + nmr->nr_rx_rings = na->num_rx_rings; + nmr->nr_tx_rings = na->num_tx_rings; + nmr->nr_rx_slots = na->num_rx_desc; + nmr->nr_tx_slots = na->num_tx_desc; + netmap_adapter_put(na); + } while (0); + NMG_UNLOCK(); break; case NIOCREGIF: - if (nmr->nr_version != NETMAP_API) { - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } /* possibly 
attach/detach NIC and VALE switch */ i = nmr->nr_cmd; - if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { + if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH + || i == NETMAP_BDG_VNET_HDR) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { @@ -1740,58 +1706,61 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - /* ensure allocators are ready */ - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - break; - /* protect access to priv from concurrent NIOCREGIF */ - NMA_LOCK(); - if (priv->np_ifp != NULL) { /* thread already registered */ - error = netmap_set_ringid(priv, nmr->nr_ringid); -unlock_out: - NMA_UNLOCK(); - break; - } - /* find the interface and a reference */ - error = get_ifp(nmr, &ifp); /* keep reference */ - if (error) - goto unlock_out; - else if (NETMAP_OWNED_BY_KERN(ifp)) { - nm_if_rele(ifp); - goto unlock_out; - } - nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); - if (!nifp) { /* reg. failed, release priv and ref */ - nm_if_rele(ifp); /* return the refcount */ - priv->np_ifp = NULL; - priv->np_nifp = NULL; - goto unlock_out; - } - - /* the following assignment is a commitment. - * Readers (i.e., poll and *SYNC) check for - * np_nifp != NULL without locking - */ - wmb(); /* make sure previous writes are visible to all CPUs */ - priv->np_nifp = nifp; - NMA_UNLOCK(); - - /* return the offset of the netmap_if object */ - na = NA(ifp); /* retrieve netmap adapter */ - nmr->nr_rx_rings = na->num_rx_rings; - nmr->nr_tx_rings = na->num_tx_rings; - nmr->nr_rx_slots = na->num_rx_desc; - nmr->nr_tx_slots = na->num_tx_desc; - nmr->nr_memsize = nm_mem.nm_totalsize; - nmr->nr_offset = netmap_if_offset(nifp); - break; + NMG_LOCK(); + do { + u_int memflags; - case NIOCUNREGIF: - // XXX we have no data here ? - D("deprecated, data is %p", nmr); - error = EINVAL; + if (priv->np_na != NULL) { /* thread already registered */ + error = EBUSY; + break; + } + /* find the interface and a reference */ + error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ + if (error) + break; + ifp = na->ifp; + if (NETMAP_OWNED_BY_KERN(na)) { + netmap_adapter_put(na); + error = EBUSY; + break; + } + nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); + if (!nifp) { /* reg. failed, release priv and ref */ + netmap_adapter_put(na); + priv->np_nifp = NULL; + break; + } + priv->np_td = td; // XXX kqueue, debugging only + + /* return the offset of the netmap_if object */ + nmr->nr_rx_rings = na->num_rx_rings; + nmr->nr_tx_rings = na->num_tx_rings; + nmr->nr_rx_slots = na->num_rx_desc; + nmr->nr_tx_slots = na->num_tx_desc; + error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); + if (error) { + netmap_adapter_put(na); + break; + } + if (memflags & NETMAP_MEM_PRIVATE) { + *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; + } + priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; + priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? 
+ &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; + + if (nmr->nr_arg3) { + D("requested %d extra buffers", nmr->nr_arg3); + nmr->nr_arg3 = netmap_extra_alloc(na, + &nifp->ni_bufs_head, nmr->nr_arg3); + D("got %d extra buffers", nmr->nr_arg3); + } + nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); + } while (0); + NMG_UNLOCK(); break; case NIOCTXSYNC: @@ -1804,50 +1773,66 @@ unlock_out: } rmb(); /* make sure following reads are not from cache */ + na = priv->np_na; /* we have a reference */ - ifp = priv->np_ifp; /* we have a reference */ + if (na == NULL) { + D("Internal error: nifp != NULL && na == NULL"); + error = ENXIO; + break; + } + ifp = na->ifp; if (ifp == NULL) { - D("Internal error: nifp != NULL && ifp == NULL"); + RD(1, "the ifp is gone"); error = ENXIO; break; } - na = NA(ifp); /* retrieve netmap adapter */ - if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ - if (cmd == NIOCTXSYNC) - netmap_sync_to_host(na); - else - netmap_sync_from_host(na, NULL, NULL); - break; + if (cmd == NIOCTXSYNC) { + krings = na->tx_rings; + qfirst = priv->np_txqfirst; + qlast = priv->np_txqlast; + } else { + krings = na->rx_rings; + qfirst = priv->np_rxqfirst; + qlast = priv->np_rxqlast; } - /* find the last ring to scan */ - lim = priv->np_qlast; - if (lim == NETMAP_HW_RING) - lim = (cmd == NIOCTXSYNC) ? - na->num_tx_rings : na->num_rx_rings; - for (i = priv->np_qfirst; i < lim; i++) { + for (i = qfirst; i < qlast; i++) { + struct netmap_kring *kring = krings + i; + if (nm_kr_tryget(kring)) { + error = EBUSY; + goto out; + } if (cmd == NIOCTXSYNC) { - struct netmap_kring *kring = &na->tx_rings[i]; if (netmap_verbose & NM_VERB_TXSYNC) D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(ifp, i, 1 /* do lock */); + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); + } else { + kring->nm_sync(kring, NAF_FORCE_RECLAIM); + } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(ifp, i, 1 /* do lock */); + kring->nm_sync(kring, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } + nm_kr_put(kring); } break; #ifdef __FreeBSD__ + case FIONBIO: + case FIOASYNC: + ND("FIONBIO/FIOASYNC are no-ops"); + break; + case BIOCIMMEDIATE: case BIOCGHDRCMPLT: case BIOCSHDRCMPLT: @@ -1858,14 +1843,21 @@ unlock_out: default: /* allow device-specific ioctls */ { struct socket so; + bzero(&so, sizeof(so)); - error = get_ifp(nmr, &ifp); /* keep reference */ - if (error) + NMG_LOCK(); + error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ + if (error) { + netmap_adapter_put(na); + NMG_UNLOCK(); break; + } + ifp = na->ifp; so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); - nm_if_rele(ifp); + netmap_adapter_put(na); + NMG_UNLOCK(); break; } @@ -1874,6 +1866,7 @@ unlock_out: error = EOPNOTSUPP; #endif /* linux */ } +out: CURVNET_RESTORE(); return (error); @@ -1886,7 +1879,7 @@ unlock_out: * Can be called for one or more queues. * Return true the event mask corresponding to ready events. * If there are no ready events, do a selrecord on either individual - * selfd or on the global one. + * selinfo or on the global one. * Device-dependent parts (locking and sync of tx/rx rings) * are done through callbacks. * @@ -1894,22 +1887,41 @@ unlock_out: * The first one is remapped to pwait as selrecord() uses the name as an * hidden argument. 
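[Editor's note: as an illustration of the path this comment describes, and not part of the patch, here is the shape of a typical userspace consumer. It assumes the nm_open()/nm_ring_*() helpers from the matching net/netmap_user.h (NETMAP_WITH_LIBS); "em0" is an illustrative interface name.]

	#define NETMAP_WITH_LIBS
	#include <net/netmap_user.h>
	#include <poll.h>
	#include <stdlib.h>

	int
	main(void)
	{
		struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
		struct pollfd pfd;

		if (d == NULL)
			exit(1);
		pfd.fd = d->fd;
		pfd.events = POLLIN;
		for (;;) {
			poll(&pfd, 1, 1000);	/* ends up in netmap_poll() */
			for (int i = d->first_rx_ring; i <= d->last_rx_ring; i++) {
				struct netmap_ring *ring = NETMAP_RXRING(d->nifp, i);

				while (!nm_ring_empty(ring)) {
					struct netmap_slot *slot = &ring->slot[ring->cur];
					char *buf = NETMAP_BUF(ring, slot->buf_idx);

					(void)buf;	/* consume slot->len bytes here */
					ring->head = ring->cur =
					    nm_ring_next(ring, ring->cur);
				}
			}
		}
	}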
*/ -static int +int netmap_poll(struct cdev *dev, int events, struct thread *td) { struct netmap_priv_d *priv = NULL; struct netmap_adapter *na; struct ifnet *ifp; struct netmap_kring *kring; - u_int core_lock, i, check_all, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx, host_forwarded = 0; - struct mbq q = { NULL, NULL, 0 }; - enum {NO_CL, NEED_CL, LOCKED_CL }; /* see below */ + u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; + struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ + int is_kevent = 0; + + /* + * In order to avoid nested locks, we need to "double check" + * txsync and rxsync if we decide to do a selrecord(). + * retry_tx (and retry_rx, later) prevent looping forever. + */ + int retry_tx = 1, retry_rx = 1; (void)pwait; + mbq_init(&q); - if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + /* + * XXX kevent has curthread->tp_fop == NULL, + * so devfs_get_cdevpriv() fails. We circumvent this by passing + * priv as the first argument, which is also useful to avoid + * the selrecord() which are not necessary in that case. + */ + if (devfs_get_cdevpriv((void **)&priv) != 0) { + is_kevent = 1; + if (netmap_verbose) + D("called from kevent"); + priv = (struct netmap_priv_d *)dev; + } + if (priv == NULL) return POLLERR; if (priv->np_nifp == NULL) { @@ -1918,272 +1930,286 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; - // XXX check for deleting() ? + na = priv->np_na; + ifp = na->ifp; + // check for deleted + if (ifp == NULL) { + RD(1, "the ifp is gone"); + return POLLERR; + } + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) return POLLERR; if (netmap_verbose & 0x8000) - D("device %s events 0x%x", ifp->if_xname, events); + D("device %s events 0x%x", NM_IFPNAME(ifp), events); want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - na = NA(ifp); /* retrieve netmap adapter */ - - lim_tx = na->num_tx_rings; - lim_rx = na->num_rx_rings; - /* how many queues we are scanning */ - if (priv->np_qfirst == NETMAP_SW_RING) { - if (priv->np_txpoll || want_tx) { - /* push any packets up, then we are always ready */ - netmap_sync_to_host(na); - revents |= want_tx; - } - if (want_rx) { - kring = &na->rx_rings[lim_rx]; - if (kring->ring->avail == 0) - netmap_sync_from_host(na, td, dev); - if (kring->ring->avail > 0) { - revents |= want_rx; - } - } - return (revents); - } - - /* if we are in transparent mode, check also the host rx ring */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && want_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { - if (kring->ring->avail == 0) - netmap_sync_from_host(na, td, dev); - if (kring->ring->avail > 0) - revents |= want_rx; - } /* - * check_all is set if the card has more than one queue and - * the client is polling all of them. If true, we sleep on - * the "global" selfd, otherwise we sleep on individual selfd - * (we can only sleep on one of them per direction). - * The interrupt routine in the driver should always wake on - * the individual selfd, and also on the global one if the card - * has more than one ring. - * - * If the card has only one lock, we just use that. - * If the card has separate ring locks, we just use those - * unless we are doing check_all, in which case the whole - * loop is wrapped by the global lock. 
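[Editor's note: in the new code, check_all_tx/check_all_rx replace the single check_all removed here. Each is set only when the descriptor is bound to more than one ring in that direction, so a single-ring binding sleeps on the per-ring selinfo. Sketch of such a binding, using the legacy nr_ringid encoding that this version still accepts; fd is an open /dev/netmap descriptor.]

	struct nmreq nmr;

	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
	nmr.nr_ringid = 2 | NETMAP_HW_RING;	/* bind hw ring 2 only */
	if (ioctl(fd, NIOCREGIF, &nmr) < 0)
		err(1, "NIOCREGIF");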
- * We acquire locks only when necessary: if poll is called - * when buffers are available, we can just return without locks. + * check_all_{tx|rx} are set if the card has more than one queue AND + * the file descriptor is bound to all of them. If so, we sleep on + * the "global" selinfo, otherwise we sleep on individual selinfo + * (FreeBSD only allows two selinfo's per file descriptor). + * The interrupt routine in the driver wake one or the other + * (or both) depending on which clients are active. * * rxsync() is only called if we run out of buffers on a POLLIN. * txsync() is called if we run out of buffers on POLLOUT, or * there are pending packets to send. The latter can be disabled * passing NETMAP_NO_TX_POLL in the NIOCREG call. */ - check_all = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1 || lim_rx > 1); + check_all_tx = nm_tx_si_user(priv); + check_all_rx = nm_rx_si_user(priv); /* - * core_lock indicates what to do with the core lock. - * The core lock is used when either the card has no individual - * locks, or it has individual locks but we are cheking all - * rings so we need the core lock to avoid missing wakeup events. - * - * It has three possible states: - * NO_CL we don't need to use the core lock, e.g. - * because we are protected by individual locks. - * NEED_CL we need the core lock. In this case, when we - * call the lock routine, move to LOCKED_CL - * to remember to release the lock once done. - * LOCKED_CL core lock is set, so we need to release it. - */ - core_lock = (check_all || !na->separate_locks) ? NEED_CL : NO_CL; -#ifdef NM_BRIDGE - /* the bridge uses separate locks */ - if (na->nm_register == bdg_netmap_reg) { - ND("not using core lock for %s", ifp->if_xname); - core_lock = NO_CL; - } -#endif /* NM_BRIDGE */ - if (priv->np_qlast != NETMAP_HW_RING) { - lim_tx = lim_rx = priv->np_qlast; - } - - /* - * We start with a lock free round which is good if we have - * data available. If this fails, then lock and call the sync + * We start with a lock free round which is cheap if we have + * slots available. If this fails, then lock and call the sync * routines. */ - for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { + for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { kring = &na->rx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_rx; want_rx = 0; /* also breaks the loop */ } } - for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { + for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { kring = &na->tx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_tx; want_tx = 0; /* also breaks the loop */ } } /* - * If we to push packets out (priv->np_txpoll) or want_tx is - * still set, we do need to run the txsync calls (on all rings, - * to avoid that the tx rings stall). + * If we want to push packets out (priv->np_txpoll) or + * want_tx is still set, we must issue txsync calls + * (on all rings, to avoid that the tx rings stall). + * XXX should also check cur != hwcur on the tx rings. + * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { + /* + * The first round checks if anyone is ready, if not + * do a selrecord and another round to handle races. + * want_tx goes to 0 if any space is found, and is + * used to skip rings with no pending transmissions. 
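[Editor's note: the retry_tx/retry_rx structure in the new code below is the standard fix for the lost-wakeup race: an event arriving between the first scan and selrecord() would otherwise be missed. Schematically, as illustrative pseudo-C rather than patch code:]

	retry = 1;
again:
	if (some_ring_has_room())	/* hypothetical predicate */
		return POLLOUT;
	if (retry) {
		selrecord(td, si);	/* register first ... */
		retry = 0;
		goto again;		/* ... then look exactly once more */
	}
	return 0;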
+ */ flush_tx: - for (i = priv->np_qfirst; i < lim_tx; i++) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { + int found = 0; + kring = &na->tx_rings[i]; - /* - * Skip the current ring if want_tx == 0 - * (we have already done a successful sync on - * a previous ring) AND kring->cur == kring->hwcur - * (there are no pending transmissions for this ring). - */ if (!want_tx && kring->ring->cur == kring->nr_hwcur) continue; - if (core_lock == NEED_CL) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - core_lock = LOCKED_CL; + /* only one thread does txsync */ + if (nm_kr_tryget(kring)) { + if (netmap_verbose) + RD(2, "%p lost race on txring %d, ok", + priv, i); + continue; } - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_TX_LOCK, i); - if (netmap_verbose & NM_VERB_TXSYNC) - D("send %d on %s %d", - kring->ring->cur, - ifp->if_xname, i); - if (na->nm_txsync(ifp, i, 0 /* no lock */)) + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); revents |= POLLERR; + } else { + if (kring->nm_sync(kring, 0)) + revents |= POLLERR; + } - /* Check avail/call selrecord only if called with POLLOUT */ - if (want_tx) { - if (kring->ring->avail > 0) { - /* stop at the first ring. We don't risk - * starvation. - */ - revents |= want_tx; - want_tx = 0; - } else if (!check_all) - selrecord(td, &kring->si); + /* + * If we found new slots, notify potential + * listeners on the same ring. + * Since we just did a txsync, look at the copies + * of cur,tail in the kring. + */ + found = kring->rcur != kring->rtail; + nm_kr_put(kring); + if (found) { /* notify other listeners */ + revents |= want_tx; + want_tx = 0; + na->nm_notify(na, i, NR_TX, 0); } - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_TX_UNLOCK, i); + } + if (want_tx && retry_tx && !is_kevent) { + selrecord(td, check_all_tx ? + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); + retry_tx = 0; + goto flush_tx; } } /* - * now if want_rx is still set we need to lock and rxsync. + * If want_rx is still set scan receive rings. * Do it on all rings because otherwise we starve. */ if (want_rx) { - for (i = priv->np_qfirst; i < lim_rx; i++) { + int send_down = 0; /* transparent mode */ + /* two rounds here to for race avoidance */ +do_retry_rx: + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { + int found = 0; + kring = &na->rx_rings[i]; - if (core_lock == NEED_CL) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - core_lock = LOCKED_CL; + + if (nm_kr_tryget(kring)) { + if (netmap_verbose) + RD(2, "%p lost race on rxring %d, ok", + priv, i); + continue; } - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_RX_LOCK, i); + + /* + * transparent mode support: collect packets + * from the rxring(s). 
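[Editor's note: transparent mode is driven from userspace. A client that wants a received frame to continue into the host stack marks the slot instead of consuming it; a hedged sketch, assuming the ring was opened with NR_FORWARD set (or the netmap_fwd sysctl is enabled):]

	struct netmap_slot *slot = &ring->slot[ring->cur];

	slot->flags |= NS_FORWARD;	/* hand this one to the host stack */
	ring->head = ring->cur = nm_ring_next(ring, ring->cur);
	/* the next rxsync runs netmap_grab_packets() on marked slots
	 * and netmap_send_up() injects them into the stack */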
+ * XXX NR_FORWARD should only be read on + * physical or NIC ports + */ if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { ND(10, "forwarding some buffers up %d to %d", kring->nr_hwcur, kring->ring->cur); netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(ifp, i, 0 /* no lock */)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { microtime(&kring->ring->ts); } - - if (kring->ring->avail > 0) + /* after an rxsync we can use kring->rcur, rtail */ + found = kring->rcur != kring->rtail; + nm_kr_put(kring); + if (found) { revents |= want_rx; - else if (!check_all) - selrecord(td, &kring->si); - if (na->separate_locks) - na->nm_lock(ifp, NETMAP_RX_UNLOCK, i); + retry_rx = 0; + na->nm_notify(na, i, NR_RX, 0); + } } - } - if (check_all && revents == 0) { /* signal on the global queue */ - if (want_tx) - selrecord(td, &na->tx_si); - if (want_rx) - selrecord(td, &na->rx_si); - } - - /* forward host to the netmap ring */ - kring = &na->rx_rings[lim_rx]; - if (kring->nr_hwavail > 0) - ND("host rx %d has %d packets", lim_rx, kring->nr_hwavail); - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && (netmap_fwd || kring->ring->flags & NR_FORWARD) - && kring->nr_hwavail > 0 && !host_forwarded) { - if (core_lock == NEED_CL) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - core_lock = LOCKED_CL; + + /* transparent mode XXX only during first pass ? */ + if (na->na_flags & NAF_HOST_RINGS) { + kring = &na->rx_rings[na->num_rx_rings]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } + } + + if (retry_rx && !is_kevent) + selrecord(td, check_all_rx ? + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); + if (send_down > 0 || retry_rx) { + retry_rx = 0; + if (send_down) + goto flush_tx; /* and retry_rx */ + else + goto do_retry_rx; } - netmap_sw_to_nic(na); - host_forwarded = 1; /* prevent another pass */ - want_rx = 0; - goto flush_tx; } - if (core_lock == LOCKED_CL) - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); + /* + * Transparent mode: marked bufs on rx rings between + * kring->nr_hwcur and ring->head + * are passed to the other endpoint. + * + * In this mode we also scan the sw rxring, which in + * turn passes packets up. + * + * XXX Transparent mode at the moment requires to bind all + * rings to a single file descriptor. + */ + if (q.head) - netmap_send_up(na->ifp, q.head); + netmap_send_up(na->ifp, &q); return (revents); } -/*------- driver support routines ------*/ +/*-------------------- driver support routines -------------------*/ -/* - * default lock wrapper. 
- */ -static void -netmap_lock_wrapper(struct ifnet *dev, int what, u_int queueid) +static int netmap_hw_krings_create(struct netmap_adapter *); + +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, + enum txrx tx, int flags) { - struct netmap_adapter *na = NA(dev); + struct netmap_kring *kring; - switch (what) { -#ifdef linux /* some system do not need lock on register */ - case NETMAP_REG_LOCK: - case NETMAP_REG_UNLOCK: - break; -#endif /* linux */ + if (tx == NR_TX) { + kring = na->tx_rings + n_ring; + OS_selwakeup(&kring->si, PI_NET); + if (na->tx_si_users > 0) + OS_selwakeup(&na->tx_si, PI_NET); + } else { + kring = na->rx_rings + n_ring; + OS_selwakeup(&kring->si, PI_NET); + if (na->rx_si_users > 0) + OS_selwakeup(&na->rx_si, PI_NET); + } + return 0; +} - case NETMAP_CORE_LOCK: - mtx_lock(&na->core_lock); - break; - case NETMAP_CORE_UNLOCK: - mtx_unlock(&na->core_lock); - break; +// XXX check handling of failures +int +netmap_attach_common(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; - case NETMAP_TX_LOCK: - mtx_lock(&na->tx_rings[queueid].q_lock); - break; + if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { + D("%s: invalid rings tx %d rx %d", + ifp->if_xname, na->num_tx_rings, na->num_rx_rings); + return EINVAL; + } + WNA(ifp) = na; - case NETMAP_TX_UNLOCK: - mtx_unlock(&na->tx_rings[queueid].q_lock); - break; + /* the following is only needed for na that use the host port. + * XXX do we have something similar for linux ? + */ +#ifdef __FreeBSD__ + na->if_input = ifp->if_input; /* for netmap_send_up */ +#endif /* __FreeBSD__ */ - case NETMAP_RX_LOCK: - mtx_lock(&na->rx_rings[queueid].q_lock); - break; + NETMAP_SET_CAPABLE(ifp); + if (na->nm_krings_create == NULL) { + na->nm_krings_create = netmap_hw_krings_create; + na->nm_krings_delete = netmap_hw_krings_delete; + } + if (na->nm_notify == NULL) + na->nm_notify = netmap_notify; + na->active_fds = 0; - case NETMAP_RX_UNLOCK: - mtx_unlock(&na->rx_rings[queueid].q_lock); - break; + if (na->nm_mem == NULL) + na->nm_mem = &nm_mem; + return 0; +} + + +void +netmap_detach_common(struct netmap_adapter *na) +{ + if (na->ifp) + WNA(na->ifp) = NULL; /* XXX do we need this? */ + + if (na->tx_rings) { /* XXX should not happen */ + D("freeing leftover tx_rings"); + na->nm_krings_delete(na); } + netmap_pipe_dealloc(na); + if (na->na_flags & NAF_MEM_OWNER) + netmap_mem_private_delete(na->nm_mem); + bzero(na, sizeof(*na)); + free(na, M_DEVBUF); } @@ -2194,172 +2220,191 @@ netmap_lock_wrapper(struct ifnet *dev, int what, u_int queueid) * of hardware rings): * krings 0..N-1 are for the hardware queues. * kring N is for the host stack queue - * kring N+1 is only used for the selinfo for all queues. + * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. - * - * By default the receive and transmit adapter ring counts are both initialized - * to num_queues. na->num_tx_rings can be set for cards with different tx/rx - * setups. */ int -netmap_attach(struct netmap_adapter *arg, int num_queues) +netmap_attach(struct netmap_adapter *arg) { - struct netmap_adapter *na = NULL; + struct netmap_hw_adapter *hwna = NULL; + // XXX when is arg == NULL ? struct ifnet *ifp = arg ? arg->ifp : NULL; - int len; if (arg == NULL || ifp == NULL) goto fail; - len = nma_is_vp(arg) ? 
sizeof(*na) : sizeof(*na) * 2; - na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na == NULL) + hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (hwna == NULL) + goto fail; + hwna->up = *arg; + hwna->up.na_flags |= NAF_HOST_RINGS; + if (netmap_attach_common(&hwna->up)) { + free(hwna, M_DEVBUF); goto fail; - WNA(ifp) = na; - *na = *arg; /* copy everything, trust the driver to not pass junk */ - NETMAP_SET_CAPABLE(ifp); - if (na->num_tx_rings == 0) - na->num_tx_rings = num_queues; - na->num_rx_rings = num_queues; - na->refcount = na->na_single = na->na_multi = 0; - /* Core lock initialized here, others after netmap_if_new. */ - mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); - if (na->nm_lock == NULL) { - ND("using default locks for %s", ifp->if_xname); - na->nm_lock = netmap_lock_wrapper; } + netmap_adapter_get(&hwna->up); + #ifdef linux if (ifp->netdev_ops) { - ND("netdev_ops %p", ifp->netdev_ops); /* prepare a clone of the netdev ops */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) - na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; + hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else - na->nm_ndo = *ifp->netdev_ops; + hwna->nm_ndo = *ifp->netdev_ops; #endif } - na->nm_ndo.ndo_start_xmit = linux_netmap_start; -#endif - if (!nma_is_vp(arg)) - netmap_attach_sw(ifp); - D("success for %s", ifp->if_xname); + hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; +#endif /* linux */ + + D("success for %s", NM_IFPNAME(ifp)); return 0; fail: - D("fail, arg %p ifp %p na %p", arg, ifp, na); + D("fail, arg %p ifp %p na %p", arg, ifp, hwna); netmap_detach(ifp); - return (na ? EINVAL : ENOMEM); + return (hwna ? EINVAL : ENOMEM); } -/* - * Free the allocated memory linked to the given ``netmap_adapter`` - * object. - */ void -netmap_detach(struct ifnet *ifp) +NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) { - struct netmap_adapter *na = NA(ifp); + if (!na) { + return; + } + + refcount_acquire(&na->na_refcount); +} + +/* returns 1 iff the netmap_adapter is destroyed */ +int +NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) +{ if (!na) - return; + return 1; - mtx_destroy(&na->core_lock); + if (!refcount_release(&na->na_refcount)) + return 0; - if (na->tx_rings) { /* XXX should not happen */ - D("freeing leftover tx_rings"); - free(na->tx_rings, M_DEVBUF); + if (na->nm_dtor) + na->nm_dtor(na); + + netmap_detach_common(na); + + return 1; +} + +int +netmap_hw_krings_create(struct netmap_adapter *na) +{ + int ret = netmap_krings_create(na, 0); + if (ret == 0) { + /* initialize the mbq for the sw rx ring */ + mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); + ND("initialized sw rx queue %d", na->num_rx_rings); } - bzero(na, sizeof(*na)); - WNA(ifp) = NULL; - free(na, M_DEVBUF); + return ret; } -int -nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, u_int ring_nr); -/* we don't need to lock myself */ -static int -bdg_netmap_start(struct ifnet *ifp, struct mbuf *m) +/* + * Free the allocated memory linked to the given ``netmap_adapter`` + * object. + */ +void +netmap_detach(struct ifnet *ifp) { - struct netmap_adapter *na = SWNA(ifp); - struct nm_bdg_fwd *ft = na->rx_rings[0].nkr_ft; - char *buf = NMB(&na->rx_rings[0].ring->slot[0]); - u_int len = MBUF_LEN(m); + struct netmap_adapter *na = NA(ifp); - if (!na->na_bdg) /* SWNA is not configured to be attached */ - return EBUSY; - m_copydata(m, 0, len, buf); - ft->ft_flags = 0; // XXX could be indirect ? 
- ft->ft_len = len; - ft->ft_buf = buf; - ft->ft_next = NM_BDG_BATCH; // XXX is it needed ? - nm_bdg_flush(ft, 1, na, 0); - - /* release the mbuf in either cases of success or failure. As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. - */ - m_freem(m); + if (!na) + return; - return (0); + NMG_LOCK(); + netmap_disable_all_rings(ifp); + if (!netmap_adapter_put(na)) { + /* someone is still using the adapter, + * tell them that the interface is gone + */ + na->ifp = NULL; + /* give them a chance to notice */ + netmap_enable_all_rings(ifp); + } + NMG_UNLOCK(); } /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. - * We are not locked when called. + * + * We only store packets in a bounded mbq and then copy them + * in the relevant rxsync routine. + * + * We rely on the OS to make sure that the ifp and na do not go + * away (typically the caller checks for IFF_DRV_RUNNING or the like). + * In nm_register() or whenever there is a reinitialization, + * we make sure to make the mode change visible here. */ int -netmap_start(struct ifnet *ifp, struct mbuf *m) +netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; - u_int i, len = MBUF_LEN(m); - u_int error = EBUSY, lim = kring->nkr_num_slots - 1; - struct netmap_slot *slot; - - if (netmap_verbose & NM_VERB_HOST) - D("%s packet %d len %d from the stack", ifp->if_xname, - kring->nr_hwcur + kring->nr_hwavail, len); - if (len > NETMAP_BUF_SIZE) { /* too long for us */ - D("%s from_host, drop packet size %d > %d", ifp->if_xname, - len, NETMAP_BUF_SIZE); - m_freem(m); - return EINVAL; + struct netmap_kring *kring; + u_int len = MBUF_LEN(m); + u_int error = ENOBUFS; + struct mbq *q; + int space; + + // XXX [Linux] we do not need this lock + // if we follow the down/configure/up protocol -gl + // mtx_lock(&na->core_lock); + + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { + D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); + error = ENXIO; + goto done; } - if (na->na_bdg) - return bdg_netmap_start(ifp, m); - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - if (kring->nr_hwavail >= lim) { - if (netmap_verbose) - D("stack ring %s full\n", ifp->if_xname); - goto done; /* no space */ - } - - /* compute the insert position */ - i = kring->nr_hwcur + kring->nr_hwavail; - if (i > lim) - i -= lim + 1; - slot = &kring->ring->slot[i]; - m_copydata(m, 0, len, NMB(slot)); - slot->len = len; - slot->flags = kring->nkr_slot_flags; - kring->nr_hwavail++; - if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); - selwakeuppri(&kring->si, PI_NET); - error = 0; -done: - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); + kring = &na->rx_rings[na->num_rx_rings]; + q = &kring->rx_queue; - /* release the mbuf in either cases of success or failure. As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. + // XXX reconsider long packets if we handle fragments + if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ + D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), + len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); + goto done; + } + + /* protect against rxsync_from_host(), netmap_sw_to_nic() + * and maybe other instances of netmap_transmit (the latter + * not possible on Linux). + * Also avoid overflowing the queue. 
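[Editor's note: to make the overflow test just below concrete: with nkr_num_slots = 1024, nr_hwcur = 100 and nr_hwtail = 90, space = 90 - 100 = -10, corrected to 1014 slots already held by the kring. If the mbq already holds 9 mbufs, then 1014 + 9 >= 1023 is true and the new mbuf is dropped rather than overrunning the ring.]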
*/ - m_freem(m); + mtx_lock(&q->lock); + + space = kring->nr_hwtail - kring->nr_hwcur; + if (space < 0) + space += kring->nkr_num_slots; + if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX + RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", + NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), + len, m); + } else { + mbq_enqueue(q, m); + ND(10, "%s %d bufs in queue len %d m %p", + NM_IFPNAME(ifp), mbq_len(q), len, m); + /* notify outside the lock */ + m = NULL; + error = 0; + } + mtx_unlock(&q->lock); + +done: + if (m) + m_freem(m); + /* unconditionally wake up listeners */ + na->nm_notify(na, na->num_rx_rings, NR_RX, 0); return (error); } @@ -2368,42 +2413,62 @@ done: /* * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. - * If netmap mode is not set just return NULL. + * If native netmap mode is not set just return NULL. */ struct netmap_slot * -netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, +netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur) { struct netmap_kring *kring; int new_hwofs, lim; - if (na == NULL) + if (na == NULL) { + D("NULL na, should not happen"); return NULL; /* no netmap support here */ - if (!(na->ifp->if_capenable & IFCAP_NETMAP)) + } + if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { + ND("interface not in netmap mode"); return NULL; /* nothing to reinitialize */ + } + /* XXX note- in the new scheme, we are not guaranteed to be + * under lock (e.g. when called on a device reset). + * In this case, we should set a flag and do not trust too + * much the values. In practice: TODO + * - set a RESET flag somewhere in the kring + * - do the processing in a conservative way + * - let the *sync() fixup at the end. + */ if (tx == NR_TX) { if (n >= na->num_tx_rings) return NULL; kring = na->tx_rings + n; + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; - new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; if (new_hwofs > lim) new_hwofs -= lim + 1; - /* Alwayws set the new offset value and realign the ring. */ + /* Always set the new offset value and realign the ring. */ + if (netmap_verbose) + D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", + NM_IFPNAME(na->ifp), + tx == NR_TX ? "TX" : "RX", n, + kring->nkr_hwofs, new_hwofs, + kring->nr_hwtail, + tx == NR_TX ? lim : kring->nr_hwtail); kring->nkr_hwofs = new_hwofs; - if (tx == NR_TX) - kring->nr_hwavail = kring->nkr_num_slots - 1; - ND(10, "new hwofs %d on %s %s[%d]", - kring->nkr_hwofs, na->ifp->if_xname, - tx == NR_TX ? "TX" : "RX", n); + if (tx == NR_TX) { + kring->nr_hwtail = kring->nr_hwcur + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + } #if 0 // def linux /* XXX check that the mappings are correct */ @@ -2416,912 +2481,135 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, #endif /* linux */ /* - * Wakeup on the individual and global lock + * Wakeup on the individual and global selwait * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ - selwakeuppri(&kring->si, PI_NET); - selwakeuppri(tx == NR_TX ? 
&na->tx_si : &na->rx_si, PI_NET); + na->nm_notify(na, n, tx, 0); return kring->ring->slot; } -/* returns the next position in the ring */ -static int -nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, - struct netmap_kring *kring, u_int end) -{ - struct netmap_ring *ring = kring->ring; - struct nm_bdg_fwd *ft = kring->nkr_ft; - u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; - u_int ft_i = 0; /* start from 0 */ - - for (; likely(j != end); j = unlikely(j == lim) ? 0 : j+1) { - struct netmap_slot *slot = &ring->slot[j]; - char *buf = NMB(slot); - int len = ft[ft_i].ft_len = slot->len; - - ft[ft_i].ft_flags = slot->flags; - - ND("flags is 0x%x", slot->flags); - /* this slot goes into a list so initialize the link field */ - ft[ft_i].ft_next = NM_BDG_BATCH; /* equivalent to NULL */ - if (unlikely(len < 14)) - continue; - buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? - *((void **)buf) : buf; - prefetch(buf); - if (unlikely(++ft_i == netmap_bridge)) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - } - if (ft_i) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - return j; -} - - /* - * Pass packets from nic to the bridge. Must be called with - * proper locks on the source interface. - * Note, no user process can access this NIC so we can ignore - * the info in the 'ring'. - */ -static void -netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - int j, k, lim = kring->nkr_num_slots - 1; - - /* fetch packets that have arrived */ - na->nm_rxsync(ifp, ring_nr, 0); - /* XXX we don't count reserved, but it should be 0 */ - j = kring->nr_hwcur; - k = j + kring->nr_hwavail; - if (k > lim) - k -= lim + 1; - if (k == j && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - ifp->if_xname); - return; - } - - j = nm_bdg_preflush(na, ring_nr, kring, k); - - /* we consume everything, but we cannot update kring directly - * because the nic may have destroyed the info in the NIC ring. - * So we need to call rxsync again to restore it. - */ - ring->cur = j; - ring->avail = 0; - na->nm_rxsync(ifp, ring_nr, 0); - return; -} - - -/* - * Default functions to handle rx/tx interrupts - * we have 4 cases: - * 1 ring, single lock: - * lock(core); wake(i=0); unlock(core) - * N rings, single lock: - * lock(core); wake(i); wake(N+1) unlock(core) - * 1 ring, separate locks: (i=0) - * lock(i); wake(i); unlock(i) - * N rings, separate locks: - * lock(i); wake(i); unlock(i); lock(core) wake(N+1) unlock(core) - * work_done is non-null on the RX path. + * Dispatch rx/tx interrupts to the netmap rings. * - * The 'q' argument also includes flag to tell whether the queue is - * already locked on enter, and whether it should remain locked on exit. - * This helps adapting to different defaults in drivers and OSes. + * "work_done" is non-null on the RX path, NULL for the TX path. + * We rely on the OS to make sure that there is only one active + * instance per queue, and that there is appropriate locking. + * + * The 'notify' routine depends on what the ring is attached to. + * - for a netmap file descriptor, do a selwakeup on the individual + * waitqueue, plus one on the global one if needed + * - for a switch, call the proper forwarding routine + * - XXX more ? 
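[Editor's note: from a NIC driver's point of view the contract is small: publish the rings once via the reworked single-argument netmap_attach(), and call netmap_rx_irq()/netmap_tx_irq() (the wrappers defined further below in this patch) from the interrupt path, skipping normal processing when they return nonzero. A sketch for a hypothetical "foo" driver; all foo_* names are illustrative, the netmap_* entry points are the ones in this patch.]

	static void
	foo_netmap_attach(struct foo_softc *sc)
	{
		struct netmap_adapter na;

		bzero(&na, sizeof(na));
		na.ifp = sc->ifp;
		na.num_tx_desc = sc->num_tx_desc;
		na.num_rx_desc = sc->num_rx_desc;
		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
		na.nm_txsync = foo_netmap_txsync;	/* driver-supplied */
		na.nm_rxsync = foo_netmap_rxsync;
		na.nm_register = foo_netmap_reg;
		netmap_attach(&na);		/* new single-argument form */
	}

	static int
	foo_rxeof(struct foo_queue *q)
	{
		u_int work_done;

		if (netmap_rx_irq(q->ifp, q->me, &work_done))
			return (FALSE);	/* netmap consumed the interrupt */
		/* ... regular mbuf receive path ... */
		return (TRUE);
	}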
*/ -int -netmap_rx_irq(struct ifnet *ifp, int q, int *work_done) +void +netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - struct netmap_adapter *na; - struct netmap_kring *r; - NM_SELINFO_T *main_wq; - int locktype, unlocktype, nic_to_bridge, lock; - - if (!(ifp->if_capenable & IFCAP_NETMAP)) - return 0; + struct netmap_adapter *na = NA(ifp); + struct netmap_kring *kring; - lock = q & (NETMAP_LOCKED_ENTER | NETMAP_LOCKED_EXIT); - q = q & NETMAP_RING_MASK; + q &= NETMAP_RING_MASK; - ND(5, "received %s queue %d", work_done ? "RX" : "TX" , q); - na = NA(ifp); - if (na->na_flags & NAF_SKIP_INTR) { - ND("use regular interrupt"); - return 0; + if (netmap_verbose) { + RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); } if (work_done) { /* RX path */ if (q >= na->num_rx_rings) - return 0; // not a physical queue - r = na->rx_rings + q; - r->nr_kflags |= NKR_PENDINTR; - main_wq = (na->num_rx_rings > 1) ? &na->rx_si : NULL; - /* set a flag if the NIC is attached to a VALE switch */ - nic_to_bridge = (na->na_bdg != NULL); - locktype = NETMAP_RX_LOCK; - unlocktype = NETMAP_RX_UNLOCK; + return; // not a physical queue + kring = na->rx_rings + q; + kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? + na->nm_notify(na, q, NR_RX, 0); + *work_done = 1; /* do not fire napi again */ } else { /* TX path */ if (q >= na->num_tx_rings) - return 0; // not a physical queue - r = na->tx_rings + q; - main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL; - work_done = &q; /* dummy */ - nic_to_bridge = 0; - locktype = NETMAP_TX_LOCK; - unlocktype = NETMAP_TX_UNLOCK; - } - if (na->separate_locks) { - if (!(lock & NETMAP_LOCKED_ENTER)) - na->nm_lock(ifp, locktype, q); - /* If a NIC is attached to a bridge, flush packets - * (and no need to wakeup anyone). Otherwise, wakeup - * possible processes waiting for packets. - */ - if (nic_to_bridge) - netmap_nic_to_bdg(ifp, q); - else - selwakeuppri(&r->si, PI_NET); - na->nm_lock(ifp, unlocktype, q); - if (main_wq && !nic_to_bridge) { - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - selwakeuppri(main_wq, PI_NET); - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); - } - /* lock the queue again if requested */ - if (lock & NETMAP_LOCKED_EXIT) - na->nm_lock(ifp, locktype, q); - } else { - if (!(lock & NETMAP_LOCKED_ENTER)) - na->nm_lock(ifp, NETMAP_CORE_LOCK, 0); - if (nic_to_bridge) - netmap_nic_to_bdg(ifp, q); - else { - selwakeuppri(&r->si, PI_NET); - if (main_wq) - selwakeuppri(main_wq, PI_NET); - } - if (!(lock & NETMAP_LOCKED_EXIT)) - na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0); - } - *work_done = 1; /* do not fire napi again */ - return 1; -} - - -#ifdef linux /* linux-specific routines */ - - -/* - * Remap linux arguments into the FreeBSD call. - * - pwait is the poll table, passed as 'dev'; - * If pwait == NULL someone else already woke up before. We can report - * events but they are filtered upstream. - * If pwait != NULL, then pwait->key contains the list of events. - * - events is computed from pwait as above. - * - file is passed as 'td'; - */ -static u_int -linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) - int events = POLLIN | POLLOUT; /* XXX maybe... */ -#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) - int events = pwait ? pwait->key : POLLIN | POLLOUT; -#else /* in 3.4.0 field 'key' was renamed to '_key' */ - int events = pwait ? 
pwait->_key : POLLIN | POLLOUT; -#endif - return netmap_poll((void *)pwait, events, (void *)file); -} - - -static int -linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) -{ - int lut_skip, i, j; - int user_skip = 0; - struct lut_entry *l_entry; - int error = 0; - unsigned long off, tomap; - /* - * vma->vm_start: start of mapping user address space - * vma->vm_end: end of the mapping user address space - * vma->vm_pfoff: offset of first page in the device - */ - - // XXX security checks - - error = netmap_get_memory(f->private_data); - ND("get_memory returned %d", error); - if (error) - return -error; - - off = vma->vm_pgoff << PAGE_SHIFT; /* offset in bytes */ - tomap = vma->vm_end - vma->vm_start; - for (i = 0; i < NETMAP_POOLS_NR; i++) { /* loop through obj_pools */ - const struct netmap_obj_pool *p = &nm_mem.pools[i]; - /* - * In each pool memory is allocated in clusters - * of size _clustsize, each containing clustentries - * entries. For each object k we already store the - * vtophys mapping in lut[k] so we use that, scanning - * the lut[] array in steps of clustentries, - * and we map each cluster (not individual pages, - * it would be overkill -- XXX slow ? 20130415). - */ - - /* - * We interpret vm_pgoff as an offset into the whole - * netmap memory, as if all clusters where contiguous. - */ - for (lut_skip = 0, j = 0; j < p->_numclusters; j++, lut_skip += p->clustentries) { - unsigned long paddr, mapsize; - if (p->_clustsize <= off) { - off -= p->_clustsize; - continue; - } - l_entry = &p->lut[lut_skip]; /* first obj in the cluster */ - paddr = l_entry->paddr + off; - mapsize = p->_clustsize - off; - off = 0; - if (mapsize > tomap) - mapsize = tomap; - ND("remap_pfn_range(%lx, %lx, %lx)", - vma->vm_start + user_skip, - paddr >> PAGE_SHIFT, mapsize); - if (remap_pfn_range(vma, vma->vm_start + user_skip, - paddr >> PAGE_SHIFT, mapsize, - vma->vm_page_prot)) - return -EAGAIN; // XXX check return value - user_skip += mapsize; - tomap -= mapsize; - if (tomap == 0) - goto done; - } + return; // not a physical queue + kring = na->tx_rings + q; + na->nm_notify(na, q, NR_TX, 0); } -done: - - return 0; -} - - -static netdev_tx_t -linux_netmap_start(struct sk_buff *skb, struct net_device *dev) -{ - netmap_start(dev, skb); - return (NETDEV_TX_OK); } -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) // XXX was 38 -#define LIN_IOCTL_NAME .ioctl -int -linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) -#else -#define LIN_IOCTL_NAME .unlocked_ioctl -long -linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) -#endif -{ - int ret; - struct nmreq nmr; - bzero(&nmr, sizeof(nmr)); - - if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) - return -EFAULT; - ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); - if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) - return -EFAULT; - return -ret; -} - - -static int -netmap_release(struct inode *inode, struct file *file) -{ - (void)inode; /* UNUSED */ - if (file->private_data) - netmap_dtor(file->private_data); - return (0); -} - - -static int -linux_netmap_open(struct inode *inode, struct file *file) -{ - struct netmap_priv_d *priv; - (void)inode; /* UNUSED */ - - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return -ENOMEM; - - file->private_data = priv; - - return (0); -} - - -static struct file_operations netmap_fops = { - .owner = THIS_MODULE, - .open = linux_netmap_open, - .mmap = 
linux_netmap_mmap, - LIN_IOCTL_NAME = linux_netmap_ioctl, - .poll = linux_netmap_poll, - .release = netmap_release, -}; - - -static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ - MISC_DYNAMIC_MINOR, - "netmap", - &netmap_fops, -}; - -static int netmap_init(void); -static void netmap_fini(void); - - -/* Errors have negative values on linux */ -static int linux_netmap_init(void) -{ - return -netmap_init(); -} - -module_init(linux_netmap_init); -module_exit(netmap_fini); -/* export certain symbols to other modules */ -EXPORT_SYMBOL(netmap_attach); // driver attach routines -EXPORT_SYMBOL(netmap_detach); // driver detach routines -EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error -EXPORT_SYMBOL(netmap_buffer_lut); -EXPORT_SYMBOL(netmap_total_buffers); // index check -EXPORT_SYMBOL(netmap_buffer_base); -EXPORT_SYMBOL(netmap_reset); // ring init routines -EXPORT_SYMBOL(netmap_buf_size); -EXPORT_SYMBOL(netmap_rx_irq); // default irq handler -EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away -EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine -EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function - - -MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); -MODULE_DESCRIPTION("The netmap packet I/O framework"); -MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ - -#else /* __FreeBSD__ */ - - -static struct cdevsw netmap_cdevsw = { - .d_version = D_VERSION, - .d_name = "netmap", - .d_open = netmap_open, - .d_mmap = netmap_mmap, - .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, - .d_close = netmap_close, -}; -#endif /* __FreeBSD__ */ - -#ifdef NM_BRIDGE -/* - *---- support for virtual bridge ----- - */ - -/* ----- FreeBSD if_bridge hash function ------- */ - /* - * The following hash function is adapted from "Hash Functions" by Bob Jenkins - * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * Default functions to handle rx/tx interrupts from a physical device. + * "work_done" is non-null on the RX path, NULL for the TX path. * - * http://www.burtleburtle.net/bob/hash/spooky.html - */ -#define mix(a, b, c) \ -do { \ - a -= b; a -= c; a ^= (c >> 13); \ - b -= c; b -= a; b ^= (a << 8); \ - c -= a; c -= b; c ^= (b >> 13); \ - a -= b; a -= c; a ^= (c >> 12); \ - b -= c; b -= a; b ^= (a << 16); \ - c -= a; c -= b; c ^= (b >> 5); \ - a -= b; a -= c; a ^= (c >> 3); \ - b -= c; b -= a; b ^= (a << 10); \ - c -= a; c -= b; c ^= (b >> 15); \ -} while (/*CONSTCOND*/0) - -static __inline uint32_t -nm_bridge_rthash(const uint8_t *addr) -{ - uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key - - b += addr[5] << 8; - b += addr[4]; - a += addr[3] << 24; - a += addr[2] << 16; - a += addr[1] << 8; - a += addr[0]; - - mix(a, b, c); -#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) - return (c & BRIDGE_RTHASH_MASK); -} - -#undef mix - - -static int -bdg_netmap_reg(struct ifnet *ifp, int onoff) -{ - // struct nm_bridge *b = NA(ifp)->na_bdg; - - /* the interface is already attached to the bridge, - * so we only need to toggle IFCAP_NETMAP. - * Locking is not necessary (we are already under - * NMA_LOCK, and the port is not in use during this call). - */ - /* BDG_WLOCK(b); */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - } else { - ifp->if_capenable &= ~IFCAP_NETMAP; - } - /* BDG_WUNLOCK(b); */ - return 0; -} - - -/* - * Lookup function for a learning bridge. 
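[Editor's note: the learning-bridge and VALE forwarding code removed from here down now lives outside netmap.c (netmap_vale.c and friends); only the control entry points handled in netmap_ioctl() above remain. Attaching a NIC to a switch is still an NIOCREGIF with nr_cmd set, e.g. from userspace ("vale0:em0" is an illustrative name, fd an open /dev/netmap):]

	struct nmreq nmr;

	memset(&nmr, 0, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
	nmr.nr_cmd = NETMAP_BDG_ATTACH;
	if (ioctl(fd, NIOCREGIF, &nmr) < 0)
		err(1, "NETMAP_BDG_ATTACH");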
- * Update the hash table with the source address, - * and then returns the destination port index, and the - * ring in *dst_ring (at the moment, always use ring 0) - */ -u_int -netmap_bdg_learning(char *buf, u_int len, uint8_t *dst_ring, - struct netmap_adapter *na) -{ - struct nm_hash_ent *ht = na->na_bdg->ht; - uint32_t sh, dh; - u_int dst, mysrc = na->bdg_port; - uint64_t smac, dmac; - - dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; - smac = le64toh(*(uint64_t *)(buf + 4)); - smac >>= 16; - - /* - * The hash is somewhat expensive, there might be some - * worthwhile optimizations here. - */ - if ((buf[6] & 1) == 0) { /* valid src */ - uint8_t *s = buf+6; - sh = nm_bridge_rthash(buf+6); // XXX hash of source - /* update source port forwarding entry */ - ht[sh].mac = smac; /* XXX expire ? */ - ht[sh].ports = mysrc; - if (netmap_verbose) - D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", - s[0], s[1], s[2], s[3], s[4], s[5], mysrc); - } - dst = NM_BDG_BROADCAST; - if ((buf[0] & 1) == 0) { /* unicast */ - dh = nm_bridge_rthash(buf); // XXX hash of dst - if (ht[dh].mac == dmac) { /* found dst */ - dst = ht[dh].ports; - } - /* XXX otherwise return NM_BDG_UNKNOWN ? */ - } - *dst_ring = 0; - return dst; -} - - -/* - * This flush routine supports only unicast and broadcast but a large - * number of ports, and lets us replace the learn and dispatch functions. + * If the card is not in netmap mode, simply return 0, + * so that the caller proceeds with regular processing. + * Otherwise call netmap_common_irq() and return 1. + * + * If the card is connected to a netmap file descriptor, + * do a selwakeup on the individual queue, plus one on the global one + * if needed (multiqueue card _and_ there are multiqueue listeners), + * and return 1. + * + * Finally, if called on rx from an interface connected to a switch, + * calls the proper forwarding routine, and return 1. */ int -nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, - u_int ring_nr) +netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - struct nm_bdg_q *dst_ents, *brddst; - uint16_t num_dsts = 0, *dsts; - struct nm_bridge *b = na->na_bdg; - u_int i, me = na->bdg_port; - - dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH); - dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); - - BDG_RLOCK(b); - - /* first pass: find a destination */ - for (i = 0; likely(i < n); i++) { - uint8_t *buf = ft[i].ft_buf; - uint8_t dst_ring = ring_nr; - uint16_t dst_port, d_i; - struct nm_bdg_q *d; - - dst_port = b->nm_bdg_lookup(buf, ft[i].ft_len, &dst_ring, na); - if (dst_port == NM_BDG_NOPORT) { - continue; /* this packet is identified to be dropped */ - } else if (unlikely(dst_port > NM_BDG_MAXPORTS)) { - continue; - } else if (dst_port == NM_BDG_BROADCAST) { - dst_ring = 0; /* broadcasts always go to ring 0 */ - } else if (unlikely(dst_port == me || - !BDG_GET_VAR(b->bdg_ports[dst_port]))) { - continue; - } - - /* get a position in the scratch pad */ - d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; - d = dst_ents + d_i; - if (d->bq_head == NM_BDG_BATCH) { /* new destination */ - d->bq_head = d->bq_tail = i; - /* remember this position to be scanned later */ - if (dst_port != NM_BDG_BROADCAST) - dsts[num_dsts++] = d_i; - } else { - ft[d->bq_tail].ft_next = i; - d->bq_tail = i; - } - } - - /* if there is a broadcast, set ring 0 of all ports to be scanned - * XXX This would be optimized by recording the highest index of active - * ports. 
- */ - brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; - if (brddst->bq_head != NM_BDG_BATCH) { - for (i = 0; likely(i < NM_BDG_MAXPORTS); i++) { - uint16_t d_i = i * NM_BDG_MAXRINGS; - if (unlikely(i == me) || !BDG_GET_VAR(b->bdg_ports[i])) - continue; - else if (dst_ents[d_i].bq_head == NM_BDG_BATCH) - dsts[num_dsts++] = d_i; - } - } - - /* second pass: scan destinations (XXX will be modular somehow) */ - for (i = 0; i < num_dsts; i++) { - struct ifnet *dst_ifp; - struct netmap_adapter *dst_na; - struct netmap_kring *kring; - struct netmap_ring *ring; - u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; - int howmany, retry = netmap_txsync_retry; - struct nm_bdg_q *d; - - d_i = dsts[i]; - d = dst_ents + d_i; - dst_na = BDG_GET_VAR(b->bdg_ports[d_i/NM_BDG_MAXRINGS]); - /* protect from the lookup function returning an inactive - * destination port - */ - if (unlikely(dst_na == NULL)) - continue; - else if (dst_na->na_flags & NAF_SW_ONLY) - continue; - dst_ifp = dst_na->ifp; - /* - * The interface may be in !netmap mode in two cases: - * - when na is attached but not activated yet; - * - when na is being deactivated but is still attached. - */ - if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) - continue; - - /* there is at least one either unicast or broadcast packet */ - brd_next = brddst->bq_head; - next = d->bq_head; - - is_vp = nma_is_vp(dst_na); - dst_nr = d_i & (NM_BDG_MAXRINGS-1); - if (is_vp) { /* virtual port */ - if (dst_nr >= dst_na->num_rx_rings) - dst_nr = dst_nr % dst_na->num_rx_rings; - kring = &dst_na->rx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - dst_na->nm_lock(dst_ifp, NETMAP_RX_LOCK, dst_nr); - j = kring->nr_hwcur + kring->nr_hwavail; - if (j > lim) - j -= kring->nkr_num_slots; - howmany = lim - kring->nr_hwavail; - } else { /* hw or sw adapter */ - if (dst_nr >= dst_na->num_tx_rings) - dst_nr = dst_nr % dst_na->num_tx_rings; - kring = &dst_na->tx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - dst_na->nm_lock(dst_ifp, NETMAP_TX_LOCK, dst_nr); -retry: - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - /* see nm_bdg_flush() */ - j = kring->nr_hwcur; - howmany = kring->nr_hwavail; - } - while (howmany-- > 0) { - struct netmap_slot *slot; - struct nm_bdg_fwd *ft_p; - - /* our 'NULL' is always higher than valid indexes - * so we never dereference it if the other list - * has packets (and if both are NULL we never - * get here). - */ - if (next < brd_next) { - ft_p = ft + next; - next = ft_p->ft_next; - ND("j %d uni %d next %d %d", - j, ft_p - ft, next, brd_next); - } else { /* insert broadcast */ - ft_p = ft + brd_next; - brd_next = ft_p->ft_next; - ND("j %d brd %d next %d %d", - j, ft_p - ft, next, brd_next); - } - slot = &ring->slot[j]; - ND("send %d %d bytes at %s:%d", i, ft_p->ft_len, dst_ifp->if_xname, j); - if (ft_p->ft_flags & NS_INDIRECT) { - ND("copying from INDIRECT source"); - copyin(ft_p->ft_buf, NMB(slot), - (ft_p->ft_len + 63) & ~63); - } else { - pkt_copy(ft_p->ft_buf, NMB(slot), ft_p->ft_len); - } - slot->len = ft_p->ft_len; - j = unlikely(j == lim) ? 0: j + 1; /* XXX to be macro-ed */ - sent++; - /* are we done ? 
*/ - if (next == NM_BDG_BATCH && brd_next == NM_BDG_BATCH) - break; - } - if (netmap_verbose && (howmany < 0)) - D("rx ring full on %s", dst_ifp->if_xname); - if (is_vp) { - if (sent) { - kring->nr_hwavail += sent; - selwakeuppri(&kring->si, PI_NET); - } - dst_na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, dst_nr); - } else { - if (sent) { - ring->avail -= sent; - ring->cur = j; - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - } - /* retry to send more packets */ - if (nma_is_hw(dst_na) && howmany < 0 && retry--) - goto retry; - dst_na->nm_lock(dst_ifp, NETMAP_TX_UNLOCK, dst_nr); - } - /* NM_BDG_BATCH means 'no packet' */ - d->bq_head = d->bq_tail = NM_BDG_BATCH; /* cleanup */ - } - brddst->bq_head = brddst->bq_tail = NM_BDG_BATCH; /* cleanup */ - BDG_RUNLOCK(b); - return 0; -} - - -/* - * main dispatch routine - */ -static int -bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - int i, j, k, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); - if (do_lock) - na->nm_lock(ifp, NETMAP_TX_LOCK, ring_nr); + // XXX could we check NAF_NATIVE_ON ? + if (!(ifp->if_capenable & IFCAP_NETMAP)) + return 0; - if (netmap_bridge <= 0) { /* testing only */ - j = k; // used all - goto done; + if (NA(ifp)->na_flags & NAF_SKIP_INTR) { + ND("use regular interrupt"); + return 0; } - if (netmap_bridge > NM_BDG_BATCH) - netmap_bridge = NM_BDG_BATCH; - - j = nm_bdg_preflush(na, ring_nr, kring, k); - i = k - j; - if (i < 0) - i += kring->nkr_num_slots; - kring->nr_hwavail = kring->nkr_num_slots - 1 - i; - if (j != k) - D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); - -done: - kring->nr_hwcur = j; - ring->avail = kring->nr_hwavail; - if (do_lock) - na->nm_lock(ifp, NETMAP_TX_UNLOCK, ring_nr); - - if (netmap_verbose) - D("%s ring %d lock %d", ifp->if_xname, ring_nr, do_lock); - return 0; -} - - -static int -bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, lim = kring->nkr_num_slots - 1; - u_int k = ring->cur, resvd = ring->reserved; - int n; - - ND("%s ring %d lock %d avail %d", - ifp->if_xname, ring_nr, do_lock, kring->nr_hwavail); - - if (k > lim) - return netmap_ring_reinit(kring); - if (do_lock) - na->nm_lock(ifp, NETMAP_RX_LOCK, ring_nr); - - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - - if (j != k) { /* userspace has released some packets. */ - n = k - j; - if (n < 0) - n += kring->nkr_num_slots; - ND("userspace releases %d packets", n); - for (n = 0; likely(j != k); n++) { - struct netmap_slot *slot = &ring->slot[j]; - void *addr = NMB(slot); - - if (addr == netmap_buffer_base) { /* bad buf */ - if (do_lock) - na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr); - return netmap_ring_reinit(kring); - } - /* decrease refcount for buffer */ - - slot->flags &= ~NS_BUF_CHANGED; - j = unlikely(j == lim) ? 
0 : j + 1; - } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - - if (do_lock) - na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr); - return 0; -} - -static void -bdg_netmap_attach(struct netmap_adapter *arg) -{ - struct netmap_adapter na; - - ND("attaching virtual bridge"); - bzero(&na, sizeof(na)); - - na.ifp = arg->ifp; - na.separate_locks = 1; - na.num_tx_rings = arg->num_tx_rings; - na.num_rx_rings = arg->num_rx_rings; - na.num_tx_desc = NM_BRIDGE_RINGSIZE; - na.num_rx_desc = NM_BRIDGE_RINGSIZE; - na.nm_txsync = bdg_netmap_txsync; - na.nm_rxsync = bdg_netmap_rxsync; - na.nm_register = bdg_netmap_reg; - netmap_attach(&na, na.num_tx_rings); + netmap_common_irq(ifp, q, work_done); + return 1; } -#endif /* NM_BRIDGE */ - -static struct cdev *netmap_dev; /* /dev/netmap character device. */ - /* - * Module loader. + * Module loader and unloader * - * Create the /dev/netmap device and initialize all global - * variables. + * netmap_init() creates the /dev/netmap device and initializes + * all global variables. Returns 0 on success, errno on failure + * (but there is no chance) * - * Return 0 on success, errno on failure. + * netmap_fini() destroys everything. */ -static int -netmap_init(void) -{ - int error; - error = netmap_memory_init(); - if (error != 0) { - printf("netmap: unable to initialize the memory allocator.\n"); - return (error); - } - printf("netmap: loaded module\n"); - netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, - "netmap"); - -#ifdef NM_BRIDGE - { - int i; - mtx_init(&netmap_bridge_mutex, "netmap_bridge_mutex", - MTX_NETWORK_LOCK, MTX_DEF); - bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ - for (i = 0; i < NM_BRIDGES; i++) - rw_init(&nm_bridges[i].bdg_lock, "bdg lock"); - } -#endif - return (error); -} +static struct cdev *netmap_dev; /* /dev/netmap character device. */ +extern struct cdevsw netmap_cdevsw; -/* - * Module unloader. - * - * Free all the memory, and destroy the ``/dev/netmap`` device. - */ -static void +void netmap_fini(void) { - destroy_dev(netmap_dev); - netmap_memory_fini(); + // XXX destroy_bridges() ? + if (netmap_dev) + destroy_dev(netmap_dev); + netmap_mem_fini(); + NMG_LOCK_DESTROY(); printf("netmap: unloaded module.\n"); } -#ifdef __FreeBSD__ -/* - * Kernel entry point. - * - * Initialize/finalize the module and return. - * - * Return 0 on success, errno on failure. - */ -static int -netmap_loader(__unused struct module *module, int event, __unused void *arg) +int +netmap_init(void) { - int error = 0; - - switch (event) { - case MOD_LOAD: - error = netmap_init(); - break; + int error; - case MOD_UNLOAD: - netmap_fini(); - break; + NMG_LOCK_INIT(); - default: - error = EOPNOTSUPP; - break; - } + error = netmap_mem_init(); + if (error != 0) + goto fail; + /* XXX could use make_dev_credv() to get error number */ + netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, + "netmap"); + if (!netmap_dev) + goto fail; - return (error); + netmap_init_bridges(); + printf("netmap: loaded module\n"); + return (0); +fail: + netmap_fini(); + return (EINVAL); /* may be incorrect */ } - - -DEV_MODULE(netmap, netmap_loader, NULL); -#endif /* __FreeBSD__ */ diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index e246e14..ddcb0e3 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. 
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -33,28 +34,68 @@ #ifndef _NET_NETMAP_KERN_H_ #define _NET_NETMAP_KERN_H_ +#define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES + #if defined(__FreeBSD__) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect((long)!!(x), 1L) +#define unlikely(x) __builtin_expect((long)!!(x), 0L) #define NM_LOCK_T struct mtx -#define NM_RWLOCK_T struct rwlock +#define NMG_LOCK_T struct mtx +#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, \ + "netmap global lock", NULL, MTX_DEF) +#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) +#define NMG_LOCK() mtx_lock(&netmap_global_lock) +#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) +#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) + #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) +#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) +#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) + +#define NM_ATOMIC_T volatile int // XXX ? +/* atomic operations */ +#include <machine/atomic.h> +#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) +#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) + + +MALLOC_DECLARE(M_NETMAP); + +// XXX linux struct, not used in FreeBSD +struct net_device_ops { +}; +struct hrtimer { +}; #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h -#define NM_RWLOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) -#define NM_SEND_UP(ifp, m) netif_rx(m) +#define MBUF_IFP(m) ((m)->dev) +#define NM_SEND_UP(ifp, m) \ + do { \ + m->priority = NM_MAGIC_PRIORITY; \ + netif_rx(m); \ + } while (0) + +#define NM_ATOMIC_T volatile long unsigned int + +// XXX a mtx would suffice here too 20130404 gl +#define NMG_LOCK_T struct semaphore +#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) +#define NMG_LOCK_DESTROY() +#define NMG_LOCK() down(&netmap_global_lock) +#define NMG_UNLOCK() up(&netmap_global_lock) +#define NMG_LOCK_ASSERT() // XXX to be completed #ifndef DEV_NETMAP #define DEV_NETMAP -#endif +#endif /* DEV_NETMAP */ /* * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable). @@ -89,9 +130,9 @@ do { \ struct timeval __xxts; \ microtime(&__xxts); \ - printf("%03d.%06d %s [%d] " format "\n", \ + printf("%03d.%06d [%4d] %-25s " format "\n", \ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* rate limited, lps indicates how many per second */ @@ -111,76 +152,254 @@ struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; +const char *nm_dump_buf(char *p, int len, int lim, char *dst); + +#include "netmap_mbq.h" + +extern NMG_LOCK_T netmap_global_lock; + /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. * * nr_hwcur index of the next buffer to refill. - * It corresponds to ring->cur - ring->reserved + * It corresponds to ring->head + * at the time the system call returns. * - * nr_hwavail the number of slots "owned" by userspace. 
- *	nr_hwavail =:= ring->avail + ring->reserved
+ * nr_hwtail	index of the first buffer owned by the kernel.
+ *		On RX, hwcur->hwtail are receive buffers
+ *		not yet released. hwcur is advanced following
+ *		ring->head, hwtail is advanced on incoming packets,
+ *		and a wakeup is generated when hwtail passes ring->cur
+ *		On TX, hwcur->rcur have been filled by the sender
+ *		but not sent yet to the NIC; rcur->hwtail are available
+ *		for new transmissions, and hwtail->hwcur-1 are pending
+ *		transmissions not yet acknowledged.
 *
 * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots.
 * This is so that, on a reset, buffers owned by userspace are not
 * modified by the kernel. In particular:
- * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with
+ * RX rings: the next empty buffer (hwtail + hwofs) coincides with
 *	the next empty buffer as known by the hardware (next_to_check or so).
 * TX rings: hwcur + hwofs coincides with next_to_send
 *
+ * Clients cannot issue concurrent syscalls on a ring. The system
+ * detects this and reports an error using two flags,
+ * NKR_WBUSY and NKR_RBUSY.
+ *
 * For received packets, slot->flags is set to nkr_slot_flags
 * so we can provide a proper initial value (e.g. set NS_FORWARD
 * when operating in 'transparent' mode).
+ *
+ * The following fields are used to implement lock-free copy of packets
+ * from input to output ports in VALE switch:
+ *	nkr_hwlease	buffer after the last one being copied.
+ *			A writer in nm_bdg_flush reserves N buffers
+ *			from nr_hwlease, advances it, then does the
+ *			copy outside the lock.
+ *			In RX rings (used for VALE ports),
+ *			nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1
+ *			In TX rings (used for NIC or host stack ports)
+ *			nkr_hwcur <= nkr_hwlease < nkr_hwtail
+ *	nkr_leases	array of nkr_num_slots where writers can report
+ *			completion of their block. NR_NOSLOT (~0) indicates
+ *			that the writer has not finished yet
+ *	nkr_lease_idx	index of next free slot in nr_leases, to be assigned
+ *
+ * The kring is manipulated by txsync/rxsync and generic netmap functions.
+ *
+ * Concurrent rxsync or txsync on the same ring are prevented
+ * by nm_kr_lock(), which in turn uses nr_busy. This is all we need
+ * for NIC rings, and for TX rings attached to the host stack.
+ *
+ * RX rings attached to the host stack use an mbq (rx_queue) on both
+ * rxsync_from_host() and netmap_transmit(). The mbq is protected
+ * by its internal lock.
+ *
+ * RX rings attached to the VALE switch are accessed by both sender
+ * and receiver. They are protected through the q_lock on the RX ring.
 */
struct netmap_kring {
-	struct netmap_ring *ring;
-	u_int nr_hwcur;
-	int nr_hwavail;
-	u_int nr_kflags;	/* private driver flags */
-#define NKR_PENDINTR	0x1	// Pending interrupt.
-	u_int nkr_num_slots;
+	struct netmap_ring *ring;
+
+	uint32_t nr_hwcur;
+	uint32_t nr_hwtail;
+
+	/*
+	 * Copies of values in user rings, so we do not need to look
+	 * at the ring (which could be modified). These are set in the
+	 * *sync_prologue()/finalize() routines.
+	 */
+	uint32_t rhead;
+	uint32_t rcur;
+	uint32_t rtail;
+
+	uint32_t nr_kflags;	/* private driver flags */
+#define NKR_PENDINTR	0x1	// Pending interrupt.
+	uint32_t nkr_num_slots;
+
+	/*
+	 * On a NIC reset, the NIC ring indexes may be reset but the
+	 * indexes in the netmap rings remain the same. nkr_hwofs
+	 * keeps track of the offset between the two.
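+	 *
+	 * As an illustrative sketch (editor's note, not new code in this
+	 * patch), the netmap_idx_n2k() helper further down implements
+	 * this mapping roughly as:
+	 *
+	 *	idx = nic_idx + kring->nkr_hwofs;
+	 *	if (idx < 0)
+	 *		idx += kring->nkr_num_slots;
+	 *	else if (idx >= kring->nkr_num_slots)
+	 *		idx -= kring->nkr_num_slots;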
+	 */
+	int32_t	nkr_hwofs;
 	uint16_t	nkr_slot_flags;	/* initial value for flags */
-	int	nkr_hwofs;	/* offset between NIC and netmap ring */
+
+	/* last_reclaim is an opaque marker to help reduce the frequency
+	 * of operations such as reclaiming tx buffers. A possible use
+	 * is to set it to ticks and do the reclaim only once per tick.
+	 */
+	uint64_t last_reclaim;
+
+
+	NM_SELINFO_T si;	/* poll/select wait queue */
+	NM_LOCK_T q_lock;	/* protects kring and ring. */
+	NM_ATOMIC_T nr_busy;	/* prevent concurrent syscalls */
+
 	struct netmap_adapter *na;
+
+	/* The following fields are for VALE switch support */
 	struct nm_bdg_fwd *nkr_ft;
-	NM_SELINFO_T si;	/* poll/select wait queue */
-	NM_LOCK_T q_lock;	/* used if no device lock available */
+	uint32_t *nkr_leases;
+#define NR_NOSLOT ((uint32_t)~0)	/* used in nkr_*lease* */
+	uint32_t nkr_hwlease;
+	uint32_t nkr_lease_idx;
+
+	volatile int nkr_stopped;	// XXX what for ?
+
+	/* Support for adapters without native netmap support.
+	 * On tx rings we preallocate an array of tx buffers
+	 * (same size as the netmap ring), on rx rings we
+	 * store incoming mbufs in a queue that is drained by
+	 * a rxsync.
+	 */
+	struct mbuf **tx_pool;
+	// u_int nr_ntc;	/* Emulation of a next-to-clean RX ring pointer. */
+	struct mbq rx_queue;	/* intercepted rx mbufs. */
+
+	uint32_t ring_id;	/* debugging */
+	char name[64];		/* diagnostic */
+
+	int (*nm_sync)(struct netmap_kring *kring, int flags);
+
+#ifdef WITH_PIPES
+	struct netmap_kring *pipe;
+	struct netmap_ring *save_ring;
+#endif /* WITH_PIPES */
+
 } __attribute__((__aligned__(64)));


+/* return the next index, with wraparound */
+static inline uint32_t
+nm_next(uint32_t i, uint32_t lim)
+{
+	return unlikely(i == lim) ? 0 : i + 1;
+}
+
+
+/* return the previous index, with wraparound */
+static inline uint32_t
+nm_prev(uint32_t i, uint32_t lim)
+{
+	return unlikely(i == 0) ? lim : i - 1;
+}
+
+
 /*
- * This struct extends the 'struct adapter' (or
- * equivalent) device descriptor. It contains all fields needed to
- * support netmap operation.
+ *
+ * Here is the layout for the Rx and Tx rings.
+
+       RxRING                            TxRING
+
+      +-----------------+            +-----------------+
+      |                 |            |                 |
+      |XXX free slot XXX|            |XXX free slot XXX|
+      +-----------------+            +-----------------+
+head->| owned by user   |<-hwcur     | not sent to nic |<-hwcur
+      |                 |            | yet             |
+      +-----------------+            |                 |
+ cur->| available to    |            |                 |
+      | user, not read  |            +-----------------+
+      | yet             |       cur->| (being          |
+      |                 |            |  prepared)      |
+      |                 |            |                 |
+      +-----------------+            +     ------      +
+tail->|                 |<-hwtail    |                 |<-hwlease
+      | (being          | ...        |                 | ...
+      |  prepared)      | ...        |                 | ...
+      +-----------------+ ...        |                 | ...
+      |                 |<-hwlease   +-----------------+
+      |                 |      tail->|                 |<-hwtail
+      |                 |            |                 |
+      |                 |            |                 |
+      |                 |            |                 |
+      +-----------------+            +-----------------+
+
+ * The cur/tail (user view) and hwcur/hwtail (kernel view)
+ * are used in the normal operation of the card.
+ *
+ * When a ring is the output of a switch port (Rx ring for
+ * a VALE port, Tx ring for the host stack or NIC), slots
+ * are reserved in blocks through 'hwlease' which points
+ * to the next unused slot.
+ * On an Rx ring, hwlease is always after hwtail,
+ * and completions cause hwtail to advance.
+ * On a Tx ring, hwlease is always between cur and hwtail,
+ * and completions cause cur to advance.
+ *
+ * nm_kr_space() returns the maximum number of slots that
+ * can be assigned.
+ * nm_kr_lease() reserves the required number of buffers,
+ *    advances nkr_hwlease and also returns an entry in
+ *    a circular array where completions should be reported.
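+ *
+ * A hedged usage sketch (editor's illustration; the actual helpers
+ * live in netmap.c and their exact signatures may differ) of the
+ * block reservation performed by a VALE sender:
+ *
+ *	if (nm_kr_space(kring, is_rx) >= n) {
+ *		lease_idx = nm_kr_lease(kring, n, is_rx);
+ *		... copy n packets into the leased slots, lock not held ...
+ *		kring->nkr_leases[lease_idx] = new_tail;  // report completion
+ *	}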
+ */ + + + +enum txrx { NR_RX = 0, NR_TX = 1 }; + +/* + * The "struct netmap_adapter" extends the "struct adapter" + * (or equivalent) device descriptor. + * It contains all base fields needed to support netmap operation. + * There are in fact different types of netmap adapters + * (native, generic, VALE switch...) so a netmap_adapter is + * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface - * is netmap-capable. So we use the following trick: + * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. */ uint32_t magic; - uint32_t na_flags; /* future place for IFCAP_NETMAP */ + uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. * useful during initialization */ #define NAF_SW_ONLY 2 /* forward packets only to sw adapter */ - int refcount; /* number of user-space descriptors using this +#define NAF_BDG_MAYSLEEP 4 /* the bridge is allowed to sleep when + * forwarding packets coming from this + * interface + */ +#define NAF_MEM_OWNER 8 /* the adapter is responsible for the + * deallocation of the memory allocator + */ +#define NAF_NATIVE_ON 16 /* the adapter is native and the attached + * interface is in netmap mode + */ +#define NAF_NETMAP_ON 32 /* netmap is active (either native or + * emulated. Where possible (e.g. FreeBSD) + * IFCAP_NETMAP also mirrors this flag. + */ +#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ + int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ - /* - * The selwakeup in the interrupt thread can use per-ring - * and/or global wait queues. We track how many clients - * of each type we have so we can optimize the drivers, - * and especially avoid huge contention on the locks. - */ - int na_single; /* threads attached to a single hw queue */ - int na_multi; /* threads attached to multiple hw queues */ - - int separate_locks; /* set if the interface suports different - locks for rx, tx and core. */ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ @@ -195,89 +414,324 @@ struct netmap_adapter { struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. */ + void *tailroom; /* space below the rings array */ + /* (used for leases) */ + + NM_SELINFO_T tx_si, rx_si; /* global wait queues */ + /* count users of the global wait queues */ + int tx_si_users, rx_si_users; + /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ int (*if_transmit)(struct ifnet *, struct mbuf *); + /* copy of if_input for netmap_send_up() */ + void (*if_input)(struct ifnet *, struct mbuf *); + /* references to the ifnet and device routines, used by * the generic netmap functions. */ struct ifnet *ifp; /* adapter is ifp->if_softc */ - NM_LOCK_T core_lock; /* used if no device lock available */ + /*---- callbacks for this netmap adapter -----*/ + /* + * nm_dtor() is the cleanup routine called when destroying + * the adapter. 
+	 *
+	 * nm_register() is called on NIOCREGIF and close() to enter
+	 * or exit netmap mode on the NIC
+	 *
+	 * nm_txsync() pushes packets to the underlying hw/switch
+	 *
+	 * nm_rxsync() collects packets from the underlying hw/switch
+	 *
+	 * nm_config() returns configuration information from the OS
+	 *
+	 * nm_krings_create() creates and inits the krings array
+	 *	(the array layout must conform to the description
+	 *	found above the definition of netmap_krings_create)
+	 *
+	 * nm_krings_delete() cleans up and deletes the kring array
+	 *
+	 * nm_notify() is used to act after data have become available.
+	 *	For hw devices this is typically a selwakeup(),
+	 *	but for NIC/host ports attached to a switch (or vice-versa)
+	 *	we also need to invoke the 'txsync' code downstream.
+	 */
+
+	/* private cleanup */
+	void (*nm_dtor)(struct netmap_adapter *);
+
+	int (*nm_register)(struct netmap_adapter *, int onoff);
 
-	int	(*nm_register)(struct ifnet *, int onoff);
-	void	(*nm_lock)(struct ifnet *, int what, u_int ringid);
-	int	(*nm_txsync)(struct ifnet *, u_int ring, int lock);
-	int	(*nm_rxsync)(struct ifnet *, u_int ring, int lock);
+	int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags);
+	int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags);
+#define NAF_FORCE_READ    1
+#define NAF_FORCE_RECLAIM 2
 	/* return configuration information */
-	int	(*nm_config)(struct ifnet *, u_int *txr, u_int *txd,
-					u_int *rxr, u_int *rxd);
+	int (*nm_config)(struct netmap_adapter *,
+		u_int *txr, u_int *txd, u_int *rxr, u_int *rxd);
+	int (*nm_krings_create)(struct netmap_adapter *);
+	void (*nm_krings_delete)(struct netmap_adapter *);
+	int (*nm_notify)(struct netmap_adapter *,
+		u_int ring, enum txrx, int flags);
+#define NAF_DISABLE_NOTIFY 8
+
+	/* standard refcount to control the lifetime of the adapter
+	 * (it should be equal to the lifetime of the corresponding ifp)
+	 */
+	int na_refcount;
+
+	/* memory allocator (opaque)
+	 * We also cache a pointer to the lut_entry for translating
+	 * buffer addresses, and the total number of buffers.
+	 */
+	struct netmap_mem_d *nm_mem;
+	struct lut_entry *na_lut;
+	uint32_t na_lut_objtotal;	/* max buffer index */
+
+	/* used internally. If non-null, the interface cannot be bound
+	 * from userspace
+	 */
+	void *na_private;
+
+#ifdef WITH_PIPES
+	struct netmap_pipe_adapter **na_pipes;
+	int na_next_pipe;
+	int na_max_pipes;
+#endif /* WITH_PIPES */
+};
+
+
+/*
+ * If the NIC is owned by the kernel
+ * (i.e., bridge), neither another bridge nor user can use it;
+ * if the NIC is owned by a user, only users can share it.
+ * Evaluation must be done under NMG_LOCK().
+ */
+#define NETMAP_OWNED_BY_KERN(na)	(na->na_private)
+#define NETMAP_OWNED_BY_ANY(na) \
+	(NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0))
+
+
+/*
+ * derived netmap adapters for various types of ports
+ */
+struct netmap_vp_adapter {	/* VALE software port */
+	struct netmap_adapter up;
 
 	/*
 	 * Bridge support:
 	 *
 	 * bdg_port is the port number used in the bridge;
-	 * na_bdg_refcount is a refcount used for bridge ports,
-	 *	when it goes to 0 we can detach+free this port
-	 *	(a bridge port is always attached if it exists;
-	 *	it is not always registered)
 	 * na_bdg points to the bridge this NA is attached to.
 	 */
 	int bdg_port;
-	int na_bdg_refcount;
 	struct nm_bridge *na_bdg;
-	/* When we attach a physical interface to the bridge, we
-	 * allow the controlling process to terminate, so we need
-	 * a place to store the netmap_priv_d data structure.
-	 * This is only done when physical interfaces are attached to a bridge.
+	int retry;
+
+	/* Offset of ethernet header for each packet. */
+	u_int virt_hdr_len;
+	/* Maximum Frame Size, used in bdg_mismatch_datapath() */
+	u_int mfs;
+};
+
+
+struct netmap_hw_adapter {	/* physical device */
+	struct netmap_adapter up;
+
+	struct net_device_ops nm_ndo;	// XXX linux only
+};
+
+/* Mitigation support. */
+struct nm_generic_mit {
+	struct hrtimer mit_timer;
+	int mit_pending;
+	struct netmap_adapter *mit_na;  /* backpointer */
+};
+
+struct netmap_generic_adapter {	/* emulated device */
+	struct netmap_hw_adapter up;
+
+	/* Pointer to a previously used netmap adapter. */
+	struct netmap_adapter *prev;
+
+	/* generic netmap adapters support:
+	 * a net_device_ops struct overrides ndo_select_queue(),
+	 * save_if_input saves the if_input hook (FreeBSD),
+	 * mit implements rx interrupt mitigation,
 	 */
-	struct netmap_priv_d *na_kpriv;
+	struct net_device_ops generic_ndo;
+	void (*save_if_input)(struct ifnet *, struct mbuf *);
+
+	struct nm_generic_mit *mit;
 #ifdef linux
-	struct net_device_ops nm_ndo;
-#endif /* linux */
+	netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
+#endif
 };
 
+static __inline int
+netmap_real_tx_rings(struct netmap_adapter *na)
+{
+	return na->num_tx_rings + !!(na->na_flags & NAF_HOST_RINGS);
+}
+
+static __inline int
+netmap_real_rx_rings(struct netmap_adapter *na)
+{
+	return na->num_rx_rings + !!(na->na_flags & NAF_HOST_RINGS);
+}
+
+#ifdef WITH_VALE
+
 /*
- * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP)
- * and refcount gives the status of the interface, namely:
+ * Bridge wrapper for non VALE ports attached to a VALE switch.
+ *
+ * The real device must already have its own netmap adapter (hwna).
+ * The bridge wrapper and the hwna adapter share the same set of
+ * netmap rings and buffers, but they have two separate sets of
+ * krings descriptors, with tx/rx meanings swapped:
+ *
+ *                                  netmap
+ *           bwrap     krings       rings      krings      hwna
+ *         +------+   +------+     +-----+    +------+   +------+
+ *         |tx_rings->|      |\   /|     |----|      |<-tx_rings|
+ *         |      |   +------+ \ / +-----+    +------+   |      |
+ *         |      |             X                        |      |
+ *         |      |            / \                       |      |
+ *         |      |   +------+/   \+-----+    +------+   |      |
+ *         |rx_rings->|      |     |     |----|      |<-rx_rings|
+ *         |      |   +------+     +-----+    +------+   |      |
+ *         +------+                                      +------+
 *
- *	enable	refcount	Status
+ * - packets coming from the bridge go to the bwrap rx rings,
+ *   which are also the hwna tx rings.  The bwrap notify callback
+ *   will then complete the hwna tx (see netmap_bwrap_notify).
+ *
+ * - packets coming from the outside go to the hwna rx rings,
+ *   which are also the bwrap tx rings.  The (overwritten) hwna
+ *   notify method will then complete the bridge tx
+ *   (see netmap_bwrap_intr_notify).
+ *
+ * The bridge wrapper may optionally connect the hwna 'host' rings
+ * to the bridge. This is done by using a second port in the
+ * bridge and connecting it to the 'host' netmap_vp_adapter
+ * contained in the netmap_bwrap_adapter. The bwrap host adapter
+ * cross-links the hwna host rings in the same way as shown above.
+ *
+ * - packets coming from the bridge and directed to the host stack
+ *   are handled by the bwrap host notify callback
+ *   (see netmap_bwrap_host_notify)
+ *
+ * - packets coming from the host stack are still handled by the
+ *   overwritten hwna notify callback (netmap_bwrap_intr_notify),
+ *   but are diverted to the host adapter depending on the ring number.
 *
- *	FALSE	0	normal operation
- *	FALSE	!= 0	-- (impossible)
- *	TRUE	1	netmap mode
- *	TRUE	0	being deleted.
*/ +struct netmap_bwrap_adapter { + struct netmap_vp_adapter up; + struct netmap_vp_adapter host; /* for host rings */ + struct netmap_adapter *hwna; /* the underlying device */ + + /* backup of the hwna notify callback */ + int (*save_notify)(struct netmap_adapter *, + u_int ring, enum txrx, int flags); + + /* + * When we attach a physical interface to the bridge, we + * allow the controlling process to terminate, so we need + * a place to store the netmap_priv_d data structure. + * This is only done when physical interfaces + * are attached to a bridge. + */ + struct netmap_priv_d *na_kpriv; +}; + + +#endif /* WITH_VALE */ + +#ifdef WITH_PIPES + +#define NM_MAXPIPES 64 /* max number of pipes per adapter */ + +struct netmap_pipe_adapter { + struct netmap_adapter up; + + u_int id; /* pipe identifier */ + int role; /* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */ + + struct netmap_adapter *parent; /* adapter that owns the memory */ + struct netmap_pipe_adapter *peer; /* the other end of the pipe */ + int peer_ref; /* 1 iff we are holding a ref to the peer */ + + u_int parent_slot; /* index in the parent pipe array */ +}; + +#endif /* WITH_PIPES */ + + +/* return slots reserved to rx clients; used in drivers */ +static inline uint32_t +nm_kr_rxspace(struct netmap_kring *k) +{ + int space = k->nr_hwtail - k->nr_hwcur; + if (space < 0) + space += k->nkr_num_slots; + ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); + + return space; +} + + +/* True if no space in the tx ring. only valid after txsync_prologue */ +static inline int +nm_kr_txempty(struct netmap_kring *kring) +{ + return kring->rcur == kring->nr_hwtail; +} -#define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \ - ( (_na)->ifp->if_capenable & IFCAP_NETMAP) ) /* - * parameters for (*nm_lock)(adapter, what, index) + * protect against multiple threads using the same ring. + * also check that the ring has not been stopped. + * We only care for 0 or !=0 as a return code. */ -enum { - NETMAP_NO_LOCK = 0, - NETMAP_CORE_LOCK, NETMAP_CORE_UNLOCK, - NETMAP_TX_LOCK, NETMAP_TX_UNLOCK, - NETMAP_RX_LOCK, NETMAP_RX_UNLOCK, -#ifdef __FreeBSD__ -#define NETMAP_REG_LOCK NETMAP_CORE_LOCK -#define NETMAP_REG_UNLOCK NETMAP_CORE_UNLOCK -#else - NETMAP_REG_LOCK, NETMAP_REG_UNLOCK -#endif -}; +#define NM_KR_BUSY 1 +#define NM_KR_STOPPED 2 + + +static __inline void nm_kr_put(struct netmap_kring *kr) +{ + NM_ATOMIC_CLEAR(&kr->nr_busy); +} + + +static __inline int nm_kr_tryget(struct netmap_kring *kr) +{ + /* check a first time without taking the lock + * to avoid starvation for nm_kr_get() + */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + return NM_KR_STOPPED; + } + if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) + return NM_KR_BUSY; + /* check a second time with lock held */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + nm_kr_put(kr); + return NM_KR_STOPPED; + } + return 0; +} -/* How to handle locking support in netmap_rx_irq/netmap_tx_irq */ -#define NETMAP_LOCKED_ENTER 0x10000000 /* already locked on enter */ -#define NETMAP_LOCKED_EXIT 0x20000000 /* keep locked on exit */ /* - * The following are support routines used by individual drivers to + * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the @@ -285,7 +739,7 @@ enum { * * netmap_detach() frees the memory allocated by netmap_attach(). 
 *
- * netmap_start() replaces the if_transmit routine of the interface,
+ * netmap_transmit() replaces the if_transmit routine of the interface,
 * and is used to intercept packets coming from the stack.
 *
 * netmap_load_map/netmap_reload_map are helper routines to set/reset
@@ -294,36 +748,252 @@ enum {
 * netmap_reset() is a helper routine to be called in the driver
 * when reinitializing a ring.
 */
-int netmap_attach(struct netmap_adapter *, int);
+int netmap_attach(struct netmap_adapter *);
+int netmap_attach_common(struct netmap_adapter *);
+void netmap_detach_common(struct netmap_adapter *na);
 void netmap_detach(struct ifnet *);
-int netmap_start(struct ifnet *, struct mbuf *);
-enum txrx { NR_RX = 0, NR_TX = 1 };
+int netmap_transmit(struct ifnet *, struct mbuf *);
 struct netmap_slot *netmap_reset(struct netmap_adapter *na,
-	enum txrx tx, int n, u_int new_cur);
+	enum txrx tx, u_int n, u_int new_cur);
 int netmap_ring_reinit(struct netmap_kring *);
 
+/* default functions to handle rx/tx interrupts */
+int netmap_rx_irq(struct ifnet *, u_int, u_int *);
+#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
+void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);
+
+void netmap_disable_all_rings(struct ifnet *);
+void netmap_enable_all_rings(struct ifnet *);
+void netmap_disable_ring(struct netmap_kring *kr);
+
+
+/* set/clear native flags and if_transmit/netdev_ops */
+static inline void
+nm_set_native_flags(struct netmap_adapter *na)
+{
+	struct ifnet *ifp = na->ifp;
+
+	na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON);
+#ifdef IFCAP_NETMAP /* or FreeBSD ? */
+	ifp->if_capenable |= IFCAP_NETMAP;
+#endif
+#ifdef __FreeBSD__
+	na->if_transmit = ifp->if_transmit;
+	ifp->if_transmit = netmap_transmit;
+#else
+	na->if_transmit = (void *)ifp->netdev_ops;
+	ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo;
+#endif
+}
+
+
+static inline void
+nm_clear_native_flags(struct netmap_adapter *na)
+{
+	struct ifnet *ifp = na->ifp;
+
+#ifdef __FreeBSD__
+	ifp->if_transmit = na->if_transmit;
+#else
+	ifp->netdev_ops = (void *)na->if_transmit;
+#endif
+	na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON);
+#ifdef IFCAP_NETMAP /* or FreeBSD ? */
+	ifp->if_capenable &= ~IFCAP_NETMAP;
+#endif
+}
+
+
+/*
+ * validates parameters in the ring/kring, returns a value for head
+ * If any error, returns ring_size to force a reinit.
+ */
+uint32_t nm_txsync_prologue(struct netmap_kring *);
+
+
+/*
+ * validates parameters in the ring/kring, returns a value for head,
+ * and the 'reserved' value in the argument.
+ * If any error, returns ring_size to force a reinit.
+ */
+uint32_t nm_rxsync_prologue(struct netmap_kring *);
+
+
+/*
+ * update kring and ring at the end of txsync.
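+ *
+ * An illustrative, hedged sketch (editor's note) of the division of
+ * labour: the core validates the user ring with nm_txsync_prologue()
+ * before invoking the driver, which consumes slots and ends with the
+ * finalize helper below:
+ *
+ *	// in the core, before calling na->nm_txsync()
+ *	if (nm_txsync_prologue(kring) >= kring->nkr_num_slots)
+ *		return netmap_ring_reinit(kring);
+ *
+ *	// in the driver txsync, roughly
+ *	for (nm_i = kring->nr_hwcur; nm_i != kring->rhead;
+ *	     nm_i = nm_next(nm_i, lim))
+ *		... queue slot nm_i on the NIC tx ring ...
+ *	kring->nr_hwcur = kring->rhead;
+ *	... reclaim completed descriptors, advance kring->nr_hwtail ...
+ *	nm_txsync_finalize(kring);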
+ */
+static inline void
+nm_txsync_finalize(struct netmap_kring *kring)
+{
+	/* update ring tail to what the kernel knows */
+	kring->ring->tail = kring->rtail = kring->nr_hwtail;
+
+	/* note, head/rhead/hwcur might be behind cur/rcur
+	 * if no carrier
+	 */
+	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
+		kring->name, kring->nr_hwcur, kring->nr_hwtail,
+		kring->rhead, kring->rcur, kring->rtail);
+}
+
+
+/*
+ * update kring and ring at the end of rxsync
+ */
+static inline void
+nm_rxsync_finalize(struct netmap_kring *kring)
+{
+	/* tell userspace that there might be new packets */
+	ND("head %d cur %d tail %d -> %d", kring->ring->head,
+		kring->ring->cur, kring->ring->tail, kring->nr_hwtail);
+	kring->ring->tail = kring->rtail = kring->nr_hwtail;
+	/* make a copy of the state for next round */
+	kring->rhead = kring->ring->head;
+	kring->rcur = kring->ring->cur;
+}
+
+
+/* check/fix address and len in tx rings */
+#if 1 /* debug version */
+#define	NM_CHECK_ADDR_LEN(_a, _l)	do {				\
+	if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) {		\
+		RD(5, "bad addr/len ring %d slot %d idx %d len %d",	\
+			ring_nr, nm_i, slot->buf_idx, len);		\
+		if (_l > NETMAP_BUF_SIZE)				\
+			_l = NETMAP_BUF_SIZE;				\
+	} } while (0)
+#else /* no debug version */
+#define	NM_CHECK_ADDR_LEN(_a, _l)	do {				\
+		if (_l > NETMAP_BUF_SIZE)				\
+			_l = NETMAP_BUF_SIZE;				\
+	} while (0)
+#endif
+
+
+/*---------------------------------------------------------------*/
+/*
+ * Support routines to be used with the VALE switch
+ */
+int netmap_update_config(struct netmap_adapter *na);
+int netmap_krings_create(struct netmap_adapter *na, u_int tailroom);
+void netmap_krings_delete(struct netmap_adapter *na);
+int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
+
+
+struct netmap_if *
+netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
+	uint16_t ringid, uint32_t flags, int *err);
+
+
+
+u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg);
+int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na);
+
+
+#ifdef WITH_VALE
 /*
- * The following bridge-related interfaces are used by other kernel modules
- * In the version that only supports unicast or broadcast, the lookup
+ * The following bridge-related functions are used by other
+ * kernel modules.
+ *
+ * VALE only supports unicast or broadcast. The lookup
 * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports,
 * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown.
 * XXX in practice "unknown" might be handled same as broadcast.
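+ *
+ * A hedged example (editor's illustration, not part of the patch) of
+ * a minimal lookup function conforming to the contract above; it
+ * floods every packet:
+ *
+ *	static u_int
+ *	my_bdg_lookup(char *buf, u_int len, uint8_t *ring_nr,
+ *	    struct netmap_vp_adapter *vpna)
+ *	{
+ *		return NM_BDG_BROADCAST;	// send to all ports
+ *	}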
 */
-typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, uint8_t *ring_nr,
-		struct netmap_adapter *);
-int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
-u_int netmap_bdg_learning(char *, u_int, uint8_t *, struct netmap_adapter *);
-#define	NM_NAME			"vale"	/* prefix for the bridge port name */
-#define NM_BDG_MAXPORTS		254	/* up to 32 for bitmap, 254 ok otherwise */
+typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len,
		uint8_t *ring_nr, struct netmap_vp_adapter *);
+u_int netmap_bdg_learning(char *, u_int, uint8_t *,
		struct netmap_vp_adapter *);
+
+#define	NM_BDG_MAXPORTS		254	/* up to 254 */
 #define	NM_BDG_BROADCAST	NM_BDG_MAXPORTS
 #define	NM_BDG_NOPORT		(NM_BDG_MAXPORTS+1)
 
+#define	NM_NAME			"vale"	/* prefix for bridge port name */
+
+
+/* these are redefined in case of no VALE support */
+int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+void netmap_init_bridges(void);
+int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
+
+#else /* !WITH_VALE */
+#define	netmap_get_bdg_na(_1, _2, _3)	0
+#define	netmap_init_bridges()
+#define	netmap_bdg_ctl(_1, _2)	EINVAL
+#endif /* !WITH_VALE */
+
+#ifdef WITH_PIPES
+/* max number of pipes per device */
+#define NM_MAXPIPES	64	/* XXX how many? */
+/* in case of no error, returns the actual number of pipes in nmr->nr_arg1 */
+int netmap_pipe_alloc(struct netmap_adapter *, struct nmreq *nmr);
+void netmap_pipe_dealloc(struct netmap_adapter *);
+int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
+#else /* !WITH_PIPES */
+#define NM_MAXPIPES	0
+#define netmap_pipe_alloc(_1, _2)	EOPNOTSUPP
+#define netmap_pipe_dealloc(_1)
+#define netmap_get_pipe_na(_1, _2, _3)	0
+#endif
+
+/* Various prototypes */
+int netmap_poll(struct cdev *dev, int events, struct thread *td);
+int netmap_init(void);
+void netmap_fini(void);
+int netmap_get_memory(struct netmap_priv_d *p);
+void netmap_dtor(void *data);
+int netmap_dtor_locked(struct netmap_priv_d *priv);
+
+int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td);
+
+/* netmap_adapter creation/destruction */
+#define NM_IFPNAME(ifp) ((ifp) ? 
(ifp)->if_xname : "zombie") + +// #define NM_DEBUG_PUTGET 1 + +#ifdef NM_DEBUG_PUTGET + +#define NM_DBG(f) __##f + +void __netmap_adapter_get(struct netmap_adapter *na); + +#define netmap_adapter_get(na) \ + do { \ + struct netmap_adapter *__na = na; \ + D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_get(__na); \ + } while (0) + +int __netmap_adapter_put(struct netmap_adapter *na); + +#define netmap_adapter_put(na) \ + ({ \ + struct netmap_adapter *__na = na; \ + D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_put(__na); \ + }) + +#else /* !NM_DEBUG_PUTGET */ + +#define NM_DBG(f) f +void netmap_adapter_get(struct netmap_adapter *na); +int netmap_adapter_put(struct netmap_adapter *na); + +#endif /* !NM_DEBUG_PUTGET */ + + +/* + * module variables + */ extern u_int netmap_buf_size; #define NETMAP_BUF_SIZE netmap_buf_size // XXX remove -extern int netmap_mitigate; +extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; -extern u_int netmap_total_buffers; -extern char *netmap_buffer_base; +extern u_int netmap_total_buffers; // global allocator +extern char *netmap_buffer_base; // global allocator extern int netmap_verbose; // XXX debugging enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ @@ -336,18 +1006,19 @@ enum { /* verbose flags */ NM_VERB_NIC_TXSYNC = 0x2000, }; +extern int netmap_txsync_retry; +extern int netmap_generic_mit; +extern int netmap_generic_ringsize; +extern int netmap_generic_rings; + /* * NA returns a pointer to the struct netmap adapter from the ifp, * WNA is used to write it. - * SWNA() is used for the "host stack" endpoint associated - * to an interface. It is allocated together with the main NA(), - * as an array of two objects. */ #ifndef WNA #define WNA(_ifp) (_ifp)->if_pspare[0] #endif #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) -#define SWNA(_ifp) (NA(_ifp) + 1) /* * Macros to determine if an interface is netmap capable or netmap enabled. @@ -381,7 +1052,8 @@ enum { /* verbose flags */ #endif /* linux */ #ifdef __FreeBSD__ -/* Callback invoked by the dma machinery after a successfull dmamap_load */ + +/* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { @@ -408,6 +1080,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } + #else /* linux */ /* @@ -451,6 +1124,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) #endif /* linux */ + /* * functions to map NIC to KRING indexes (n2k) and vice versa (k2n) */ @@ -514,8 +1188,193 @@ PNMB(struct netmap_slot *slot, uint64_t *pp) return ret; } -/* default functions to handle rx/tx interrupts */ -int netmap_rx_irq(struct ifnet *, int, int *); -#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +/* Generic version of NMB, which uses device-specific memory. */ +static inline void * +BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot) +{ + struct lut_entry *lut = na->na_lut; + uint32_t i = slot->buf_idx; + return (unlikely(i >= na->na_lut_objtotal)) ? + lut[0].vaddr : lut[i].vaddr; +} + + + +void netmap_txsync_to_host(struct netmap_adapter *na); + + +/* + * Structure associated to each thread which registered an interface. + * + * The first 4 fields of this structure are written by NIOCREGIF and + * read by poll() and NIOC?XSYNC. 
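+ *
+ * As an illustrative sketch (editor's note), the reader-side check
+ * described below amounts to:
+ *
+ *	struct netmap_if *nifp = priv->np_nifp;
+ *	if (nifp == NULL)
+ *		return ENXIO;	// registration not complete yet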
+ *
+ * There is low contention among writers (a correct user program
+ * should have none) and among writers and readers, so we use a
+ * single global lock to protect the structure initialization;
+ * since initialization involves the allocation of memory,
+ * we reuse the memory allocator lock.
+ *
+ * Read access to the structure is lock free. Readers must check that
+ * np_nifp is not NULL before using the other fields.
+ * If np_nifp is NULL, initialization has not been performed,
+ * so they should return an error to userspace.
+ *
+ * The np_refcount field is used to regulate access to the refcount in the
+ * memory allocator. The refcount must be incremented at most once for
+ * each open("/dev/netmap"). The increment is performed by the first
+ * function that calls netmap_get_memory() (currently called by
+ * mmap(), NIOCGINFO and NIOCREGIF).
+ * If the refcount is incremented, it is then decremented when the
+ * private structure is destroyed.
+ */
+struct netmap_priv_d {
+	struct netmap_if * volatile np_nifp;	/* netmap if descriptor. */
+
+	struct netmap_adapter	*np_na;
+	uint32_t	np_flags;	/* from the ioctl */
+	u_int		np_txqfirst, np_txqlast; /* range of tx rings to scan */
+	u_int		np_rxqfirst, np_rxqlast; /* range of rx rings to scan */
+	uint16_t	np_txpoll;	/* XXX and also np_rxpoll ? */
+
+	struct netmap_mem_d     *np_mref;	/* use with NMG_LOCK held */
+	/* np_refcount is only used on FreeBSD */
+	int		np_refcount;	/* use with NMG_LOCK held */
+
+	/* pointers to the selinfo to be used for selrecord.
+	 * Either the local or the global one depending on the
+	 * number of rings.
+	 */
+	NM_SELINFO_T *np_rxsi, *np_txsi;
+	struct thread	*np_td;		/* kqueue, just debugging */
+};
+
+
+/*
+ * generic netmap emulation for devices that do not have
+ * native netmap support.
+ */
+int generic_netmap_attach(struct ifnet *ifp);
+
+int netmap_catch_rx(struct netmap_adapter *na, int intercept);
+void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
+void netmap_catch_tx(struct netmap_generic_adapter *na, int enable);
+int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
+int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
+void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);
+
+/*
+ * netmap_mitigation API. This is used by the generic adapter
+ * to reduce the number of interrupt requests/selwakeup
+ * to clients on incoming packets.
+ */
+void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na);
+void netmap_mitigation_start(struct nm_generic_mit *mit);
+void netmap_mitigation_restart(struct nm_generic_mit *mit);
+int netmap_mitigation_active(struct nm_generic_mit *mit);
+void netmap_mitigation_cleanup(struct nm_generic_mit *mit);
+
+
+
+/* Shared declarations for the VALE switch. */
+
+/*
+ * Each transmit queue accumulates a batch of packets into
+ * a structure before forwarding. Packets to the same
+ * destination are put in a list using ft_next as a link field.
+ * ft_frags and ft_next are valid only on the first fragment.
+ */
+struct nm_bdg_fwd {	/* forwarding entry for a bridge */
+	void *ft_buf;		/* netmap or indirect buffer */
+	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
+	uint8_t _ft_port;	/* dst port (unused) */
+	uint16_t ft_flags;	/* flags, e.g. indirect */
+	uint16_t ft_len;	/* src fragment len */
+	uint16_t ft_next;	/* next packet to same destination */
+};
+
+/* struct 'virtio_net_hdr' from linux.
*/ +struct nm_vnet_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; +}; + +#define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ + +/* Private definitions for IPv4, IPv6, UDP and TCP headers. */ + +struct nm_iphdr { + uint8_t version_ihl; + uint8_t tos; + uint16_t tot_len; + uint16_t id; + uint16_t frag_off; + uint8_t ttl; + uint8_t protocol; + uint16_t check; + uint32_t saddr; + uint32_t daddr; + /*The options start here. */ +}; + +struct nm_tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + uint8_t doff; /* Data offset + Reserved */ + uint8_t flags; + uint16_t window; + uint16_t check; + uint16_t urg_ptr; +}; + +struct nm_udphdr { + uint16_t source; + uint16_t dest; + uint16_t len; + uint16_t check; +}; + +struct nm_ipv6hdr { + uint8_t priority_version; + uint8_t flow_lbl[3]; + + uint16_t payload_len; + uint8_t nexthdr; + uint8_t hop_limit; + + uint8_t saddr[16]; + uint8_t daddr[16]; +}; + +/* Type used to store a checksum (in host byte order) that hasn't been + * folded yet. + */ +#define rawsum_t uint32_t + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_csum_ipv4(struct nm_iphdr *iph); +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check); +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check); +uint16_t nm_csum_fold(rawsum_t cur_sum); + +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany); #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index dcf4b06..5491845 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -8,7 +8,7 @@ * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -23,108 +23,49 @@ * SUCH DAMAGE. */ -/* - * $FreeBSD$ - * - * (New) memory allocator for netmap - */ - -/* - * This allocator creates three memory pools: - * nm_if_pool for the struct netmap_if - * nm_ring_pool for the struct netmap_ring - * nm_buf_pool for the packet buffers. - * - * that contain netmap objects. 
Each pool is made of a number of clusters, - * multiple of a page size, each containing an integer number of objects. - * The clusters are contiguous in user space but not in the kernel. - * Only nm_buf_pool needs to be dma-able, - * but for convenience use the same type of allocator for all. - * - * Once mapped, the three pools are exported to userspace - * as a contiguous block, starting from nm_if_pool. Each - * cluster (and pool) is an integral number of pages. - * [ . . . ][ . . . . . .][ . . . . . . . . . .] - * nm_if nm_ring nm_buf - * - * The userspace areas contain offsets of the objects in userspace. - * When (at init time) we write these offsets, we find out the index - * of the object, and from there locate the offset from the beginning - * of the region. - * - * The invididual allocators manage a pool of memory for objects of - * the same size. - * The pool is split into smaller clusters, whose size is a - * multiple of the page size. The cluster size is chosen - * to minimize the waste for a given max cluster size - * (we do it by brute force, as we have relatively few objects - * per cluster). - * - * Objects are aligned to the cache line (64 bytes) rounding up object - * sizes when needed. A bitmap contains the state of each object. - * Allocation scans the bitmap; this is done only on attach, so we are not - * too worried about performance - * - * For each allocator we can define (thorugh sysctl) the size and - * number of each object. Memory is allocated at the first use of a - * netmap file descriptor, and can be freed when all such descriptors - * have been released (including unmapping the memory). - * If memory is scarce, the system tries to get as much as possible - * and the sysctl values reflect the actual allocation. - * Together with desired values, the sysctl export also absolute - * min and maximum values that cannot be overridden. - * - * struct netmap_if: - * variable size, max 16 bytes per ring pair plus some fixed amount. - * 1024 bytes should be large enough in practice. - * - * In the worst case we have one netmap_if per ring in the system. - * - * struct netmap_ring - * variable size, 8 byte per slot plus some fixed amount. - * Rings can be large (e.g. 4k slots, or >32Kbytes). - * We default to 36 KB (9 pages), and a few hundred rings. - * - * struct netmap_buffer - * The more the better, both because fast interfaces tend to have - * many slots, and because we may want to use buffers to store - * packets in userspace avoiding copies. - * Must contain a full frame (eg 1518, or more for vlans, jumbo - * frames etc.) plus be nicely aligned, plus some NICs restrict - * the size to multiple of 1K or so. 
Default to 2K - */ +#ifdef linux +#include "bsd_glue.h" +#endif /* linux */ -#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ +#ifdef __APPLE__ +#include "osx_glue.h" +#endif /* __APPLE__ */ + +#ifdef __FreeBSD__ +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <vm/vm.h> /* vtophys */ +#include <vm/pmap.h> /* vtophys */ +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/vnet.h> +#include <machine/bus.h> /* bus_dmamap_* */ + +#endif /* __FreeBSD__ */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include "netmap_mem2.h" #ifdef linux -// XXX a mtx would suffice here 20130415 lr -// #define NMA_LOCK_T safe_spinlock_t -#define NMA_LOCK_T struct semaphore -#define NMA_LOCK_INIT() sema_init(&nm_mem.nm_mtx, 1) -#define NMA_LOCK_DESTROY() -#define NMA_LOCK() down(&nm_mem.nm_mtx) -#define NMA_UNLOCK() up(&nm_mem.nm_mtx) +#define NMA_LOCK_INIT(n) sema_init(&(n)->nm_mtx, 1) +#define NMA_LOCK_DESTROY(n) +#define NMA_LOCK(n) down(&(n)->nm_mtx) +#define NMA_UNLOCK(n) up(&(n)->nm_mtx) #else /* !linux */ -#define NMA_LOCK_T struct mtx -#define NMA_LOCK_INIT() mtx_init(&nm_mem.nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF) -#define NMA_LOCK_DESTROY() mtx_destroy(&nm_mem.nm_mtx) -#define NMA_LOCK() mtx_lock(&nm_mem.nm_mtx) -#define NMA_UNLOCK() mtx_unlock(&nm_mem.nm_mtx) +#define NMA_LOCK_INIT(n) mtx_init(&(n)->nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF) +#define NMA_LOCK_DESTROY(n) mtx_destroy(&(n)->nm_mtx) +#define NMA_LOCK(n) mtx_lock(&(n)->nm_mtx) +#define NMA_UNLOCK(n) mtx_unlock(&(n)->nm_mtx) #endif /* linux */ -enum { - NETMAP_IF_POOL = 0, - NETMAP_RING_POOL, - NETMAP_BUF_POOL, - NETMAP_POOLS_NR -}; - - -struct netmap_obj_params { - u_int size; - u_int num; -}; - struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { [NETMAP_IF_POOL] = { @@ -141,48 +82,31 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; - -struct netmap_obj_pool { - char name[16]; /* name of the allocator */ - u_int objtotal; /* actual total number of objects. */ - u_int objfree; /* number of free objects. 
*/ - u_int clustentries; /* actual objects per cluster */ - - /* limits */ - u_int objminsize; /* minimum object size */ - u_int objmaxsize; /* maximum object size */ - u_int nummin; /* minimum number of objects */ - u_int nummax; /* maximum number of objects */ - - /* the total memory space is _numclusters*_clustsize */ - u_int _numclusters; /* how many clusters */ - u_int _clustsize; /* cluster size */ - u_int _objsize; /* actual object size */ - - u_int _memtotal; /* _numclusters*_clustsize */ - struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ - uint32_t *bitmap; /* one bit per buffer, 1 means free */ - uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ +struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { + [NETMAP_IF_POOL] = { + .size = 1024, + .num = 1, + }, + [NETMAP_RING_POOL] = { + .size = 5*PAGE_SIZE, + .num = 4, + }, + [NETMAP_BUF_POOL] = { + .size = 2048, + .num = 4098, + }, }; -struct netmap_mem_d { - NMA_LOCK_T nm_mtx; /* protect the allocator */ - u_int nm_totalsize; /* shorthand */ - - int finalized; /* !=0 iff preallocation done */ - int lasterr; /* last error for curr config */ - int refcount; /* existing priv structures */ - /* the three allocators */ - struct netmap_obj_pool pools[NETMAP_POOLS_NR]; -}; - /* * nm_mem is the memory allocator used for all physical interfaces * running in netmap mode. * Virtual (VALE) ports will have each its own allocator. */ -static struct netmap_mem_d nm_mem = { /* Our memory allocator. */ +static int netmap_mem_global_config(struct netmap_mem_d *nmd); +static int netmap_mem_global_finalize(struct netmap_mem_d *nmd); +static void netmap_mem_global_deref(struct netmap_mem_d *nmd); +struct netmap_mem_d nm_mem = { /* Our memory allocator. */ .pools = { [NETMAP_IF_POOL] = { .name = "netmap_if", @@ -206,62 +130,193 @@ static struct netmap_mem_d nm_mem = { /* Our memory allocator. */ .nummax = 1000000, /* one million! */ }, }, + .config = netmap_mem_global_config, + .finalize = netmap_mem_global_finalize, + .deref = netmap_mem_global_deref, + + .nm_id = 1, + + .prev = &nm_mem, + .next = &nm_mem, }; + +struct netmap_mem_d *netmap_last_mem_d = &nm_mem; + // XXX logically belongs to nm_mem struct lut_entry *netmap_buffer_lut; /* exported */ +/* blueprint for the private memory allocators */ +static int netmap_mem_private_config(struct netmap_mem_d *nmd); +static int netmap_mem_private_finalize(struct netmap_mem_d *nmd); +static void netmap_mem_private_deref(struct netmap_mem_d *nmd); +const struct netmap_mem_d nm_blueprint = { + .pools = { + [NETMAP_IF_POOL] = { + .name = "%s_if", + .objminsize = sizeof(struct netmap_if), + .objmaxsize = 4096, + .nummin = 1, + .nummax = 100, + }, + [NETMAP_RING_POOL] = { + .name = "%s_ring", + .objminsize = sizeof(struct netmap_ring), + .objmaxsize = 32*PAGE_SIZE, + .nummin = 2, + .nummax = 1024, + }, + [NETMAP_BUF_POOL] = { + .name = "%s_buf", + .objminsize = 64, + .objmaxsize = 65536, + .nummin = 4, + .nummax = 1000000, /* one million! 
*/ + }, + }, + .config = netmap_mem_private_config, + .finalize = netmap_mem_private_finalize, + .deref = netmap_mem_private_deref, + + .flags = NETMAP_MEM_PRIVATE, +}; + /* memory allocator related sysctls */ #define STRINGIFY(x) #x + #define DECLARE_SYSCTLS(id, name) \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ - CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ - CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ - CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") - + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ + CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ + CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \ + CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \ + "Default size of private netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ + CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ + "Default number of private netmap " STRINGIFY(name) "s") + +SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + nm_memid_t id; + struct netmap_mem_d *scan = netmap_last_mem_d; + int error = ENOMEM; + + NMA_LOCK(&nm_mem); + + do { + /* we rely on unsigned wrap around */ + id = scan->nm_id + 1; + if (id == 0) /* reserve 0 as error value */ + id = 1; + scan = scan->next; + if (id != scan->nm_id) { + nmd->nm_id = id; + nmd->prev = scan->prev; + nmd->next = scan; + scan->prev->next = nmd; + scan->prev = nmd; + netmap_last_mem_d = nmd; + error = 0; + break; + } + } while (scan != netmap_last_mem_d); + + NMA_UNLOCK(&nm_mem); + return error; +} + +static void +nm_mem_release_id(struct netmap_mem_d *nmd) +{ + NMA_LOCK(&nm_mem); + + nmd->prev->next = nmd->next; + nmd->next->prev = nmd->prev; + + if (netmap_last_mem_d == nmd) + netmap_last_mem_d = nmd->prev; + + nmd->prev = nmd->next = NULL; + + NMA_UNLOCK(&nm_mem); +} + + /* - * Convert a userspace offset to a physical address. - * XXX only called in the FreeBSD's netmap_mmap() - * because in linux we map everything at once. - * * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. 
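+ *
+ * An illustrative note (editor's addition, assuming 2 KB buffer
+ * objects): for an offset that falls in the buffer pool, the
+ * translation below reduces to
+ *
+ *	pa = p->lut[offset / 2048].paddr + offset % 2048;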
*/ -static inline vm_paddr_t -netmap_ofstophys(vm_offset_t offset) +vm_paddr_t +netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) { int i; - vm_offset_t o = offset; - struct netmap_obj_pool *p = nm_mem.pools; + vm_ooffset_t o = offset; + vm_paddr_t pa; + struct netmap_obj_pool *p; - for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i]._memtotal, i++) { - if (offset >= p[i]._memtotal) + NMA_LOCK(nmd); + p = nmd->pools; + + for (i = 0; i < NETMAP_POOLS_NR; offset -= p[i].memtotal, i++) { + if (offset >= p[i].memtotal) continue; // now lookup the cluster's address - return p[i].lut[offset / p[i]._objsize].paddr + + pa = p[i].lut[offset / p[i]._objsize].paddr + offset % p[i]._objsize; + NMA_UNLOCK(nmd); + return pa; } /* this is only in case of errors */ D("invalid ofs 0x%x out of 0x%x 0x%x 0x%x", (u_int)o, - p[NETMAP_IF_POOL]._memtotal, - p[NETMAP_IF_POOL]._memtotal - + p[NETMAP_RING_POOL]._memtotal, - p[NETMAP_IF_POOL]._memtotal - + p[NETMAP_RING_POOL]._memtotal - + p[NETMAP_BUF_POOL]._memtotal); + p[NETMAP_IF_POOL].memtotal, + p[NETMAP_IF_POOL].memtotal + + p[NETMAP_RING_POOL].memtotal, + p[NETMAP_IF_POOL].memtotal + + p[NETMAP_RING_POOL].memtotal + + p[NETMAP_BUF_POOL].memtotal); + NMA_UNLOCK(nmd); return 0; // XXX bad address } +int +netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, + nm_memid_t *id) +{ + int error = 0; + NMA_LOCK(nmd); + error = nmd->config(nmd); + if (error) + goto out; + if (nmd->flags & NETMAP_MEM_FINALIZED) { + *size = nmd->nm_totalsize; + } else { + int i; + *size = 0; + for (i = 0; i < NETMAP_POOLS_NR; i++) { + struct netmap_obj_pool *p = nmd->pools + i; + *size += (p->_numclusters * p->_clustsize); + } + } + *memflags = nmd->flags; + *id = nmd->nm_id; +out: + NMA_UNLOCK(nmd); + return error; +} + /* * we store objects by kernel address, need to find the offset * within the pool to export the value to userspace. @@ -271,7 +326,7 @@ netmap_ofstophys(vm_offset_t offset) static ssize_t netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) { - int i, k = p->clustentries, n = p->objtotal; + int i, k = p->_clustentries, n = p->objtotal; ssize_t ofs = 0; for (i = 0; i < n; i += k, ofs += p->_clustsize) { @@ -292,25 +347,35 @@ netmap_obj_offset(struct netmap_obj_pool *p, const void *vaddr) } /* Helper functions which convert virtual addresses to offsets */ -#define netmap_if_offset(v) \ - netmap_obj_offset(&nm_mem.pools[NETMAP_IF_POOL], (v)) +#define netmap_if_offset(n, v) \ + netmap_obj_offset(&(n)->pools[NETMAP_IF_POOL], (v)) -#define netmap_ring_offset(v) \ - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + \ - netmap_obj_offset(&nm_mem.pools[NETMAP_RING_POOL], (v))) +#define netmap_ring_offset(n, v) \ + ((n)->pools[NETMAP_IF_POOL].memtotal + \ + netmap_obj_offset(&(n)->pools[NETMAP_RING_POOL], (v))) -#define netmap_buf_offset(v) \ - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + \ - nm_mem.pools[NETMAP_RING_POOL]._memtotal + \ - netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v))) +#define netmap_buf_offset(n, v) \ + ((n)->pools[NETMAP_IF_POOL].memtotal + \ + (n)->pools[NETMAP_RING_POOL].memtotal + \ + netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v))) +ssize_t +netmap_mem_if_offset(struct netmap_mem_d *nmd, const void *addr) +{ + ssize_t v; + NMA_LOCK(nmd); + v = netmap_if_offset(nmd, addr); + NMA_UNLOCK(nmd); + return v; +} + /* * report the index, and use start position as a hint, * otherwise buffer allocation becomes terribly expensive. 
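+ *
+ * A hedged usage sketch (editor's illustration): callers keep a
+ * running position and collect the object index for the netmap
+ * slot, e.g.
+ *
+ *	uint32_t pos = 0, idx;
+ *	void *vaddr = netmap_buf_malloc(nmd, &pos, &idx);
+ *	if (vaddr == NULL)
+ *		... handle shortage ...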
*/ static void * -netmap_obj_malloc(struct netmap_obj_pool *p, int len, uint32_t *start, uint32_t *index) +netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_t *index) { uint32_t i = 0; /* index in the bitmap */ uint32_t mask, j; /* slot counter */ @@ -323,7 +388,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, int len, uint32_t *start, uint32_t } if (p->objfree == 0) { - D("%s allocator: run out of memory", p->name); + D("no more %s objects", p->name); return NULL; } if (start) @@ -356,28 +421,41 @@ netmap_obj_malloc(struct netmap_obj_pool *p, int len, uint32_t *start, uint32_t /* - * free by index, not by address. This is slow, but is only used - * for a small number of objects (rings, nifp) + * free by index, not by address. + * XXX should we also cleanup the content ? */ -static void +static int netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) { + uint32_t *ptr, mask; + if (j >= p->objtotal) { D("invalid index %u, max %u", j, p->objtotal); - return; + return 1; + } + ptr = &p->bitmap[j / 32]; + mask = (1 << (j % 32)); + if (*ptr & mask) { + D("ouch, double free on buffer %d", j); + return 1; + } else { + *ptr |= mask; + p->objfree++; + return 0; } - p->bitmap[j / 32] |= (1 << (j % 32)); - p->objfree++; - return; } +/* + * free by address. This is slow but is only used for a few + * objects (rings, nifp) + */ static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { - int i, j, n = p->_memtotal / p->_clustsize; + u_int i, j, n = p->numclusters; - for (i = 0, j = 0; i < n; i++, j += p->clustentries) { - void *base = p->lut[i * p->clustentries].vaddr; + for (i = 0, j = 0; i < n; i++, j += p->_clustentries) { + void *base = p->lut[i * p->_clustentries].vaddr; ssize_t relofs = (ssize_t) vaddr - (ssize_t) base; /* Given address, is out of the scope of the current cluster.*/ @@ -385,7 +463,7 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) continue; j = j + relofs / p->_objsize; - KASSERT(j != 0, ("Cannot free object 0")); + /* KASSERT(j != 0, ("Cannot free object 0")); */ netmap_obj_free(p, j); return; } @@ -393,43 +471,91 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) vaddr, p->name); } -#define netmap_if_malloc(len) netmap_obj_malloc(&nm_mem.pools[NETMAP_IF_POOL], len, NULL, NULL) -#define netmap_if_free(v) netmap_obj_free_va(&nm_mem.pools[NETMAP_IF_POOL], (v)) -#define netmap_ring_malloc(len) netmap_obj_malloc(&nm_mem.pools[NETMAP_RING_POOL], len, NULL, NULL) -#define netmap_ring_free(v) netmap_obj_free_va(&nm_mem.pools[NETMAP_RING_POOL], (v)) -#define netmap_buf_malloc(_pos, _index) \ - netmap_obj_malloc(&nm_mem.pools[NETMAP_BUF_POOL], NETMAP_BUF_SIZE, _pos, _index) +#define netmap_if_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_IF_POOL], len, NULL, NULL) +#define netmap_if_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_IF_POOL], (v)) +#define netmap_ring_malloc(n, len) netmap_obj_malloc(&(n)->pools[NETMAP_RING_POOL], len, NULL, NULL) +#define netmap_ring_free(n, v) netmap_obj_free_va(&(n)->pools[NETMAP_RING_POOL], (v)) +#define netmap_buf_malloc(n, _pos, _index) \ + netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) +#if 0 // XXX unused /* Return the index associated to the given packet buffer */ -#define netmap_buf_index(v) \ - (netmap_obj_offset(&nm_mem.pools[NETMAP_BUF_POOL], (v)) / nm_mem.pools[NETMAP_BUF_POOL]._objsize) +#define netmap_buf_index(n, v) \ + (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) +#endif + +/* + * allocate 
extra buffers in a linked list. + * returns the actual number. + */ +uint32_t +netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) +{ + struct netmap_mem_d *nmd = na->nm_mem; + uint32_t i, pos = 0; /* opaque, scan position in the bitmap */ + + NMA_LOCK(nmd); + + *head = 0; /* default, 'null' index ie empty list */ + for (i = 0 ; i < n; i++) { + uint32_t cur = *head; /* save current head */ + uint32_t *p = netmap_buf_malloc(nmd, &pos, head); + if (p == NULL) { + D("no more buffers after %d of %d", i, n); + *head = cur; /* restore */ + break; + } + RD(5, "allocate buffer %d -> %d", *head, cur); + *p = cur; /* link to previous head */ + } + + NMA_UNLOCK(nmd); + + return i; +} + +static void +netmap_extra_free(struct netmap_adapter *na, uint32_t head) +{ + struct lut_entry *lut = na->na_lut; + struct netmap_mem_d *nmd = na->nm_mem; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + uint32_t i, cur, *buf; + + D("freeing the extra list"); + for (i = 0; head >=2 && head < p->objtotal; i++) { + cur = head; + buf = lut[head].vaddr; + head = *buf; + *buf = 0; + if (netmap_obj_free(p, cur)) + break; + } + if (head != 0) + D("breaking with head %d", head); + D("freed %d buffers", i); +} /* Return nonzero on error */ static int -netmap_new_bufs(struct netmap_if *nifp, - struct netmap_slot *slot, u_int n) +netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) { - struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL]; - int i = 0; /* slot counter */ + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + u_int i = 0; /* slot counter */ uint32_t pos = 0; /* slot in p->bitmap */ uint32_t index = 0; /* buffer index */ - (void)nifp; /* UNUSED */ for (i = 0; i < n; i++) { - void *vaddr = netmap_buf_malloc(&pos, &index); + void *vaddr = netmap_buf_malloc(nmd, &pos, &index); if (vaddr == NULL) { - D("unable to locate empty packet buffer"); + D("no more buffers after %d of %d", i, n); goto cleanup; } slot[i].buf_idx = index; slot[i].len = p->_objsize; - /* XXX setting flags=NS_BUF_CHANGED forces a pointer reload - * in the NIC ring. This is a hack that hides missing - * initializations in the drivers, and should go away. 
- */ - // slot[i].flags = NS_BUF_CHANGED; + slot[i].flags = 0; } ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos); @@ -444,11 +570,24 @@ cleanup: return (ENOMEM); } +static void +netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index) +{ + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + u_int i; + + for (i = 0; i < n; i++) { + slot[i].buf_idx = index; + slot[i].len = p->_objsize; + slot[i].flags = 0; + } +} + static void -netmap_free_buf(struct netmap_if *nifp, uint32_t i) +netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) { - struct netmap_obj_pool *p = &nm_mem.pools[NETMAP_BUF_POOL]; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); @@ -457,19 +596,34 @@ netmap_free_buf(struct netmap_if *nifp, uint32_t i) netmap_obj_free(p, i); } + +static void +netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) +{ + u_int i; + + for (i = 0; i < n; i++) { + if (slot[i].buf_idx > 2) + netmap_free_buf(nmd, slot[i].buf_idx); + } +} + static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { + if (p == NULL) return; if (p->bitmap) free(p->bitmap, M_NETMAP); p->bitmap = NULL; if (p->lut) { - int i; - for (i = 0; i < p->objtotal; i += p->clustentries) { + u_int i; + size_t sz = p->_clustsize; + + for (i = 0; i < p->objtotal; i += p->_clustentries) { if (p->lut[i].vaddr) - contigfree(p->lut[i].vaddr, p->_clustsize, M_NETMAP); + contigfree(p->lut[i].vaddr, sz, M_NETMAP); } bzero(p->lut, sizeof(struct lut_entry) * p->objtotal); #ifdef linux @@ -479,6 +633,10 @@ netmap_reset_obj_allocator(struct netmap_obj_pool *p) #endif } p->lut = NULL; + p->objtotal = 0; + p->memtotal = 0; + p->numclusters = 0; + p->objfree = 0; } /* @@ -496,8 +654,7 @@ netmap_destroy_obj_allocator(struct netmap_obj_pool *p) * We receive a request for objtotal objects, of size objsize each. * Internally we may round up both numbers, as we allocate objects * in small clusters multiple of the page size. - * In the allocator we don't need to store the objsize, - * but we do need to keep track of objtotal' and clustentries, + * We need to keep track of objtotal and clustentries, * as they are needed when freeing memory. 
* * XXX note -- userspace needs the buffers to be contiguous, @@ -509,16 +666,21 @@ netmap_destroy_obj_allocator(struct netmap_obj_pool *p) static int netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int objsize) { - int i, n; + int i; u_int clustsize; /* the cluster size, multiple of page size */ u_int clustentries; /* how many objects per entry */ + /* we store the current request, so we can + * detect configuration changes later */ + p->r_objtotal = objtotal; + p->r_objsize = objsize; + #define MAX_CLUSTSIZE (1<<17) -#define LINE_ROUND 64 +#define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); - goto error; + return EINVAL; } /* make sure objsize is a multiple of LINE_ROUND */ i = (objsize & (LINE_ROUND - 1)); @@ -529,12 +691,12 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj if (objsize < p->objminsize || objsize > p->objmaxsize) { D("requested objsize %d out of range [%d, %d]", objsize, p->objminsize, p->objmaxsize); - goto error; + return EINVAL; } if (objtotal < p->nummin || objtotal > p->nummax) { D("requested objtotal %d out of range [%d, %d]", objtotal, p->nummin, p->nummax); - goto error; + return EINVAL; } /* * Compute number of objects using a brute-force approach: @@ -568,22 +730,15 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj * The number of clusters is n = ceil(objtotal/clustentries) * objtotal' = n * clustentries */ - p->clustentries = clustentries; + p->_clustentries = clustentries; p->_clustsize = clustsize; - n = (objtotal + clustentries - 1) / clustentries; - p->_numclusters = n; - p->objtotal = n * clustentries; - p->objfree = p->objtotal - 2; /* obj 0 and 1 are reserved */ - p->_memtotal = p->_numclusters * p->_clustsize; - p->_objsize = objsize; - - return 0; + p->_numclusters = (objtotal + clustentries - 1) / clustentries; -error: + /* actual values (may be larger than requested) */ p->_objsize = objsize; - p->objtotal = objtotal; + p->_objtotal = p->_numclusters * clustentries; - return EINVAL; + return 0; } @@ -591,7 +746,12 @@ error: static int netmap_finalize_obj_allocator(struct netmap_obj_pool *p) { - int i, n; + int i; /* must be signed */ + size_t n; + + /* optimistically assume we have enough memory */ + p->numclusters = p->_numclusters; + p->objtotal = p->_objtotal; n = sizeof(struct lut_entry) * p->objtotal; #ifdef linux @@ -600,7 +760,7 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->lut = malloc(n, M_NETMAP, M_NOWAIT | M_ZERO); #endif if (p->lut == NULL) { - D("Unable to create lookup table (%d bytes) for '%s'", n, p->name); + D("Unable to create lookup table (%d bytes) for '%s'", (int)n, p->name); goto clean; } @@ -608,7 +768,7 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) n = (p->objtotal + 31) / 32; p->bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP, M_NOWAIT | M_ZERO); if (p->bitmap == NULL) { - D("Unable to create bitmap (%d entries) for allocator '%s'", n, + D("Unable to create bitmap (%d entries) for allocator '%s'", (int)n, p->name); goto clean; } @@ -617,31 +777,34 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) /* * Allocate clusters, init pointers and bitmap */ - for (i = 0; i < p->objtotal;) { - int lim = i + p->clustentries; + + n = p->_clustsize; + for (i = 0; i < (int)p->objtotal;) { + int lim = i + p->_clustentries; char *clust; - clust = contigmalloc(p->_clustsize, M_NETMAP, M_NOWAIT | 
M_ZERO, - 0, -1UL, PAGE_SIZE, 0); + clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO, + (size_t)0, -1UL, PAGE_SIZE, 0); if (clust == NULL) { /* * If we get here, there is a severe memory shortage, * so halve the allocated memory to reclaim some. - * XXX check boundaries */ D("Unable to create cluster at %d for '%s' allocator", i, p->name); + if (i < 2) /* nothing to halve */ + goto out; lim = i / 2; for (i--; i >= lim; i--) { p->bitmap[ (i>>5) ] &= ~( 1 << (i & 31) ); - if (i % p->clustentries == 0 && p->lut[i].vaddr) + if (i % p->_clustentries == 0 && p->lut[i].vaddr) contigfree(p->lut[i].vaddr, - p->_clustsize, M_NETMAP); + n, M_NETMAP); } + out: p->objtotal = i; - p->objfree = p->objtotal - 2; - p->_numclusters = i / p->clustentries; - p->_memtotal = p->_numclusters * p->_clustsize; + /* we may have stopped in the middle of a cluster */ + p->numclusters = (i + p->_clustentries - 1) / p->_clustentries; break; } for (; i < lim; i++, clust += p->_objsize) { @@ -650,11 +813,14 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p) p->lut[i].paddr = vtophys(clust); } } - p->bitmap[0] = ~3; /* objs 0 and 1 is always busy */ + p->objfree = p->objtotal; + p->memtotal = p->numclusters * p->_clustsize; + if (p->objfree == 0) + goto clean; if (netmap_verbose) D("Pre-allocated %d clusters (%d/%dKB) for '%s'", - p->_numclusters, p->_clustsize >> 10, - p->_memtotal >> 10, p->name); + p->numclusters, p->_clustsize >> 10, + p->memtotal >> 10, p->name); return 0; @@ -665,310 +831,547 @@ clean: /* call with lock held */ static int -netmap_memory_config_changed(void) +netmap_memory_config_changed(struct netmap_mem_d *nmd) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { - if (nm_mem.pools[i]._objsize != netmap_params[i].size || - nm_mem.pools[i].objtotal != netmap_params[i].num) + if (nmd->pools[i].r_objsize != netmap_params[i].size || + nmd->pools[i].r_objtotal != netmap_params[i].num) return 1; } return 0; } +static void +netmap_mem_reset_all(struct netmap_mem_d *nmd) +{ + int i; + + if (netmap_verbose) + D("resetting %p", nmd); + for (i = 0; i < NETMAP_POOLS_NR; i++) { + netmap_reset_obj_allocator(&nmd->pools[i]); + } + nmd->flags &= ~NETMAP_MEM_FINALIZED; +} + +static int +netmap_mem_finalize_all(struct netmap_mem_d *nmd) +{ + int i; + if (nmd->flags & NETMAP_MEM_FINALIZED) + return 0; + nmd->lasterr = 0; + nmd->nm_totalsize = 0; + for (i = 0; i < NETMAP_POOLS_NR; i++) { + nmd->lasterr = netmap_finalize_obj_allocator(&nmd->pools[i]); + if (nmd->lasterr) + goto error; + nmd->nm_totalsize += nmd->pools[i].memtotal; + } + /* buffers 0 and 1 are reserved */ + nmd->pools[NETMAP_BUF_POOL].objfree -= 2; + nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; + nmd->flags |= NETMAP_MEM_FINALIZED; + + if (netmap_verbose) + D("interfaces %d KB, rings %d KB, buffers %d MB", + nmd->pools[NETMAP_IF_POOL].memtotal >> 10, + nmd->pools[NETMAP_RING_POOL].memtotal >> 10, + nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); + + if (netmap_verbose) + D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); + + + return 0; +error: + netmap_mem_reset_all(nmd); + return nmd->lasterr; +} + + + +void +netmap_mem_private_delete(struct netmap_mem_d *nmd) +{ + if (nmd == NULL) + return; + if (netmap_verbose) + D("deleting %p", nmd); + if (nmd->refcount > 0) + D("bug: deleting mem allocator with refcount=%d!", nmd->refcount); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); + NMA_LOCK_DESTROY(nmd); + free(nmd, M_DEVBUF); +} + +static int +netmap_mem_private_config(struct netmap_mem_d *nmd) +{ + /* nothing 
to do, we are configured on creation + * and configuration never changes thereafter + */ + return 0; +} + +static int +netmap_mem_private_finalize(struct netmap_mem_d *nmd) +{ + int err; + NMA_LOCK(nmd); + nmd->refcount++; + err = netmap_mem_finalize_all(nmd); + NMA_UNLOCK(nmd); + return err; + +} + +static void +netmap_mem_private_deref(struct netmap_mem_d *nmd) +{ + NMA_LOCK(nmd); + if (--nmd->refcount <= 0) + netmap_mem_reset_all(nmd); + NMA_UNLOCK(nmd); +} + + +/* + * allocator for private memory + */ +struct netmap_mem_d * +netmap_mem_private_new(const char *name, u_int txr, u_int txd, + u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int *perr) +{ + struct netmap_mem_d *d = NULL; + struct netmap_obj_params p[NETMAP_POOLS_NR]; + int i, err; + u_int v, maxd; + + d = malloc(sizeof(struct netmap_mem_d), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (d == NULL) { + err = ENOMEM; + goto error; + } + + *d = nm_blueprint; + + err = nm_mem_assign_id(d); + if (err) + goto error; + + /* account for the fake host rings */ + txr++; + rxr++; + + /* copy the min values */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + p[i] = netmap_min_priv_params[i]; + } + + /* possibly increase them to fit user request */ + v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr); + if (p[NETMAP_IF_POOL].size < v) + p[NETMAP_IF_POOL].size = v; + v = 2 + 4 * npipes; + if (p[NETMAP_IF_POOL].num < v) + p[NETMAP_IF_POOL].num = v; + maxd = (txd > rxd) ? txd : rxd; + v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd; + if (p[NETMAP_RING_POOL].size < v) + p[NETMAP_RING_POOL].size = v; + /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake) + * and two rx rings (again, 1 normal and 1 fake host) + */ + v = txr + rxr + 8 * npipes; + if (p[NETMAP_RING_POOL].num < v) + p[NETMAP_RING_POOL].num = v; + /* for each pipe we only need the buffers for the 4 "real" rings. + * On the other end, the pipe ring dimension may be different from + * the parent port ring dimension. 
As a compromise, we allocate twice the + * space actually needed if the pipe rings were the same size as the parent rings + */ + v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs; + /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */ + if (p[NETMAP_BUF_POOL].num < v) + p[NETMAP_BUF_POOL].num = v; + + if (netmap_verbose) + D("req if %d*%d ring %d*%d buf %d*%d", + p[NETMAP_IF_POOL].num, + p[NETMAP_IF_POOL].size, + p[NETMAP_RING_POOL].num, + p[NETMAP_RING_POOL].size, + p[NETMAP_BUF_POOL].num, + p[NETMAP_BUF_POOL].size); + + for (i = 0; i < NETMAP_POOLS_NR; i++) { + snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, + nm_blueprint.pools[i].name, + name); + err = netmap_config_obj_allocator(&d->pools[i], + p[i].num, p[i].size); + if (err) + goto error; + } + + d->flags &= ~NETMAP_MEM_FINALIZED; + + NMA_LOCK_INIT(d); + + return d; +error: + netmap_mem_private_delete(d); + if (perr) + *perr = err; + return NULL; +} + /* call with lock held */ static int -netmap_memory_config(void) +netmap_mem_global_config(struct netmap_mem_d *nmd) { int i; - if (!netmap_memory_config_changed()) + if (nmd->refcount) + /* already in use, we cannot change the configuration */ + goto out; + + if (!netmap_memory_config_changed(nmd)) goto out; D("reconfiguring"); - if (nm_mem.finalized) { + if (nmd->flags & NETMAP_MEM_FINALIZED) { /* reset previous allocation */ for (i = 0; i < NETMAP_POOLS_NR; i++) { - netmap_reset_obj_allocator(&nm_mem.pools[i]); + netmap_reset_obj_allocator(&nmd->pools[i]); } - nm_mem.finalized = 0; - } + nmd->flags &= ~NETMAP_MEM_FINALIZED; + } for (i = 0; i < NETMAP_POOLS_NR; i++) { - nm_mem.lasterr = netmap_config_obj_allocator(&nm_mem.pools[i], + nmd->lasterr = netmap_config_obj_allocator(&nmd->pools[i], netmap_params[i].num, netmap_params[i].size); - if (nm_mem.lasterr) + if (nmd->lasterr) goto out; } - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - nm_mem.pools[NETMAP_IF_POOL]._memtotal >> 10, - nm_mem.pools[NETMAP_RING_POOL]._memtotal >> 10, - nm_mem.pools[NETMAP_BUF_POOL]._memtotal >> 20); - out: - return nm_mem.lasterr; + return nmd->lasterr; } -/* call with lock held */ static int -netmap_memory_finalize(void) +netmap_mem_global_finalize(struct netmap_mem_d *nmd) { - int i; - u_int totalsize = 0; + int err; + + NMA_LOCK(nmd); - nm_mem.refcount++; - if (nm_mem.refcount > 1) { - ND("busy (refcount %d)", nm_mem.refcount); - goto out; - } /* update configuration if changed */ - if (netmap_memory_config()) + if (netmap_mem_global_config(nmd)) goto out; - if (nm_mem.finalized) { + nmd->refcount++; + + if (nmd->flags & NETMAP_MEM_FINALIZED) { /* may happen if config is not changed */ ND("nothing to do"); goto out; } - for (i = 0; i < NETMAP_POOLS_NR; i++) { - nm_mem.lasterr = netmap_finalize_obj_allocator(&nm_mem.pools[i]); - if (nm_mem.lasterr) - goto cleanup; - totalsize += nm_mem.pools[i]._memtotal; - } - nm_mem.nm_totalsize = totalsize; + if (netmap_mem_finalize_all(nmd)) + goto out; /* backward compatibility */ - netmap_buf_size = nm_mem.pools[NETMAP_BUF_POOL]._objsize; - netmap_total_buffers = nm_mem.pools[NETMAP_BUF_POOL].objtotal; + netmap_buf_size = nmd->pools[NETMAP_BUF_POOL]._objsize; + netmap_total_buffers = nmd->pools[NETMAP_BUF_POOL].objtotal; - netmap_buffer_lut = nm_mem.pools[NETMAP_BUF_POOL].lut; - netmap_buffer_base = nm_mem.pools[NETMAP_BUF_POOL].lut[0].vaddr; + netmap_buffer_lut = nmd->pools[NETMAP_BUF_POOL].lut; + netmap_buffer_base = nmd->pools[NETMAP_BUF_POOL].lut[0].vaddr; - nm_mem.finalized = 1; - nm_mem.lasterr 
= 0; - - /* make sysctl values match actual values in the pools */ - for (i = 0; i < NETMAP_POOLS_NR; i++) { - netmap_params[i].size = nm_mem.pools[i]._objsize; - netmap_params[i].num = nm_mem.pools[i].objtotal; - } + nmd->lasterr = 0; out: - if (nm_mem.lasterr) - nm_mem.refcount--; + if (nmd->lasterr) + nmd->refcount--; + err = nmd->lasterr; - return nm_mem.lasterr; + NMA_UNLOCK(nmd); -cleanup: - for (i = 0; i < NETMAP_POOLS_NR; i++) { - netmap_reset_obj_allocator(&nm_mem.pools[i]); - } - nm_mem.refcount--; + return err; - return nm_mem.lasterr; } -static int -netmap_memory_init(void) +int +netmap_mem_init(void) { - NMA_LOCK_INIT(); + NMA_LOCK_INIT(&nm_mem); return (0); } -static void -netmap_memory_fini(void) +void +netmap_mem_fini(void) { int i; for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_destroy_obj_allocator(&nm_mem.pools[i]); } - NMA_LOCK_DESTROY(); + NMA_LOCK_DESTROY(&nm_mem); } static void netmap_free_rings(struct netmap_adapter *na) { - int i; + struct netmap_kring *kring; + struct netmap_ring *ring; if (!na->tx_rings) return; - for (i = 0; i < na->num_tx_rings + 1; i++) { - netmap_ring_free(na->tx_rings[i].ring); - na->tx_rings[i].ring = NULL; + for (kring = na->tx_rings; kring != na->rx_rings; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - for (i = 0; i < na->num_rx_rings + 1; i++) { - netmap_ring_free(na->rx_rings[i].ring); - na->rx_rings[i].ring = NULL; + for (/* cont'd from above */; kring != na->tailroom; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - free(na->tx_rings, M_DEVBUF); - na->tx_rings = na->rx_rings = NULL; } - - -/* call with NMA_LOCK held */ -/* - * Allocate the per-fd structure netmap_if. - * If this is the first instance, also allocate the krings, rings etc. +/* call with NMA_LOCK held * + * + * Allocate netmap rings and buffers for this card + * The rings are contiguous, but have variable size. + * The kring array must follow the layout described + * in netmap_krings_create(). */ -static void * -netmap_if_new(const char *ifname, struct netmap_adapter *na) +int +netmap_mem_rings_create(struct netmap_adapter *na) { - struct netmap_if *nifp; struct netmap_ring *ring; - ssize_t base; /* handy for relative offsets between rings and nifp */ - u_int i, len, ndesc, ntx, nrx; + u_int len, ndesc; struct netmap_kring *kring; + u_int i; - if (netmap_update_config(na)) { - /* configuration mismatch, report and fail */ - return NULL; - } - ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ - nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ - /* - * the descriptor is followed inline by an array of offsets - * to the tx and rx rings in the shared memory region. - */ - len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); - nifp = netmap_if_malloc(len); - if (nifp == NULL) { - return NULL; - } - - /* initialize base fields -- override const */ - *(int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; - *(int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; - strncpy(nifp->ni_name, ifname, IFNAMSIZ); - - (na->refcount)++; /* XXX atomic ? 
we are under lock */ - if (na->refcount > 1) { /* already setup, we are done */ - goto final; - } + NMA_LOCK(na->nm_mem); - len = (ntx + nrx) * sizeof(struct netmap_kring); - na->tx_rings = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na->tx_rings == NULL) { - D("Cannot allocate krings for %s", ifname); - goto cleanup; - } - na->rx_rings = na->tx_rings + ntx; - - /* - * First instance, allocate netmap rings and buffers for this card - * The rings are contiguous, but have variable size. - */ - for (i = 0; i < ntx; i++) { /* Transmit rings */ - kring = &na->tx_rings[i]; - ndesc = na->num_tx_desc; - bzero(kring, sizeof(*kring)); + /* transmit rings */ + for (i =0, kring = na->tx_rings; kring != na->rx_rings; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->tx_rings); + continue; /* already created by somebody else */ + } + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); - ring = netmap_ring_malloc(len); + ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate tx_ring[%d] for %s", i, ifname); + D("Cannot allocate tx_ring"); goto cleanup; } - ND("txring[%d] at %p ofs %d", i, ring); - kring->na = na; + ND("txring at %p", ring); kring->ring = ring; - *(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + - nm_mem.pools[NETMAP_RING_POOL]._memtotal) - - netmap_ring_offset(ring); - - /* - * IMPORTANT: - * Always keep one slot empty, so we can detect new - * transmissions comparing cur and nr_hwcur (they are - * the same only if there are no new transmissions). - */ - ring->avail = kring->nr_hwavail = ndesc - 1; - ring->cur = kring->nr_hwcur = 0; - *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE; - ND("initializing slots for txring[%d]", i); - if (netmap_new_bufs(nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname); - goto cleanup; + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; + *(int64_t *)(uintptr_t)&ring->buf_ofs = + (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - + netmap_ring_offset(na->nm_mem, ring); + + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + *(uint16_t *)(uintptr_t)&ring->nr_buf_size = + NETMAP_BDG_BUF_SIZE(na->nm_mem); + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); + ND("initializing slots for txring"); + if (i != na->num_tx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring"); + goto cleanup; + } + } else { + /* this is a fake tx ring, set all indices to 0 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0); } } - for (i = 0; i < nrx; i++) { /* Receive rings */ - kring = &na->rx_rings[i]; - ndesc = na->num_rx_desc; - bzero(kring, sizeof(*kring)); + /* receive rings */ + for ( i = 0 /* kring cont'd from above */ ; kring != na->tailroom; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->rx_rings); + continue; /* already created by somebody else */ + } + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); - ring = netmap_ring_malloc(len); + ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate rx_ring[%d] for %s", i, 
ifname); + D("Cannot allocate rx_ring"); goto cleanup; } - ND("rxring[%d] at %p ofs %d", i, ring); - - kring->na = na; + ND("rxring at %p", ring); kring->ring = ring; - *(int *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = - (nm_mem.pools[NETMAP_IF_POOL]._memtotal + - nm_mem.pools[NETMAP_RING_POOL]._memtotal) - - netmap_ring_offset(ring); - - ring->cur = kring->nr_hwcur = 0; - ring->avail = kring->nr_hwavail = 0; /* empty */ - *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE; - ND("initializing slots for rxring[%d]", i); - if (netmap_new_bufs(nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname); - goto cleanup; + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; + *(int64_t *)(uintptr_t)&ring->buf_ofs = + (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - + netmap_ring_offset(na->nm_mem, ring); + + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + *(int *)(uintptr_t)&ring->nr_buf_size = + NETMAP_BDG_BUF_SIZE(na->nm_mem); + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); + ND("initializing slots for rxring %p", ring); + if (i != na->num_rx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring"); + goto cleanup; + } + } else { + /* this is a fake rx ring, set all indices to 1 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 1); } } -#ifdef linux - // XXX initialize the selrecord structs. - for (i = 0; i < ntx; i++) - init_waitqueue_head(&na->tx_rings[i].si); - for (i = 0; i < nrx; i++) - init_waitqueue_head(&na->rx_rings[i].si); - init_waitqueue_head(&na->tx_si); - init_waitqueue_head(&na->rx_si); -#endif -final: + + NMA_UNLOCK(na->nm_mem); + + return 0; + +cleanup: + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); + + return ENOMEM; +} + +void +netmap_mem_rings_delete(struct netmap_adapter *na) +{ + /* last instance, release bufs and rings */ + NMA_LOCK(na->nm_mem); + + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); +} + + +/* call with NMA_LOCK held */ +/* + * Allocate the per-fd structure netmap_if. + * + * We assume that the configuration stored in na + * (number of tx/rx rings and descs) does not change while + * the interface is in netmap mode. + */ +struct netmap_if * +netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + ssize_t base; /* handy for relative offsets between rings and nifp */ + u_int i, len, ntx, nrx; + + /* account for the (eventually fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; + /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + */ + + NMA_LOCK(na->nm_mem); + + len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); + nifp = netmap_if_malloc(na->nm_mem, len); + if (nifp == NULL) { + NMA_UNLOCK(na->nm_mem); + return NULL; + } + + /* initialize base fields -- override const */ + *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; + *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; + strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); + /* * fill the slots for the rx and tx rings. They contain the offset * between the ring and nifp, so the information is usable in * userspace to reach the ring from the nifp. 
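 *
 * As an illustration (a sketch, not from this file: it mirrors what the
 * NETMAP_TXRING()/NETMAP_RXRING() helpers in net/netmap_user.h do with
 * these offsets), userspace can reach tx ring i as:
 *
 *	struct netmap_ring *ring = (struct netmap_ring *)
 *	    ((char *)nifp + nifp->ring_ofs[i]);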
*/ - base = netmap_if_offset(nifp); + base = netmap_if_offset(na->nm_mem, nifp); for (i = 0; i < ntx; i++) { *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] = - netmap_ring_offset(na->tx_rings[i].ring) - base; + netmap_ring_offset(na->nm_mem, na->tx_rings[i].ring) - base; } for (i = 0; i < nrx; i++) { *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+ntx] = - netmap_ring_offset(na->rx_rings[i].ring) - base; + netmap_ring_offset(na->nm_mem, na->rx_rings[i].ring) - base; } + + NMA_UNLOCK(na->nm_mem); + return (nifp); -cleanup: - netmap_free_rings(na); - netmap_if_free(nifp); - (na->refcount)--; - return NULL; } -/* call with NMA_LOCK held */ +void +netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) +{ + if (nifp == NULL) + /* nothing to do */ + return; + NMA_LOCK(na->nm_mem); + if (nifp->ni_bufs_head) + netmap_extra_free(na, nifp->ni_bufs_head); + netmap_if_free(na->nm_mem, nifp); + + NMA_UNLOCK(na->nm_mem); +} + static void -netmap_memory_deref(void) +netmap_mem_global_deref(struct netmap_mem_d *nmd) { - nm_mem.refcount--; + NMA_LOCK(nmd); + + nmd->refcount--; if (netmap_verbose) - D("refcount = %d", nm_mem.refcount); + D("refcount = %d", nmd->refcount); + + NMA_UNLOCK(nmd); +} + +int +netmap_mem_finalize(struct netmap_mem_d *nmd) +{ + return nmd->finalize(nmd); +} + +void +netmap_mem_deref(struct netmap_mem_d *nmd) +{ + return nmd->deref(nmd); } diff --git a/sys/dev/re/if_re.c b/sys/dev/re/if_re.c index 99a5438..1f71e9c 100644 --- a/sys/dev/re/if_re.c +++ b/sys/dev/re/if_re.c @@ -2133,8 +2133,7 @@ re_rxeof(struct rl_softc *sc, int *rx_npktsp) ifp = sc->rl_ifp; #ifdef DEV_NETMAP - if (netmap_rx_irq(ifp, 0 | (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT), - &rx_npkts)) + if (netmap_rx_irq(ifp, 0, &rx_npkts)) return 0; #endif /* DEV_NETMAP */ if (ifp->if_mtu > RL_MTU && (sc->rl_flags & RL_FLAG_JUMBOV2) != 0) @@ -2379,7 +2378,7 @@ re_txeof(struct rl_softc *sc) ifp = sc->rl_ifp; #ifdef DEV_NETMAP - if (netmap_tx_irq(ifp, 0 | (NETMAP_LOCKED_ENTER|NETMAP_LOCKED_EXIT))) + if (netmap_tx_irq(ifp, 0)) return; #endif /* DEV_NETMAP */ /* Invalidate the TX descriptor list */ diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index b58bfdb..647cd10 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -5,10 +5,16 @@ .PATH: ${.CURDIR}/../../dev/netmap .PATH.h: ${.CURDIR}/../../net +CFLAGS += -I${.CURDIR}/../../ KMOD = netmap SRCS = device_if.h bus_if.h opt_netmap.h SRCS += netmap.c netmap.h netmap_kern.h - -netmap.o: netmap_mem2.c +SRCS += netmap_mem2.c netmap_mem2.h +SRCS += netmap_generic.c +SRCS += netmap_mbq.c netmap_mbq.h +SRCS += netmap_vale.c +SRCS += netmap_freebsd.c +SRCS += netmap_offloadings.c +SRCS += netmap_pipe.c .include <bsd.kmod.mk> diff --git a/sys/net/netmap.h b/sys/net/netmap.h index b5ab6d5..f0b4c56 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -1,33 +1,27 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. - * + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * + * modification, are permitted provided that the following conditions + * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * * 2. 
Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the
- * distribution.
- *
- * 3. Neither the name of the authors nor the names of their contributors
- * may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
 */

/*
@@ -36,215 +30,249 @@
 * Definitions of constants and the structures used by the netmap
 * framework, for the part visible to both kernel and userspace.
 * Detailed info on netmap is available with "man netmap" or at
- *
+ *
 * http://info.iet.unipi.it/~luigi/netmap/
+ *
+ * This API is also used to communicate with the VALE software switch
 */

#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_

+#define NETMAP_API 11 /* current API version */
+
+#define NETMAP_MIN_API 11 /* min and max versions accepted */
+#define NETMAP_MAX_API 15
+/*
+ * Some fields should be cache-aligned to reduce contention.
+ * The alignment is architecture and OS dependent, but rather than
+ * digging into OS headers to find the exact value we use an estimate
+ * that should cover most architectures.
+ */
+#define NM_CACHE_ALIGN 128
+
/*
 * --- Netmap data structures ---
 *
- * The data structures used by netmap are shown below. Those in
- * capital letters are in an mmapp()ed area shared with userspace,
- * while others are private to the kernel.
- * Shared structures do not contain pointers but only memory
- * offsets, so that addressing is portable between kernel and userspace.
-
-
- softc
-+----------------+
-| standard fields|
-| if_pspare[0] ----------+
-+----------------+ |
- |
-+----------------+<------+
-|(netmap_adapter)|
-| | netmap_kring
-| tx_rings *--------------------------------->+---------------+
-| | netmap_kring | ring *---------.
-| rx_rings *--------->+---------------+ | nr_hwcur | |
-+----------------+ | ring *--------.
| nr_hwavail | V - | nr_hwcur | | | selinfo | | - | nr_hwavail | | +---------------+ . - | selinfo | | | ... | . - +---------------+ | |(ntx+1 entries)| - | .... | | | | - |(nrx+1 entries)| | +---------------+ - | | | - KERNEL +---------------+ | - | + * The userspace data structures used by netmap are shown below. + * They are allocated by the kernel and mmap()ed by userspace threads. + * Pointers are implemented as memory offsets or indexes, + * so that they can be easily dereferenced in kernel and userspace. + + KERNEL (opaque, obviously) + ==================================================================== | - USERSPACE | NETMAP_RING - +---->+-------------+ - / | cur | - NETMAP_IF (nifp, one per file desc.) / | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +=============+ - | ni_rx_rings | / | buf_idx | slot[0] - | | / | len, flags | - | | / +-------------+ - +===============+ / | buf_idx | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | len, flags | - | txring_ofs[1] | +-------------+ - (num_rings+1 entries) (nr_num_slots entries) - | txring_ofs[n] | | buf_idx | slot[n-1] - +---------------+ | len, flags | - | rxring_ofs[0] | +-------------+ + USERSPACE | struct netmap_ring + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (num_rings+1 entries) - | txring_ofs[n] | + (rx+1 entries) + | rxring_ofs[r] | +---------------+ - * The private descriptor ('softc' or 'adapter') of each interface - * is extended with a "struct netmap_adapter" containing netmap-related - * info (see description in dev/netmap/netmap_kernel.h. - * Among other things, tx_rings and rx_rings point to the arrays of - * "struct netmap_kring" which in turn reache the various - * "struct netmap_ring", shared with userspace. - - * The NETMAP_RING is the userspace-visible replica of the NIC ring. - * Each slot has the index of a buffer, its length and some flags. + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) + * struct netmap_if pointing to struct netmap_ring's. + * + * There is one netmap_ring per physical NIC ring, plus one tx/rx ring + * pair attached to the host stack (this pair is unused for non-NIC ports). + * + * All physical/host stack ports share the same memory region, + * so that zero-copy can be implemented between them. + * VALE switch ports instead have separate memory regions. + * + * The netmap_ring is the userspace-visible replica of the NIC ring. + * Each slot has the index of a buffer (MTU-sized and residing in the + * mmapped region), its length and some flags. An extra 64-bit pointer + * is provided for user-supplied buffers in the tx path. + * * In user space, the buffer address is computed as - * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE - * In the kernel, buffers do not necessarily need to be contiguous, - * and the virtual and physical addresses are derived through - * a lookup table. 
- *
- * struct netmap_slot:
- *
- * buf_idx is the index of the buffer associated to the slot.
- * len is the length of the payload
- * NS_BUF_CHANGED must be set whenever userspace wants
- * to change buf_idx (it might be necessary to
- * reprogram the NIC slot)
- * NS_REPORT must be set if we want the NIC to generate an interrupt
- * when this slot is used. Leaving it to 0 improves
- * performance.
- * NS_FORWARD if set on a receive ring, and the device is in
- * transparent mode, buffers released with the flag set
- * will be forwarded to the 'other' side (host stack
- * or NIC, respectively) on the next select() or ioctl()
- *
- * The following will be supported from NETMAP_API = 5
- * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for
- * this packet.
- * NS_INDIRECT the netmap buffer contains a 64-bit pointer to
- * the actual userspace buffer. This may be useful
- * to reduce copies in a VM environment.
- * NS_MOREFRAG Part of a multi-segment frame. The last (or only)
- * segment must not have this flag.
- * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the
- * destination port for the VALE switch, overriding
- * the lookup table.
+ * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
+ *
+ * Added in NETMAP_API 11:
+ *
+ * + NIOCREGIF can request the allocation of extra spare buffers from
+ * the same memory pool. The desired number of buffers must be in
+ * nr_arg3. The ioctl may return fewer buffers, depending on memory
+ * availability. nr_arg3 will return the actual value, and, once
+ * mapped, nifp->ni_bufs_head will be the index of the first buffer.
+ *
+ * The buffers are linked to each other using the first uint32_t
+ * as the index. On close, ni_bufs_head must point to the list of
+ * buffers to be released.
+ *
+ * + NIOCREGIF can request space for extra rings (and buffers)
+ * allocated in the same memory space. The number of extra rings
+ * is in nr_arg1, and is advisory. This is a no-op on NICs where
+ * the size of the memory space is fixed.
+ *
+ * + NIOCREGIF can attach to PIPE rings sharing the same memory
+ * space with a parent device. The ifname indicates the parent device,
+ * which must already exist. Flags in nr_flags indicate if we want to
+ * bind the master or slave side, the index (from nr_ringid)
+ * is just a cookie and does not need to be sequential.
+ *
+ * + NIOCREGIF can also attach to 'monitor' rings that replicate
+ * the content of specific rings, also from the same memory space.
+ *
+ * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme:
+ * netmap:foo all NIC ring pairs
+ * netmap:foo^ only host ring pair
+ * netmap:foo+ all NIC ring + host ring pairs
+ * netmap:foo-k the k-th NIC ring pair
+ * netmap:foo{k PIPE ring pair k, master side
+ * netmap:foo}k PIPE ring pair k, slave side
 */
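The extra-buffer list just described can be walked from userspace once the
region is mapped. A minimal sketch (illustrative only, not part of this
patch; it assumes a bound descriptor whose nifp and one of its rings are
already mapped, and uses the NETMAP_BUF() helper from net/netmap_user.h):

	#include <net/netmap_user.h>

	static unsigned int
	count_extra_bufs(struct netmap_if *nifp, struct netmap_ring *ring)
	{
		uint32_t scan;
		unsigned int n = 0;

		/* each buffer holds the index of the next one in its
		 * first 4 bytes; index 0 terminates the list */
		for (scan = nifp->ni_bufs_head; scan != 0;
		     scan = *(uint32_t *)NETMAP_BUF(ring, scan))
			n++;
		return n;
	}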

+/*
+ * struct netmap_slot is a buffer descriptor
+ */
struct netmap_slot {
- uint32_t buf_idx; /* buffer index */
- uint16_t len; /* packet length, to be copied to/from the hw ring */
- uint16_t flags; /* buf changed, etc. */
-#define NS_BUF_CHANGED 0x0001 /* must resync the map, buffer changed */
-#define NS_REPORT 0x0002 /* ask the hardware to report results
- * e.g. by generating an interrupt
- */
-#define NS_FORWARD 0x0004 /* pass packet to the other endpoint
- * (host stack or device)
- */
-#define NS_NO_LEARN 0x0008
-#define NS_INDIRECT 0x0010
-#define NS_MOREFRAG 0x0020
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* length for this slot */
+ uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* pointer for indirect buffers */
+};
+
+/*
+ * The following flags control how the slot is used
+ */
+
+#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */
+ /*
+ * must be set whenever buf_idx is changed (as it might be
+ * necessary to recompute the physical address and mapping)
+ */
+
+#define NS_REPORT 0x0002 /* ask the hardware to report results */
+ /*
+ * Request notification when slot is used by the hardware.
+ * Normally transmit completions are handled lazily and
+ * may be unreported. This flag lets us know when a slot
+ * has been sent (e.g. to terminate the sender).
+ */
+
+#define NS_FORWARD 0x0004 /* pass packet 'forward' */
+ /*
+ * (Only for physical ports, rx rings with NR_FORWARD set).
+ * Slots released to the kernel (i.e. before ring->head) with
+ * this flag set are passed to the peer ring (host/NIC),
+ * thus restoring the host-NIC connection for these slots.
+ * This supports efficient traffic monitoring or firewalling.
+ */
+
+#define NS_NO_LEARN 0x0008 /* disable bridge learning */
+ /*
+ * On a VALE switch, do not 'learn' the source port for
+ * this buffer.
+ */
+
+#define NS_INDIRECT 0x0010 /* userspace buffer */
+ /*
+ * (VALE tx rings only) data is in a userspace buffer,
+ * whose address is in the 'ptr' field in the slot.
+ */
+
+#define NS_MOREFRAG 0x0020 /* packet has more fragments */
+ /*
+ * (VALE ports only)
+ * Set on all but the last slot of a multi-segment packet.
+ * The 'len' field refers to the individual fragment.
+ */
+
 #define NS_PORT_SHIFT 8
 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
-};
+ /*
+ * The high 8 bits of the flag, if not zero, indicate the
+ * destination port for the VALE switch, overriding
+ * the lookup table.
+ */
+
+#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
+ /*
+ * (VALE rx rings only) the high 8 bits
+ * are the number of fragments.
+ */
+
/*
+ * struct netmap_ring
+ *
 * Netmap representation of a TX or RX ring (also known as "queue").
 * This is a queue implemented as a fixed-size circular array.
- * At the software level, two fields are important: avail and cur.
+ * At the software level the important fields are: head, cur, tail.
 *
 * In TX rings:
- * avail indicates the number of slots available for transmission.
- * It is updated by the kernel after every netmap system call.
- * It MUST BE decremented by the application when it appends a
- * packet.
- * cur indicates the slot to use for the next packet
- * to send (i.e. the "tail" of the queue).
- * It MUST BE incremented by the application before
- * netmap system calls to reflect the number of newly
- * sent packets.
- * It is checked by the kernel on netmap system calls
- * (normally unmodified by the kernel unless invalid).
- *
- * The kernel side of netmap uses two additional fields in its own
- * private ring structure, netmap_kring:
- * nr_hwcur is a copy of nr_cur on an NIOCTXSYNC.
- * nr_hwavail is the number of slots known as available by the
- * hardware. It is updated on an INTR (inc by the
- * number of packets sent) and on a NIOCTXSYNC
- * (decrease by nr_cur - nr_hwcur)
- * A special case, nr_hwavail is -1 if the transmit
- * side is idle (no pending transmits).
+ * + * head first slot available for transmission. + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. * * In RX rings: - * avail is the number of packets available (possibly 0). - * It MUST BE decremented by the application when it consumes - * a packet, and it is updated to nr_hwavail on a NIOCRXSYNC - * cur indicates the first slot that contains a packet not - * processed yet (the "head" of the queue). - * It MUST BE incremented by the software when it consumes - * a packet. - * reserved indicates the number of buffers before 'cur' - * that the application has still in use. Normally 0, - * it MUST BE incremented by the application when it - * does not return the buffer immediately, and decremented - * when the buffer is finally freed. - * - * The kernel side of netmap uses two additional fields in the kring: - * nr_hwcur is a copy of nr_cur on an NIOCRXSYNC - * nr_hwavail is the number of packets available. It is updated - * on INTR (inc by the number of new packets arrived) - * and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur). + * + * head first valid received packet + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel + * + * [head .. tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. * * DATA OWNERSHIP/LOCKING: - * The netmap_ring is owned by the user program and it is only - * accessed or modified in the upper half of the kernel during - * a system call. - * - * The netmap_kring is only modified by the upper half of the kernel. - * - * FLAGS - * NR_TIMESTAMP updates the 'ts' field on each syscall. This is - * a global timestamp for all packets. - * NR_RX_TSTMP if set, the last 64 byte in each buffer will - * contain a timestamp for the frame supplied by - * the hardware (if supported) - * NR_FORWARD if set, the NS_FORWARD flag in each slot of the - * RX ring is checked, and if set the packet is - * passed to the other side (host stack or device, - * respectively). This permits bpf-like behaviour - * or transparency for selected packets. + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* - * nr_buf_base_ofs is meant to be used through macros. + * buf_ofs is meant to be used through macros. * It contains the offset of the buffer region from this * descriptor. */ - const ssize_t buf_ofs; + const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. 
 */
- uint32_t avail; /* number of usable slots */
- uint32_t cur; /* 'current' r/w position */
- uint32_t reserved; /* not refilled before current */
+ const uint32_t nr_buf_size;
+ const uint16_t ringid;
+ const uint16_t dir; /* 0: tx, 1: rx */
- const uint16_t nr_buf_size;
- uint16_t flags;
-#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
-#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
-#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */
+ uint32_t head; /* (u) first user slot */
+ uint32_t cur; /* (u) wakeup point */
+ uint32_t tail; /* (k) first kernel slot */
+
+ uint32_t flags;

- struct timeval ts; /* time of last *sync() */
+ struct timeval ts; /* (k) time of last *sync() */
+
+ /* opaque room for a mutex or similar object */
+ uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));

 /* the slots follow. This struct has variable size */
 struct netmap_slot slot[0]; /* array of slots. */
@@ -252,88 +280,246 @@ struct netmap_ring {

/*
+ * RING FLAGS
+ */
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+ /*
+ * updates the 'ts' field on each netmap syscall. This saves
+ * a separate gettimeofday(), and is not much worse than
+ * software timestamps generated in the interrupt handler.
+ */
+
+#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
+ /*
+ * Enables the NS_FORWARD slot flag for the ring.
+ */
+
+
+/*
 * Netmap representation of an interface and its queue(s).
+ * This is initialized by the kernel when binding a file
+ * descriptor to a port, and should be considered as readonly
+ * by user programs. The kernel never uses it.
+ *
 * There is one netmap_if for each file descriptor on which we want
- * to select/poll. We assume that on each interface has the same number
- * of receive and transmit queues.
+ * to select/poll.
 * select/poll operates on one or all pairs depending on the value of
 * nmr_queueid passed on the ioctl.
 */
struct netmap_if {
 char ni_name[IFNAMSIZ]; /* name of the interface. */
- const u_int ni_version; /* API version, currently unused */
- const u_int ni_rx_rings; /* number of rx rings */
- const u_int ni_tx_rings; /* if zero, same as ni_rx_rings */
+ const uint32_t ni_version; /* API version, currently unused */
+ const uint32_t ni_flags; /* properties */
+#define NI_PRIV_MEM 0x1 /* private memory region */
+
+ /*
+ * The number of packet rings available in netmap mode.
+ * Physical NICs can have different numbers of tx and rx rings.
+ * Physical NICs also have a 'host' ring pair.
+ * Additionally, clients can request additional ring pairs to
+ * be used for internal communication.
+ */
+ const uint32_t ni_tx_rings; /* number of HW tx rings */
+ const uint32_t ni_rx_rings; /* number of HW rx rings */
+
+ uint32_t ni_bufs_head; /* head index for extra bufs */
+ uint32_t ni_spare1[5];

 /*
 * The following array contains the offset of each netmap ring
- * from this structure. The first ni_tx_queues+1 entries refer
- * to the tx rings, the next ni_rx_queues+1 refer to the rx rings
- * (the last entry in each block refers to the host stack rings).
- * The area is filled up by the kernel on NIOCREG,
+ * from this structure, in the following order:
+ * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
+ * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
+ *
+ * The area is filled up by the kernel on NIOCREGIF,
 * and then only read by userspace code.
 */
 const ssize_t ring_ofs[0];
};
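To make the layout above concrete, here is how a transmit loop can use the
head/cur/tail protocol documented earlier (a sketch, not part of this
patch; NETMAP_BUF() and nm_ring_next() are the helpers from
net/netmap_user.h, and the 60-byte zero payload is just a placeholder for
real frame construction):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/netmap_user.h>

	static void
	tx_burst(int fd, struct netmap_ring *ring)
	{
		while (!nm_ring_empty(ring)) {	/* head != tail: slots free */
			struct netmap_slot *slot = &ring->slot[ring->head];
			char *dst = NETMAP_BUF(ring, slot->buf_idx);

			memset(dst, 0, 60);	/* placeholder payload */
			slot->len = 60;
			ring->head = ring->cur = nm_ring_next(ring, ring->head);
		}
		ioctl(fd, NIOCTXSYNC, NULL);	/* hand new slots to the kernel */
	}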

-#ifndef NIOCREGIF
+
+#ifndef NIOCREGIF
/*
 * ioctl names and related fields
 *
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ * whose identity is set in NIOCREGIF through nr_ringid.
+ * These are non blocking and take no argument.
+ *
 * NIOCGINFO takes a struct ifreq, the interface name is the input,
 * the outputs are number of queues and number of descriptor
 * for each queue (useful to set number of threads etc.).
+ * The info returned is only advisory and may change before
+ * the interface is bound to a file descriptor.
 *
- * NIOCREGIF takes an interface name within a struct ifreq,
+ * NIOCREGIF takes an interface name within a struct nmreq,
 * and activates netmap mode on the interface (if possible).
 *
- * For vale ports, starting with NETMAP_API = 5,
- * nr_tx_rings and nr_rx_rings specify how many software rings
- * are created (0 means 1).
+ * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
+ * can pass it down to other NIC-related ioctls.
 *
- * NIOCREGIF is also used to attach a NIC to a VALE switch.
- * In this case the name is vale*:ifname, and "nr_cmd"
- * is set to 'NETMAP_BDG_ATTACH' or 'NETMAP_BDG_DETACH'.
- * nr_ringid specifies which rings should be attached, 0 means all,
- * NETMAP_HW_RING + n means only the n-th ring.
- * The process can terminate after the interface has been attached.
+ * The actual argument (struct nmreq) has a number of options to request
+ * different functions.
+ * The following are used in NIOCREGIF when nr_cmd == 0:
 *
- * NIOCUNREGIF unregisters the interface associated to the fd.
- * this is deprecated and will go away.
+ * nr_name (in)
+ * The name of the port (em0, valeXXX:YYY, etc.)
+ * limited to IFNAMSIZ for backward compatibility.
 *
- * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
- * whose identity is set in NIOCREGIF through nr_ringid
+ * nr_version (in/out)
+ * Must match NETMAP_API as used in the kernel, error otherwise.
+ * Always returns the desired value on output.
+ *
+ * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
+ * On input, non-zero values may be used to reconfigure the port
+ * according to the requested values, but this is not guaranteed.
+ * On output the actual values in use are reported.
+ *
+ * nr_ringid (in)
+ * Indicates how rings should be bound to the file descriptors.
+ * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK)
+ * are used to indicate the ring number, and nr_flags specifies
+ * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected.
+ *
+ * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED:
+ * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control
+ * the binding as follows:
+ * 0 (default) binds all physical rings
+ * NETMAP_HW_RING | ring number binds a single ring pair
+ * NETMAP_SW_RING binds only the host tx/rx rings
+ *
+ * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
+ * packets on tx rings only if POLLOUT is set.
+ * The default is to push any pending packet.
+ *
+ * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release
+ * packets on rx rings also when POLLIN is NOT set.
+ * The default is to touch the rx ring only with POLLIN.
+ * Note that this is the opposite of TX because it
+ * reflects the common usage.
+ *
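 *
 * As an illustration (a sketch, not part of this patch), the typical
 * binding sequence built from the fields above is:
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&req, 0, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	req.nr_flags = NR_REG_ALL_NIC;	// all hw ring pairs
 *	ioctl(fd, NIOCREGIF, &req);
 *	// then mmap() req.nr_memsize bytes of fd to reach the rings
 *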
+ *	This information is not significant and applications
+ *	should look at the region id in nr_arg2.
+ *
+ * nr_flags	is the recommended mode to indicate which rings should
+ *		be bound to a file descriptor. Values are NR_REG_*
+ *
+ * nr_arg1 (in)	The number of extra rings to be reserved.
+ *		Especially when allocating a VALE port the system only
+ *		allocates the amount of memory needed for the port.
+ *		If more shared memory rings are desired (e.g. for pipes),
+ *		the first invocation for the same basename/allocator
+ *		should specify a suitable number. Memory cannot be
+ *		extended after the first allocation without closing
+ *		all ports on the same region.
+ *
+ * nr_arg2 (in/out) The identity of the memory region used.
+ *		On input, 0 means the system decides autonomously,
+ *		other values may try to select a specific region.
+ *		On return the actual value is reported.
+ *		Region '1' is the global allocator, normally shared
+ *		by all interfaces. Other values are private regions.
+ *		If two ports use the same region, zero-copy is possible.
+ *
+ * nr_arg3 (in/out)	number of extra buffers to be allocated.
+ *
+ * nr_cmd (in)	if non-zero indicates a special command:
+ *	NETMAP_BDG_ATTACH	and nr_name = vale*:ifname
+ *		attaches the NIC to the switch; nr_ringid specifies
+ *		which rings to use. Used by vale-ctl -a ...
+ *	    nr_arg1 = NETMAP_BDG_HOST also attaches the host port
+ *		as in vale-ctl -h ...
+ *
+ *	NETMAP_BDG_DETACH	and nr_name = vale*:ifname
+ *		disconnects a previously attached NIC.
+ *		Used by vale-ctl -d ...
+ *
+ *	NETMAP_BDG_LIST
+ *		list the configuration of VALE switches.
+ *
+ *	NETMAP_BDG_VNET_HDR
+ *		Set the virtio-net header length used by the client
+ *		of a VALE switch port.
+ *
+ * nr_arg1, nr_arg2, nr_arg3 (in/out)	command specific
 *
- * NETMAP_API is the API version.
 */

/*
- * struct nmreq overlays a struct ifreq
+ * struct nmreq overlays a struct ifreq (just the name)
+ *
+ * On input, nr_ringid indicates which rings we are requesting,
+ * with the low bits giving the specific ring number.
+ *	selection			FLAGS	RING INDEX
+ *
+ *	all the NIC rings		0x0000	-
+ *	only HOST ring			0x2000	-
+ *	single NIC ring			0x4000	ring index
+ *	all the NIC+HOST rings		0x6000	-
+ *	one pipe ring, master		0x8000	ring index
+ *	*** INVALID			0xA000
+ *	one pipe ring, slave		0xC000	ring index
+ *	*** INVALID			0xE000
+ *
 */
struct nmreq {
	char		nr_name[IFNAMSIZ];
	uint32_t	nr_version;	/* API version */
-#define	NETMAP_API	4		/* current version */
	uint32_t	nr_offset;	/* nifp offset in the shared region */
	uint32_t	nr_memsize;	/* size of the shared region */
	uint32_t	nr_tx_slots;	/* slots in tx rings */
	uint32_t	nr_rx_slots;	/* slots in rx rings */
	uint16_t	nr_tx_rings;	/* number of tx rings */
	uint16_t	nr_rx_rings;	/* number of rx rings */
+
	uint16_t	nr_ringid;	/* ring(s) we care about */
-#define NETMAP_HW_RING	0x4000		/* low bits indicate one hw ring */
-#define NETMAP_SW_RING	0x2000		/* process the sw ring */
+#define NETMAP_HW_RING		0x4000	/* single NIC ring pair */
+#define NETMAP_SW_RING		0x2000	/* only host ring pair */
+
+#define NETMAP_RING_MASK	0x0fff	/* the ring number */
+
#define NETMAP_NO_TX_POLL	0x1000	/* no automatic txsync on poll */
-#define NETMAP_RING_MASK 0xfff		/* the ring number */
+
+#define NETMAP_DO_RX_POLL	0x8000	/* DO automatic rxsync on poll */
+
	uint16_t	nr_cmd;
#define NETMAP_BDG_ATTACH	1	/* attach the NIC */
#define NETMAP_BDG_DETACH	2	/* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG	3	/* register lookup function */
#define NETMAP_BDG_LIST		4	/* get bridge's info */
-	uint16_t	nr_arg1;
+#define NETMAP_BDG_VNET_HDR	5	/* set the port virtio-net-hdr length */
+#define NETMAP_BDG_OFFSET	NETMAP_BDG_VNET_HDR	/* deprecated alias */
+
+	uint16_t	nr_arg1;	/* reserve extra rings in NIOCREGIF */
#define NETMAP_BDG_HOST		1	/* attach the host stack on ATTACH */
+
	uint16_t	nr_arg2;
-	uint32_t	spare2[3];
+	uint32_t	nr_arg3;	/* req. extra buffers in NIOCREGIF */
+	uint32_t	nr_flags;	/* various modes, extends nr_ringid */
+	uint32_t	spare2[1];
+};
+
+#define NR_REG_MASK		0xf	/* values for nr_flags */
+enum {	NR_REG_DEFAULT	= 0,	/* backward compat, should not be used. */
+	NR_REG_ALL_NIC	= 1,
+	NR_REG_SW	= 2,
+	NR_REG_NIC_SW	= 3,
+	NR_REG_ONE_NIC	= 4,
+	NR_REG_PIPE_MASTER = 5,
+	NR_REG_PIPE_SLAVE = 6,
 };
+/* monitor uses the NR_REG to select the rings to monitor */
+#define NR_MONITOR_TX	0x100
+#define NR_MONITOR_RX	0x200
+

/*
 * FreeBSD uses the size value embedded in the _IOWR to determine
@@ -343,9 +529,22 @@ struct nmreq {
 */
#define NIOCGINFO	_IOWR('i', 145, struct nmreq) /* return IF info */
#define NIOCREGIF	_IOWR('i', 146, struct nmreq) /* interface register */
-#define NIOCUNREGIF	_IO('i', 147) /* interface unregister */
#define NIOCTXSYNC	_IO('i', 148) /* sync tx queues */
#define NIOCRXSYNC	_IO('i', 149) /* sync rx queues */

#endif /* !NIOCREGIF */

+
+/*
+ * Helper functions for kernel and userspace
+ */
+
+/*
+ * check if space is available in the ring.
+ */
+static inline int
+nm_ring_empty(struct netmap_ring *ring)
+{
+	return (ring->cur == ring->tail);
+}
+
 #endif /* _NET_NETMAP_H_ */
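As a concrete illustration of the NIOCREGIF sequence documented above, here is a minimal user-space sketch. It is not part of the patch: the port name "em0" is only an example, error handling is reduced to early returns, and NETMAP_IF() comes from net/netmap_user.h (updated later in this patch).

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <fcntl.h>
	#include <string.h>
	#include <net/netmap.h>
	#include <net/netmap_user.h>

	/* bind "em0" in netmap mode and return the file descriptor */
	int
	bind_port_example(void)
	{
		struct nmreq req;
		struct netmap_if *nifp;
		void *mem;
		int fd = open("/dev/netmap", O_RDWR);

		if (fd < 0)
			return -1;
		memset(&req, 0, sizeof(req));
		strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
		req.nr_version = NETMAP_API;	/* must match the kernel */
		req.nr_flags = NR_REG_ALL_NIC;	/* bind all NIC ring pairs */
		if (ioctl(fd, NIOCREGIF, &req) < 0)
			return -1;
		/* the kernel reports the size and layout of the shared region */
		mem = mmap(0, req.nr_memsize, PROT_WRITE | PROT_READ,
		    MAP_SHARED, fd, 0);
		if (mem == MAP_FAILED)
			return -1;
		nifp = NETMAP_IF(mem, req.nr_offset);
		(void)nifp;	/* rings are reached via NETMAP_TXRING()/NETMAP_RXRING() */
		return fd;
	}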
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
index fcb5cb3..9c3a4c1 100644
--- a/sys/net/netmap_user.h
+++ b/sys/net/netmap_user.h
@@ -1,40 +1,34 @@
 /*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
- *
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
+ *
 * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
- *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the
- * distribution.
- *
- * 3. Neither the name of the authors nor the names of their contributors
- * may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
 */

/*
 * $FreeBSD$
 *
- * This header contains the macros used to manipulate netmap structures
- * and packets in userspace. See netmap(4) for more information.
+ * Functions and macros to manipulate netmap structures and packets
+ * in userspace. See netmap(4) for more information.
 *
 * The address of the struct netmap_if, say nifp, is computed from the
 * value returned from ioctl(.., NIOCREG, ...) and the mmap region:
@@ -49,22 +43,44 @@
 * we can access ring->nr_cur, ring->nr_avail, ring->nr_flags
 *
 * ring->slot[i] gives us the i-th slot (we can access
- * directly plen, flags, bufindex)
+ * directly len, flags, buf_idx)
 *
 * char *buf = NETMAP_BUF(ring, x) returns a pointer to
 * the buffer numbered x
 *
- * Since rings are circular, we have macros to compute the next index
- * i = NETMAP_RING_NEXT(ring, i);
+ * All ring indexes (head, cur, tail) should always move forward.
+ * To compute the next index in a circular ring you can use
+ * i = nm_ring_next(ring, i);
+ *
+ * To ease porting apps from pcap to netmap we supply a few functions
+ * that can be called to open, close, read and write on netmap in a way
+ * similar to libpcap.
+ * Note that the read/write functions depend on
+ * an ioctl()/select()/poll() being issued to refill rings or push
+ * packets out.
+ *
+ * In order to use these, include #define NETMAP_WITH_LIBS
+ * in the source file that invokes these functions.
 */

#ifndef _NET_NETMAP_USER_H_
#define _NET_NETMAP_USER_H_

+#include <stdint.h>
+#include <stdio.h>	/* fprintf, used by the D() macro below */
+#include <sys/socket.h>	/* apple needs sockaddr */
+#include <net/if.h>	/* IFNAMSIZ */
+
+#ifndef likely
+#define likely(x)	__builtin_expect(!!(x), 1)
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif /* likely and unlikely */
+
+#include <net/netmap.h>
+
+/* helper macro */
 #define _NETMAP_OFFSET(type, ptr, offset) \
	((type)(void *)((char *)(ptr) + (offset)))

-#define NETMAP_IF(b, o)	_NETMAP_OFFSET(struct netmap_if *, b, o)
+#define NETMAP_IF(_base, _ofs)	_NETMAP_OFFSET(struct netmap_if *, _base, _ofs)

 #define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
	nifp, (nifp)->ring_ofs[index] )
@@ -77,19 +93,585 @@
 #define NETMAP_BUF_IDX(ring, buf)			\
	( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
-		(ring)->nr_buf_size )
+		(ring)->nr_buf_size )
+
+
+static inline uint32_t
+nm_ring_next(struct netmap_ring *r, uint32_t i)
+{
+	return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1);
+}
+
+
+/*
+ * Return 1 if we have pending transmissions in the tx ring.
+ * When everything is complete ring->head = ring->tail + 1 (modulo ring size)
+ */
+static inline int
+nm_tx_pending(struct netmap_ring *r)
+{
+	return nm_ring_next(r, r->tail) != r->head;
+}
+

-#define NETMAP_RING_NEXT(r, i)				\
-	((i)+1 == (r)->num_slots ? 0 : (i) + 1 )
+static inline uint32_t
+nm_ring_space(struct netmap_ring *ring)
+{
+	int ret = ring->tail - ring->cur;
+	if (ret < 0)
+		ret += ring->num_slots;
+	return ret;
+}

-#define	NETMAP_RING_FIRST_RESERVED(r)			\
-	( (r)->cur < (r)->reserved ?			\
-	  (r)->cur + (r)->num_slots - (r)->reserved :	\
-	  (r)->cur - (r)->reserved )
+#ifdef NETMAP_WITH_LIBS

/*
- * Return 1 if the given tx ring is empty.
+ * Support for simple I/O libraries.
+ * Include other system headers required for compiling this.
 */
-#define NETMAP_TX_RING_EMPTY(r)	((r)->avail >= (r)->num_slots - 1)
+
+#ifndef HAVE_NETMAP_WITH_LIBS
+#define HAVE_NETMAP_WITH_LIBS
+
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <string.h>	/* memset */
+#include <sys/ioctl.h>
+#include <sys/errno.h>	/* EINVAL */
+#include <fcntl.h>	/* O_RDWR */
+#include <unistd.h>	/* close() */
+#include <signal.h>
+#include <stdlib.h>
+
+#ifndef ND /* debug macros */
+/* debug support */
+#define ND(_fmt, ...) do {} while(0)
+#define D(_fmt, ...)						\
+	do {							\
+		struct timeval t0;				\
+		gettimeofday(&t0, NULL);			\
+		fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n",	\
+		    (int)(t0.tv_sec % 1000), (int)t0.tv_usec,	\
+		    __FUNCTION__, __LINE__, ##__VA_ARGS__);	\
+	} while (0)
+
+/* Rate limited version of "D", lps indicates how many per second */
+#define RD(lps, format, ...)					\
+	do {							\
+		static int t0, __cnt;				\
+		struct timeval __xxts;				\
+		gettimeofday(&__xxts, NULL);			\
+		if (t0 != __xxts.tv_sec) {			\
+			t0 = __xxts.tv_sec;			\
+			__cnt = 0;				\
+		}						\
+		if (__cnt++ < lps) {				\
+			D(format, ##__VA_ARGS__);		\
+		}						\
+	} while (0)
+#endif
+
+struct nm_pkthdr {	/* same as pcap_pkthdr */
+	struct timeval	ts;
+	uint32_t	caplen;
+	uint32_t	len;
+};
+
+struct nm_stat {	/* same as pcap_stat */
+	u_int	ps_recv;
+	u_int	ps_drop;
+	u_int	ps_ifdrop;
+#ifdef WIN32
+	u_int	bs_capt;
+#endif /* WIN32 */
+};
+
+#define NM_ERRBUF_SIZE	512
+
+struct nm_desc {
+	struct nm_desc *self;	/* point to self if netmap. */
+	int fd;
+	void *mem;
+	int memsize;
+	int done_mmap;	/* set if mem is the result of mmap */
+	struct netmap_if * const nifp;
+	uint16_t first_tx_ring, last_tx_ring, cur_tx_ring;
+	uint16_t first_rx_ring, last_rx_ring, cur_rx_ring;
+	struct nmreq req;	/* also contains the nr_name = ifname */
+	struct nm_pkthdr hdr;
+
+	/*
+	 * The memory contains netmap_if, rings and then buffers.
+	 * Given a pointer (e.g. to nm_inject) we can compare with
+	 * mem/buf_start/buf_end to tell if it is a buffer or
+	 * some other descriptor in our region.
+	 * We also store a pointer to some ring as it helps in the
+	 * translation from buffer indexes to addresses.
+	 */
+	struct netmap_ring * const some_ring;
+	void * const buf_start;
+	void * const buf_end;
+	/* parameters from pcap_open_live */
+	int snaplen;
+	int promisc;
+	int to_ms;
+	char *errbuf;
+
+	/* save flags so we can restore them on close */
+	uint32_t if_flags;
+	uint32_t if_reqcap;
+	uint32_t if_curcap;
+
+	struct nm_stat st;
+	char msg[NM_ERRBUF_SIZE];
+};
+
+/*
+ * when the descriptor is open correctly, d->self == d
+ * Eventually we should also use some magic number.
+ */
+#define P2NMD(p)		((struct nm_desc *)(p))
+#define IS_NETMAP_DESC(d)	((d) && P2NMD(d)->self == P2NMD(d))
+#define NETMAP_FD(d)		(P2NMD(d)->fd)
+
+
+/*
+ * this is a slightly optimized copy routine which rounds
+ * to multiple of 64 bytes and is often faster than dealing
+ * with other odd sizes. We assume there is enough room
+ * in the source and destination buffers.
+ *
+ * XXX only for multiples of 64 bytes, non overlapped.
+ */
+static inline void
+nm_pkt_copy(const void *_src, void *_dst, int l)
+{
+	const uint64_t *src = (const uint64_t *)_src;
+	uint64_t *dst = (uint64_t *)_dst;
+
+	if (unlikely(l >= 1024)) {
+		memcpy(dst, src, l);
+		return;
+	}
+	for (; likely(l > 0); l-=64) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+}
+
+
+/*
+ * The callback, invoked on each received packet. Same as libpcap
+ */
+typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d);
+
+/*
+ *--- the pcap-like API ---
+ *
+ * nm_open() opens a file descriptor, binds to a port and maps memory.
+ *
+ * ifname	(netmap:foo or vale:foo) is the port name
+ *		a suffix can indicate the following:
+ *		^		bind the host (sw) ring pair
+ *		*		bind host and NIC ring pairs (transparent)
+ *		-NN		bind individual NIC ring pair
+ *		{NN		bind master side of pipe NN
+ *		}NN		bind slave side of pipe NN
+ *
+ * req		provides the initial values of nmreq before parsing ifname.
+ *		Remember that the ifname parsing will override the ring
+ *		number in nr_ringid, and part of nr_flags;
+ * flags	special functions, normally 0
+ *		indicates which fields of *arg are significant
+ * arg		special functions, normally NULL
+ *		if passed a netmap_desc with mem != NULL,
+ *		use that memory instead of mmap.
+ */
+
+static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req,
+	uint64_t flags, const struct nm_desc *arg);
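For illustration only (not part of the patch), a few nm_open() calls using the suffix syntax documented above; the port names "em0" and "vale0:p0" are hypothetical:

	struct nm_desc *all  = nm_open("netmap:em0", NULL, 0, NULL);	/* all NIC ring pairs */
	struct nm_desc *one  = nm_open("netmap:em0-1", NULL, 0, NULL);	/* NIC ring pair 1 only */
	struct nm_desc *host = nm_open("netmap:em0^", NULL, 0, NULL);	/* host stack rings only */
	struct nm_desc *pipe = nm_open("vale0:p0{3", NULL, 0, NULL);	/* master side of pipe 3 */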
+/*
+ * nm_open can import some fields from the parent descriptor.
+ * These flags control which ones.
+ * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL,
+ * which set the initial value for these flags.
+ * Note that the 16 low bits of the flags are reserved for data
+ * that may go into the nmreq.
+ */
+enum {
+	NM_OPEN_NO_MMAP =	0x040000, /* reuse mmap from parent */
+	NM_OPEN_IFNAME =	0x080000, /* nr_name, nr_ringid, nr_flags */
+	NM_OPEN_ARG1 =		0x100000,
+	NM_OPEN_ARG2 =		0x200000,
+	NM_OPEN_ARG3 =		0x400000,
+	NM_OPEN_RING_CFG =	0x800000, /* tx|rx rings|slots */
+};
+
+
+/*
+ * nm_close() closes and restores the port to its previous state
+ */
+
+static int nm_close(struct nm_desc *);
+
+/*
+ * nm_inject() is the same as pcap_inject()
+ * nm_dispatch() is the same as pcap_dispatch()
+ * nm_nextpkt() is the same as pcap_next()
+ */
+
+static int nm_inject(struct nm_desc *, const void *, size_t);
+static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *);
+static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *);
+
+
+/*
+ * Try to open, return descriptor if successful, NULL otherwise.
+ * An invalid netmap name will return errno = 0;
+ * You can pass a pointer to a pre-filled nm_desc to add special
+ * parameters. The flags argument is used as follows:
+ * NM_OPEN_NO_MMAP	use the memory from arg, only
+ *			if the nr_arg2 (memory block) matches.
+ * NM_OPEN_ARG1		use req.nr_arg1 from arg
+ * NM_OPEN_ARG2		use req.nr_arg2 from arg
+ * NM_OPEN_RING_CFG	use ring config from arg
+ */
+static struct nm_desc *
+nm_open(const char *ifname, const struct nmreq *req,
+	uint64_t new_flags, const struct nm_desc *arg)
+{
+	struct nm_desc *d = NULL;
+	const struct nm_desc *parent = arg;
+	u_int namelen;
+	uint32_t nr_ringid = 0, nr_flags;
+	const char *port = NULL;
+	const char *errmsg = NULL;
+
+	if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) {
+		errno = 0; /* name not recognised, not an error */
+		return NULL;
+	}
+	if (ifname[0] == 'n')
+		ifname += 7;
+	/* scan for a separator */
+	for (port = ifname; *port && !index("-*^{}", *port); port++)
+		;
+	namelen = port - ifname;
+	if (namelen >= sizeof(d->req.nr_name)) {
+		errmsg = "name too long";
+		goto fail;
+	}
+	switch (*port) {
+	default:  /* '\0', no suffix */
+		nr_flags = NR_REG_ALL_NIC;
+		break;
+	case '-': /* one NIC */
+		nr_flags = NR_REG_ONE_NIC;
+		nr_ringid = atoi(port + 1);
+		break;
+	case '*': /* NIC and SW, ignore port */
+		nr_flags = NR_REG_NIC_SW;
+		if (port[1]) {
+			errmsg = "invalid port for nic+sw";
+			goto fail;
+		}
+		break;
+	case '^': /* only sw ring */
+		nr_flags = NR_REG_SW;
+		if (port[1]) {
+			errmsg = "invalid port for sw ring";
+			goto fail;
+		}
+		break;
+	case '{':
+		nr_flags = NR_REG_PIPE_MASTER;
+		nr_ringid = atoi(port + 1);
+		break;
+	case '}':
+		nr_flags = NR_REG_PIPE_SLAVE;
+		nr_ringid = atoi(port + 1);
+		break;
+	}
+
+	if (nr_ringid >= NETMAP_RING_MASK) {
+		errmsg = "invalid ringid";
+		goto fail;
+	}
+	/* add the *XPOLL flags */
+	nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL);
+
+	d = (struct nm_desc *)calloc(1, sizeof(*d));
+	if (d == NULL) {
+		errmsg = "nm_desc alloc failure";
+		errno = ENOMEM;
+		return NULL;
+	}
+	d->self = d;	/* set this early so nm_close() works */
+	d->fd = open("/dev/netmap", O_RDWR);
+	if (d->fd < 0) {
+		errmsg = "cannot open /dev/netmap";
+		goto fail;
+	}
+
+	if (req)
+		d->req = *req;
+	d->req.nr_version = NETMAP_API;
+	d->req.nr_ringid &= ~NETMAP_RING_MASK;
+
+	/* these fields are overridden by ifname and flags processing */
+	d->req.nr_ringid |= nr_ringid;
+	d->req.nr_flags = nr_flags;
+	memcpy(d->req.nr_name, ifname, namelen);
+	d->req.nr_name[namelen] = '\0';
+	/* optionally import info from parent */
+	if (IS_NETMAP_DESC(parent) && new_flags) {
+		if (new_flags & NM_OPEN_ARG1)
+			D("overriding ARG1 %d", parent->req.nr_arg1);
+		d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ?
+			parent->req.nr_arg1 : 4;
+		if (new_flags & NM_OPEN_ARG2)
+			D("overriding ARG2 %d", parent->req.nr_arg2);
+		d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ?
+			parent->req.nr_arg2 : 0;
+		if (new_flags & NM_OPEN_ARG3)
+			D("overriding ARG3 %d", parent->req.nr_arg3);
+		d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ?
+			parent->req.nr_arg3 : 0;
+		if (new_flags & NM_OPEN_RING_CFG) {
+			D("overriding RING_CFG");
+			d->req.nr_tx_slots = parent->req.nr_tx_slots;
+			d->req.nr_rx_slots = parent->req.nr_rx_slots;
+			d->req.nr_tx_rings = parent->req.nr_tx_rings;
+			d->req.nr_rx_rings = parent->req.nr_rx_rings;
+		}
+		if (new_flags & NM_OPEN_IFNAME) {
+			D("overriding ifname %s ringid 0x%x flags 0x%x",
+				parent->req.nr_name, parent->req.nr_ringid,
+				parent->req.nr_flags);
+			memcpy(d->req.nr_name, parent->req.nr_name,
+				sizeof(d->req.nr_name));
+			d->req.nr_ringid = parent->req.nr_ringid;
+			d->req.nr_flags = parent->req.nr_flags;
+		}
+	}
+	if (ioctl(d->fd, NIOCREGIF, &d->req)) {
+		errmsg = "NIOCREGIF failed";
+		goto fail;
+	}
+
+	if (IS_NETMAP_DESC(parent) && parent->mem &&
+	    parent->req.nr_arg2 == d->req.nr_arg2) {
+		/* do not mmap, inherit from parent */
+		d->memsize = parent->memsize;
+		d->mem = parent->mem;
+	} else {
+		d->memsize = d->req.nr_memsize;
+		d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED,
+				d->fd, 0);
+		if (d->mem == MAP_FAILED) { /* mmap() signals errors this way, not with NULL */
+			errmsg = "mmap failed";
+			goto fail;
+		}
+		d->done_mmap = 1;
+	}
+	{
+		struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset);
+		struct netmap_ring *r = NETMAP_RXRING(nifp, 0); /* any ring will do */
+
+		*(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp;
+		*(struct netmap_ring **)(uintptr_t)&d->some_ring = r;
+		*(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0);
+		*(void **)(uintptr_t)&d->buf_end =
+			(char *)d->mem + d->memsize;
+	}
+
+	if (nr_flags == NR_REG_SW) { /* host stack */
+		d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings;
+		d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings;
+	} else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */
+		d->first_tx_ring = 0;
+		d->first_rx_ring = 0;
+		d->last_tx_ring = d->req.nr_tx_rings - 1;
+		d->last_rx_ring = d->req.nr_rx_rings - 1;
+	} else if (nr_flags == NR_REG_NIC_SW) {
+		d->first_tx_ring = 0;
+		d->first_rx_ring = 0;
+		d->last_tx_ring = d->req.nr_tx_rings;
+		d->last_rx_ring = d->req.nr_rx_rings;
+	} else if (nr_flags == NR_REG_ONE_NIC) {
+		/* XXX check validity */
+		d->first_tx_ring = d->last_tx_ring =
+		d->first_rx_ring = d->last_rx_ring = nr_ringid;
+	} else { /* pipes */
+		d->first_tx_ring = d->last_tx_ring = 0;
+		d->first_rx_ring = d->last_rx_ring = 0;
+	}
+
+#ifdef DEBUG_NETMAP_USER
+    { /* debugging code */
+	int i;
+
+	D("%s tx %d .. %d %d rx %d ..
%d %d", ifname, + d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings, + d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings); + for (i = 0; i <= d->req.nr_tx_rings; i++) { + struct netmap_ring *r = NETMAP_TXRING(d->nifp, i); + D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + for (i = 0; i <= d->req.nr_rx_rings; i++) { + struct netmap_ring *r = NETMAP_RXRING(d->nifp, i); + D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + } +#endif /* debugging */ + + d->cur_tx_ring = d->first_tx_ring; + d->cur_rx_ring = d->first_rx_ring; + return d; + +fail: + nm_close(d); + if (errmsg) + D("%s %s", errmsg, ifname); + errno = EINVAL; + return NULL; +} + + +static int +nm_close(struct nm_desc *d) +{ + /* + * ugly trick to avoid unused warnings + */ + static void *__xxzt[] __attribute__ ((unused)) = + { (void *)nm_open, (void *)nm_inject, + (void *)nm_dispatch, (void *)nm_nextpkt } ; + + if (d == NULL || d->self != d) + return EINVAL; + if (d->done_mmap && d->mem) + munmap(d->mem, d->memsize); + if (d->fd != -1) + close(d->fd); + bzero(d, sizeof(*d)); + free(d); + return 0; +} + + +/* + * Same prototype as pcap_inject(), only need to cast. + */ +static int +nm_inject(struct nm_desc *d, const void *buf, size_t size) +{ + u_int c, n = d->last_tx_ring - d->first_tx_ring + 1; + + for (c = 0; c < n ; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + uint32_t i, idx; + uint32_t ri = d->cur_tx_ring + c; + + if (ri > d->last_tx_ring) + ri = d->first_tx_ring; + ring = NETMAP_TXRING(d->nifp, ri); + if (nm_ring_empty(ring)) { + continue; + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + ring->slot[i].len = size; + nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size); + d->cur_tx_ring = ri; + ring->head = ring->cur = nm_ring_next(ring, i); + return size; + } + return 0; /* fail */ +} + + +/* + * Same prototype as pcap_dispatch(), only need to cast. + */ +static int +nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) +{ + int n = d->last_rx_ring - d->first_rx_ring + 1; + int c, got = 0, ri = d->cur_rx_ring; + + if (cnt == 0) + cnt = -1; + /* cnt == -1 means infinite, but rings have a finite amount + * of buffers and the int is large enough that we never wrap, + * so we can omit checking for -1 + */ + for (c=0; c < n && cnt != got; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + + ri = d->cur_rx_ring + c; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + ring = NETMAP_RXRING(d->nifp, ri); + for ( ; !nm_ring_empty(ring) && cnt != got; got++) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + d->hdr.len = d->hdr.caplen = ring->slot[i].len; + d->hdr.ts = ring->ts; + cb(arg, &d->hdr, buf); + ring->head = ring->cur = nm_ring_next(ring, i); + } + } + d->cur_rx_ring = ri; + return got; +} + +static u_char * +nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr) +{ + int ri = d->cur_rx_ring; + + do { + /* compute current ring to use */ + struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri); + if (!nm_ring_empty(ring)) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + hdr->ts = ring->ts; + hdr->len = hdr->caplen = ring->slot[i].len; + ring->cur = nm_ring_next(ring, i); + /* we could postpone advancing head if we want + * to hold the buffer. This can be supported in + * the future. 
+		 */
+			ring->head = ring->cur;
+			d->cur_rx_ring = ri;
+			return buf;
+		}
+		ri++;
+		if (ri > d->last_rx_ring)
+			ri = d->first_rx_ring;
+	} while (ri != d->cur_rx_ring);
+	return NULL; /* nothing found */
+}
+
+#endif /* !HAVE_NETMAP_WITH_LIBS */
+
+#endif /* NETMAP_WITH_LIBS */
 #endif /* _NET_NETMAP_USER_H_ */
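To show how the pieces fit together, here is a minimal end-to-end sketch (not part of the patch) of a receiver built on the pcap-like API above. The port name "em0" is a placeholder, and the loop is kept finite for brevity:

	#define NETMAP_WITH_LIBS
	#include <net/netmap_user.h>
	#include <sys/types.h>
	#include <poll.h>
	#include <stdio.h>

	/* nm_cb_t callback: invoked once per received packet */
	static void
	rx_handler(u_char *arg, const struct nm_pkthdr *h, const u_char *buf)
	{
		(void)arg;
		(void)buf;
		printf("got a %u byte packet\n", h->len);
	}

	int
	main(void)
	{
		struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
		struct pollfd pfd;
		int i;

		if (d == NULL)
			return 1;
		pfd.fd = NETMAP_FD(d);
		pfd.events = POLLIN;
		for (i = 0; i < 10; i++) {
			if (poll(&pfd, 1, 1000) <= 0)
				continue;	/* timeout or error */
			nm_dispatch(d, 0, rx_handler, NULL); /* drain all bound rx rings */
		}
		return nm_close(d);
	}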