author | luigi <luigi@FreeBSD.org> | 2014-02-18 05:46:19 +0000
---|---|---
committer | luigi <luigi@FreeBSD.org> | 2014-02-18 05:46:19 +0000
commit | c9f2fff1da752248a19cc8a978f43fd521639708 (patch) |
tree | c085b4844c52900cd273b45d3fe39706e0fc4bed /sys/dev/netmap |
parent | 5bacc3bb87b954978543b0d82a4d5705e33f5c06 (diff) |
download | FreeBSD-src-c9f2fff1da752248a19cc8a978f43fd521639708.zip FreeBSD-src-c9f2fff1da752248a19cc8a978f43fd521639708.tar.gz |
missing files from previous commit...
Diffstat (limited to 'sys/dev/netmap')
-rw-r--r-- | sys/dev/netmap/netmap_freebsd.c | 655
-rw-r--r-- | sys/dev/netmap/netmap_generic.c | 806
-rw-r--r-- | sys/dev/netmap/netmap_mbq.c | 163
-rw-r--r-- | sys/dev/netmap/netmap_mbq.h | 78
-rw-r--r-- | sys/dev/netmap/netmap_mem2.h | 227
-rw-r--r-- | sys/dev/netmap/netmap_offloadings.c | 401
-rw-r--r-- | sys/dev/netmap/netmap_pipe.c | 711
-rw-r--r-- | sys/dev/netmap/netmap_vale.c | 2103 |
8 files changed, 5144 insertions(+), 0 deletions(-)
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c new file mode 100644 index 0000000..a8e287c --- /dev/null +++ b/sys/dev/netmap/netmap_freebsd.c @@ -0,0 +1,655 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* $FreeBSD$ */ + +#include <sys/types.h> +#include <sys/module.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/poll.h> /* POLLIN, POLLOUT */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/conf.h> /* DEV_MODULE */ +#include <sys/endian.h> + +#include <sys/rwlock.h> + +#include <vm/vm.h> /* vtophys */ +#include <vm/pmap.h> /* vtophys */ +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/uma.h> + + +#include <sys/malloc.h> +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <netinet/in.h> /* in6_cksum_pseudo() */ +#include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + + +/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. */ + uint16_t *words = (uint16_t *)data; + int nw = len / 2; + int i; + + for (i = 0; i < nw; i++) + cur_sum += be16toh(words[i]); + + if (len & 1) + cur_sum += (data[len-1] << 8); + + return cur_sum; +} + +/* Fold a raw checksum: 'cur_sum' is in host byte order, while the + * return value is in network byte order. + */ +uint16_t nm_csum_fold(rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. 
*/ + while (cur_sum >> 16) + cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); + + return htobe16((~cur_sum) & 0xFFFF); +} + +uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +{ +#if 0 + return in_cksum_hdr((void *)iph); +#else + return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); +#endif +} + +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check) +{ + uint16_t pseudolen = datalen + iph->protocol; + + /* Compute and insert the pseudo-header cheksum. */ + *check = in_pseudo(iph->saddr, iph->daddr, + htobe16(pseudolen)); + /* Compute the checksum on TCP/UDP header + payload + * (includes the pseudo-header). + */ + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +} + +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check) +{ +#ifdef INET6 + *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +#else + static int notsupported = 0; + if (!notsupported) { + notsupported = 1; + D("inet6 segmentation not supported"); + } +#endif +} + + +/* + * Intercept the rx routine in the standard device driver. + * Second argument is non-zero to intercept, 0 to restore + */ +int +netmap_catch_rx(struct netmap_adapter *na, int intercept) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct ifnet *ifp = na->ifp; + + if (intercept) { + if (gna->save_if_input) { + D("cannot intercept again"); + return EINVAL; /* already set */ + } + gna->save_if_input = ifp->if_input; + ifp->if_input = generic_rx_handler; + } else { + if (!gna->save_if_input){ + D("cannot restore"); + return EINVAL; /* not saved */ + } + ifp->if_input = gna->save_if_input; + gna->save_if_input = NULL; + } + + return 0; +} + + +/* + * Intercept the packet steering routine in the tx path, + * so that we can decide which queue is used for an mbuf. + * Second argument is non-zero to intercept, 0 to restore. + * On freebsd we just intercept if_transmit. + */ +void +netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +{ + struct netmap_adapter *na = &gna->up.up; + struct ifnet *ifp = na->ifp; + + if (enable) { + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; + } else { + ifp->if_transmit = na->if_transmit; + } +} + + +/* + * Transmit routine used by generic_netmap_txsync(). Returns 0 on success + * and non-zero on error (which may be packet drops or other errors). + * addr and len identify the netmap buffer, m is the (preallocated) + * mbuf to use for transmissions. + * + * We should add a reference to the mbuf so the m_freem() at the end + * of the transmission does not consume resources. + * + * On FreeBSD, and on multiqueue cards, we can force the queue using + * if ((m->m_flags & M_FLOWID) != 0) + * i = m->m_pkthdr.flowid % adapter->num_queues; + * else + * i = curcpu % adapter->num_queues; + * + */ +int +generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, + void *addr, u_int len, u_int ring_nr) +{ + int ret; + + m->m_len = m->m_pkthdr.len = 0; + + // copy data to the mbuf + m_copyback(m, 0, len, addr); + // inc refcount. 
We are alone, so we can skip the atomic + atomic_fetchadd_int(m->m_ext.ref_cnt, 1); + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.rcvif = ifp; /* used for tx notification */ + ret = NA(ifp)->if_transmit(ifp, m); + return ret; +} + + +/* + * The following two functions are empty until we have a generic + * way to extract the info from the ifp + */ +int +generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +{ + D("called"); + return 0; +} + + +void +generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +{ + D("called"); + *txq = netmap_generic_rings; + *rxq = netmap_generic_rings; +} + + +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na) +{ + ND("called"); + mit->mit_pending = 0; + mit->mit_na = na; +} + + +void netmap_mitigation_start(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +void netmap_mitigation_restart(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +int netmap_mitigation_active(struct nm_generic_mit *mit) +{ + ND("called"); + return 0; +} + + +void netmap_mitigation_cleanup(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +/* + * In order to track whether pages are still mapped, we hook into + * the standard cdev_pager and intercept the constructor and + * destructor. + */ + +struct netmap_vm_handle_t { + struct cdev *dev; + struct netmap_priv_d *priv; +}; + + +static int +netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + struct netmap_vm_handle_t *vmh = handle; + + if (netmap_verbose) + D("handle %p size %jd prot %d foff %jd", + handle, (intmax_t)size, prot, (intmax_t)foff); + dev_ref(vmh->dev); + return 0; +} + + +static void +netmap_dev_pager_dtor(void *handle) +{ + struct netmap_vm_handle_t *vmh = handle; + struct cdev *dev = vmh->dev; + struct netmap_priv_d *priv = vmh->priv; + + if (netmap_verbose) + D("handle %p", handle); + netmap_dtor(priv); + free(vmh, M_DEVBUF); + dev_rel(dev); +} + + +static int +netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, + int prot, vm_page_t *mres) +{ + struct netmap_vm_handle_t *vmh = object->handle; + struct netmap_priv_d *priv = vmh->priv; + vm_paddr_t paddr; + vm_page_t page; + vm_memattr_t memattr; + vm_pindex_t pidx; + + ND("object %p offset %jd prot %d mres %p", + object, (intmax_t)offset, prot, mres); + memattr = object->memattr; + pidx = OFF_TO_IDX(offset); + paddr = netmap_mem_ofstophys(priv->np_mref, offset); + if (paddr == 0) + return VM_PAGER_FAIL; + + if (((*mres)->flags & PG_FICTITIOUS) != 0) { + /* + * If the passed in result page is a fake page, update it with + * the new physical address. + */ + page = *mres; + vm_page_updatefake(page, paddr, memattr); + } else { + /* + * Replace the passed in reqpage page with our own fake page and + * free up the all of the original pages. 
+ */ +#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ +#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK +#define VM_OBJECT_WLOCK VM_OBJECT_LOCK +#endif /* VM_OBJECT_WUNLOCK */ + + VM_OBJECT_WUNLOCK(object); + page = vm_page_getfake(paddr, memattr); + VM_OBJECT_WLOCK(object); + vm_page_lock(*mres); + vm_page_free(*mres); + vm_page_unlock(*mres); + *mres = page; + vm_page_insert(page, object, pidx); + } + page->valid = VM_PAGE_BITS_ALL; + return (VM_PAGER_OK); +} + + +static struct cdev_pager_ops netmap_cdev_pager_ops = { + .cdev_pg_ctor = netmap_dev_pager_ctor, + .cdev_pg_dtor = netmap_dev_pager_dtor, + .cdev_pg_fault = netmap_dev_pager_fault, +}; + + +static int +netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, + vm_size_t objsize, vm_object_t *objp, int prot) +{ + int error; + struct netmap_vm_handle_t *vmh; + struct netmap_priv_d *priv; + vm_object_t obj; + + if (netmap_verbose) + D("cdev %p foff %jd size %jd objp %p prot %d", cdev, + (intmax_t )*foff, (intmax_t )objsize, objp, prot); + + vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (vmh == NULL) + return ENOMEM; + vmh->dev = cdev; + + NMG_LOCK(); + error = devfs_get_cdevpriv((void**)&priv); + if (error) + goto err_unlock; + vmh->priv = priv; + priv->np_refcount++; + NMG_UNLOCK(); + + error = netmap_get_memory(priv); + if (error) + goto err_deref; + + obj = cdev_pager_allocate(vmh, OBJT_DEVICE, + &netmap_cdev_pager_ops, objsize, prot, + *foff, NULL); + if (obj == NULL) { + D("cdev_pager_allocate failed"); + error = EINVAL; + goto err_deref; + } + + *objp = obj; + return 0; + +err_deref: + NMG_LOCK(); + priv->np_refcount--; +err_unlock: + NMG_UNLOCK(); +// err: + free(vmh, M_DEVBUF); + return error; +} + + +// XXX can we remove this ? +static int +netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + if (netmap_verbose) + D("dev %p fflag 0x%x devtype %d td %p", + dev, fflag, devtype, td); + return 0; +} + + +static int +netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct netmap_priv_d *priv; + int error; + + (void)dev; + (void)oflags; + (void)devtype; + (void)td; + + // XXX wait or nowait ? + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return ENOMEM; + + error = devfs_set_cdevpriv(priv, netmap_dtor); + if (error) + return error; + + priv->np_refcount = 1; + + return 0; +} + +/******************** kqueue support ****************/ + +/* + * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * We use a non-zero argument to distinguish the call from the one + * in kevent_scan() which instead also needs to run netmap_poll(). + * The knote uses a global mutex for the time being. We might + * try to reuse the one in the si, but it is not allocated + * permanently so it might be a bit tricky. + * + * The *kqfilter function registers one or another f_event + * depending on read or write mode. + * In the call to f_event() td_fpop is NULL so any child function + * calling devfs_get_cdevpriv() would fail - and we need it in + * netmap_poll(). As a workaround we store priv into kn->kn_hook + * and pass it as first argument to netmap_poll(), which then + * uses the failure to tell that we are called from f_event() + * and do not need the selrecord(). 
+ */ + +void freebsd_selwakeup(struct selinfo *si, int pri); + +void +freebsd_selwakeup(struct selinfo *si, int pri) +{ + if (netmap_verbose) + D("on knote %p", &si->si_note); + selwakeuppri(si, pri); + /* use a non-zero hint to tell the notification from the + * call done in kqueue_scan() which uses 0 + */ + KNOTE_UNLOCKED(&si->si_note, 0x100 /* notification */); +} + +static void +netmap_knrdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_rxsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +static void +netmap_knwdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_txsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +/* + * callback from notifies (generated externally) and our + * calls to kevent(). The former we just return 1 (ready) + * since we do not know better. + * In the latter we call netmap_poll and return 0/1 accordingly. + */ +static int +netmap_knrw(struct knote *kn, long hint, int events) +{ + struct netmap_priv_d *priv; + int revents; + + if (hint != 0) { + ND(5, "call from notify"); + return 1; /* assume we are ready */ + } + priv = kn->kn_hook; + /* the notification may come from an external thread, + * in which case we do not want to run the netmap_poll + * This should be filtered above, but check just in case. + */ + if (curthread != priv->np_td) { /* should not happen */ + RD(5, "curthread changed %p %p", curthread, priv->np_td); + return 1; + } else { + revents = netmap_poll((void *)priv, events, curthread); + return (events & revents) ? 1 : 0; + } +} + +static int +netmap_knread(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLIN); +} + +static int +netmap_knwrite(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLOUT); +} + +static struct filterops netmap_rfiltops = { + .f_isfd = 1, + .f_detach = netmap_knrdetach, + .f_event = netmap_knread, +}; + +static struct filterops netmap_wfiltops = { + .f_isfd = 1, + .f_detach = netmap_knwdetach, + .f_event = netmap_knwrite, +}; + + +/* + * This is called when a thread invokes kevent() to record + * a change in the configuration of the kqueue(). + * The 'priv' should be the same as in the netmap device. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + struct netmap_priv_d *priv; + int error; + struct netmap_adapter *na; + struct selinfo *si; + int ev = kn->kn_filter; + + if (ev != EVFILT_READ && ev != EVFILT_WRITE) { + D("bad filter request %d", ev); + return 1; + } + error = devfs_get_cdevpriv((void**)&priv); + if (error) { + D("device not yet setup"); + return 1; + } + na = priv->np_na; + if (na == NULL) { + D("no netmap adapter for this file descriptor"); + return 1; + } + /* the si is indicated in the priv */ + si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi; + // XXX lock(priv) ? + kn->kn_fop = (ev == EVFILT_WRITE) ? + &netmap_wfiltops : &netmap_rfiltops; + kn->kn_hook = priv; + knlist_add(&si->si_note, kn, 1); + // XXX unlock(priv) + ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", + na, na->ifp->if_xname, curthread, priv, kn, + priv->np_nifp, + kn->kn_fp == curthread->td_fpop ? 
"match" : "MISMATCH"); + return 0; +} + +struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_open = netmap_open, + .d_mmap_single = netmap_mmap_single, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, + .d_kqfilter = netmap_kqfilter, + .d_close = netmap_close, +}; +/*--- end of kqueue support ----*/ + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c new file mode 100644 index 0000000..63253b6 --- /dev/null +++ b/sys/dev/netmap/netmap_generic.c @@ -0,0 +1,806 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This module implements netmap support on top of standard, + * unmodified device drivers. + * + * A NIOCREGIF request is handled here if the device does not + * have native support. TX and RX rings are emulated as follows: + * + * NIOCREGIF + * We preallocate a block of TX mbufs (roughly as many as + * tx descriptors; the number is not critical) to speed up + * operation during transmissions. The refcount on most of + * these buffers is artificially bumped up so we can recycle + * them more easily. Also, the destructor is intercepted + * so we use it as an interrupt notification to wake up + * processes blocked on a poll(). + * + * For each receive ring we allocate one "struct mbq" + * (an mbuf tailq plus a spinlock). We intercept packets + * (through if_input) + * on the receive path and put them in the mbq from which + * netmap receive routines can grab them. + * + * TX: + * in the generic_txsync() routine, netmap buffers are copied + * (or linked, in a future) to the preallocated mbufs + * and pushed to the transmit queue. 
Some of these mbufs + * (those with NS_REPORT, or otherwise every half ring) + * have the refcount=1, others have refcount=2. + * When the destructor is invoked, we take that as + * a notification that all mbufs up to that one in + * the specific ring have been completed, and generate + * the equivalent of a transmit interrupt. + * + * RX: + * + */ + +#ifdef __FreeBSD__ + +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/lock.h> /* PROT_EXEC */ +#include <sys/rwlock.h> +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* in netmap_kern.h */ + +// XXX temporary - D() defined here +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#define rtnl_lock() D("rtnl_lock called"); +#define rtnl_unlock() D("rtnl_unlock called"); +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) +#define smp_mb() + +/* + * mbuf wrappers + */ + +/* + * we allocate an EXT_PACKET + */ +#define netmap_get_mbuf(len) m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR|M_NOFREE) + +/* mbuf destructor, also need to change the type to EXT_EXTREF, + * add an M_NOFREE flag, and then clear the flag and + * chain into uma_zfree(zone_pack, mf) + * (or reinstall the buffer ?) + */ +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ + (m)->m_ext.ext_type = EXT_EXTREF; \ +} while (0) + + +#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1) + + + +#else /* linux */ + +#include "bsd_glue.h" + +#include <linux/rtnetlink.h> /* rtnl_[un]lock() */ +#include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */ +#include <linux/hrtimer.h> + +//#define RATE /* Enables communication statistics. */ + +//#define REG_RESET + +#endif /* linux */ + + +/* Common headers. */ +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + + + +/* ======================== usage stats =========================== */ + +#ifdef RATE +#define IFRATE(x) x +struct rate_stats { + unsigned long txpkt; + unsigned long txsync; + unsigned long txirq; + unsigned long rxpkt; + unsigned long rxirq; + unsigned long rxsync; +}; + +struct rate_context { + unsigned refcount; + struct timer_list timer; + struct rate_stats new; + struct rate_stats old; +}; + +#define RATE_PRINTK(_NAME_) \ + printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); +#define RATE_PERIOD 2 +static void rate_callback(unsigned long arg) +{ + struct rate_context * ctx = (struct rate_context *)arg; + struct rate_stats cur = ctx->new; + int r; + + RATE_PRINTK(txpkt); + RATE_PRINTK(txsync); + RATE_PRINTK(txirq); + RATE_PRINTK(rxpkt); + RATE_PRINTK(rxsync); + RATE_PRINTK(rxirq); + printk("\n"); + + ctx->old = cur; + r = mod_timer(&ctx->timer, jiffies + + msecs_to_jiffies(RATE_PERIOD * 1000)); + if (unlikely(r)) + D("[v1000] Error: mod_timer()"); +} + +static struct rate_context rate_ctx; + +#else /* !RATE */ +#define IFRATE(x) +#endif /* !RATE */ + + +/* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */ +#define GENERIC_BUF_SIZE netmap_buf_size /* Size of the mbufs in the Tx pool. */ + +/* + * Wrapper used by the generic adapter layer to notify + * the poller threads. Differently from netmap_rx_irq(), we check + * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq. 
+ */ +static void +netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) +{ + if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP))) + return; + + netmap_common_irq(ifp, q, work_done); +} + + +/* Enable/disable netmap mode for a generic network interface. */ +static int +generic_netmap_register(struct netmap_adapter *na, int enable) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct mbuf *m; + int error; + int i, r; + + if (!na) + return EINVAL; + +#ifdef REG_RESET + error = ifp->netdev_ops->ndo_stop(ifp); + if (error) { + return error; + } +#endif /* REG_RESET */ + + if (enable) { /* Enable netmap mode. */ + /* Init the mitigation support. */ + gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!gna->mit) { + D("mitigation allocation failed"); + error = ENOMEM; + goto out; + } + for (r=0; r<na->num_rx_rings; r++) + netmap_mitigation_init(&gna->mit[r], na); + + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as netmap_catch_rx() returns. + */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_init(&na->rx_rings[r].rx_queue); + } + + /* + * Preallocate packet buffers for the tx rings. + */ + for (r=0; r<na->num_tx_rings; r++) + na->tx_rings[r].tx_pool = NULL; + for (r=0; r<na->num_tx_rings; r++) { + na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!na->tx_rings[r].tx_pool) { + D("tx_pool allocation failed"); + error = ENOMEM; + goto free_tx_pools; + } + for (i=0; i<na->num_tx_desc; i++) + na->tx_rings[r].tx_pool[i] = NULL; + for (i=0; i<na->num_tx_desc; i++) { + m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (!m) { + D("tx_pool[%d] allocation failed", i); + error = ENOMEM; + goto free_tx_pools; + } + na->tx_rings[r].tx_pool[i] = m; + } + } + rtnl_lock(); + /* Prepare to intercept incoming traffic. */ + error = netmap_catch_rx(na, 1); + if (error) { + D("netdev_rx_handler_register() failed (%d)", error); + goto register_handler; + } + ifp->if_capenable |= IFCAP_NETMAP; + + /* Make netmap control the packet steering. */ + netmap_catch_tx(gna, 1); + + rtnl_unlock(); + +#ifdef RATE + if (rate_ctx.refcount == 0) { + D("setup_timer()"); + memset(&rate_ctx, 0, sizeof(rate_ctx)); + setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); + if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { + D("Error: mod_timer()"); + } + } + rate_ctx.refcount++; +#endif /* RATE */ + + } else if (na->tx_rings[0].tx_pool) { + /* Disable netmap mode. We enter here only if the previous + generic_netmap_register(na, 1) was successfull. + If it was not, na->tx_rings[0].tx_pool was set to NULL by the + error handling code below. */ + rtnl_lock(); + + ifp->if_capenable &= ~IFCAP_NETMAP; + + /* Release packet steering control. */ + netmap_catch_tx(gna, 0); + + /* Do not intercept packets on the rx path. 
*/ + netmap_catch_rx(na, 0); + + rtnl_unlock(); + + /* Free the mbufs going to the netmap rings */ + for (r=0; r<na->num_rx_rings; r++) { + mbq_safe_purge(&na->rx_rings[r].rx_queue); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } + + for (r=0; r<na->num_rx_rings; r++) + netmap_mitigation_cleanup(&gna->mit[r]); + free(gna->mit, M_DEVBUF); + + for (r=0; r<na->num_tx_rings; r++) { + for (i=0; i<na->num_tx_desc; i++) { + m_freem(na->tx_rings[r].tx_pool[i]); + } + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } + +#ifdef RATE + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } +#endif + } + +#ifdef REG_RESET + error = ifp->netdev_ops->ndo_open(ifp); + if (error) { + goto free_tx_pools; + } +#endif + + return 0; + +register_handler: + rtnl_unlock(); +free_tx_pools: + for (r=0; r<na->num_tx_rings; r++) { + if (na->tx_rings[r].tx_pool == NULL) + continue; + for (i=0; i<na->num_tx_desc; i++) + if (na->tx_rings[r].tx_pool[i]) + m_freem(na->tx_rings[r].tx_pool[i]); + free(na->tx_rings[r].tx_pool, M_DEVBUF); + na->tx_rings[r].tx_pool = NULL; + } + for (r=0; r<na->num_rx_rings; r++) { + netmap_mitigation_cleanup(&gna->mit[r]); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } + free(gna->mit, M_DEVBUF); +out: + + return error; +} + +/* + * Callback invoked when the device driver frees an mbuf used + * by netmap to transmit a packet. This usually happens when + * the NIC notifies the driver that transmission is completed. + */ +static void +generic_mbuf_destructor(struct mbuf *m) +{ + if (netmap_verbose) + D("Tx irq (%p) queue %d", m, MBUF_TXQ(m)); + netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); +#ifdef __FreeBSD__ + m->m_ext.ext_type = EXT_PACKET; + m->m_ext.ext_free = NULL; + if (*(m->m_ext.ref_cnt) == 0) + *(m->m_ext.ref_cnt) = 1; + uma_zfree(zone_pack, m); +#endif /* __FreeBSD__ */ + IFRATE(rate_ctx.new.txirq++); +} + +/* Record completed transmissions and update hwtail. + * + * The oldest tx buffer not yet completed is at nr_hwtail + 1, + * nr_hwcur is the first unsent buffer. + */ +static u_int +generic_netmap_tx_clean(struct netmap_kring *kring) +{ + u_int const lim = kring->nkr_num_slots - 1; + u_int nm_i = nm_next(kring->nr_hwtail, lim); + u_int hwcur = kring->nr_hwcur; + u_int n = 0; + struct mbuf **tx_pool = kring->tx_pool; + + while (nm_i != hwcur) { /* buffers not completed */ + struct mbuf *m = tx_pool[nm_i]; + + if (unlikely(m == NULL)) { + /* this is done, try to replenish the entry */ + tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed, XXX error"); + // XXX how do we proceed ? break ? + return -ENOMEM; + } + } else if (GET_MBUF_REFCNT(m) != 1) { + break; /* This mbuf is still busy: its refcnt is 2. */ + } + n++; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwtail = nm_prev(nm_i, lim); + ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail); + + return n; +} + + +/* + * We have pending packets in the driver between nr_hwtail +1 and hwcur. + * Compute a position in the middle, to be used to generate + * a notification. 
+ */ +static inline u_int +generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) +{ + u_int n = kring->nkr_num_slots; + u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int e; + + if (hwcur >= ntc) { + e = (hwcur + ntc) / 2; + } else { /* wrap around */ + e = (hwcur + n + ntc) / 2; + if (e >= n) { + e -= n; + } + } + + if (unlikely(e >= n)) { + D("This cannot happen"); + e = 0; + } + + return e; +} + +/* + * We have pending packets in the driver between nr_hwtail+1 and hwcur. + * Schedule a notification approximately in the middle of the two. + * There is a race but this is only called within txsync which does + * a double check. + */ +static void +generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) +{ + struct mbuf *m; + u_int e; + + if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + return; /* all buffers are free */ + } + e = generic_tx_event_middle(kring, hwcur); + + m = kring->tx_pool[e]; + if (m == NULL) { + /* This can happen if there is already an event on the netmap + slot 'e': There is nothing to do. */ + return; + } + ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m)); + kring->tx_pool[e] = NULL; + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + + // XXX wmb() ? + /* Decrement the refcount an free it if we have the last one. */ + m_freem(m); + smp_mb(); +} + + +/* + * generic_netmap_txsync() transforms netmap buffers into mbufs + * and passes them to the standard device driver + * (ndo_start_xmit() or ifp->if_transmit() ). + * On linux this is not done directly, but using dev_queue_xmit(), + * since it implements the TX flow control (and takes some locks). + */ +static int +generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct ifnet *ifp = na->ifp; + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ // j + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + + IFRATE(rate_ctx.new.txsync++); + + // TODO: handle the case of mbuf allocation failure + + rmb(); + + /* + * First part: process new packets to send. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + void *addr = NMB(slot); + + /* device-specific */ + struct mbuf *m; + int tx_ret; + + NM_CHECK_ADDR_LEN(addr, len); + + /* Tale a mbuf from the tx pool and copy in the user packet. */ + m = kring->tx_pool[nm_i]; + if (unlikely(!m)) { + RD(5, "This should never happen"); + kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed"); + break; + } + } + /* XXX we should ask notifications when NS_REPORT is set, + * or roughly every half frame. We can optimize this + * by lazily requesting notifications only when a + * transmission fails. Probably the best way is to + * break on failures and set notifications when + * ring->cur == ring->tail || nm_i != cur + */ + tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + if (unlikely(tx_ret)) { + RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", + tx_ret, nm_i, head, kring->nr_hwtail); + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. 
+ * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring)) { /* space now available */ + continue; + } else { + break; + } + } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + nm_i = nm_next(nm_i, lim); + IFRATE(rate_ctx.new.txpkt ++); + } + + /* Update hwcur to the next slot to transmit. */ + kring->nr_hwcur = nm_i; /* not head, we could break early */ + } + + /* + * Second, reclaim completed buffers + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + /* No more available slots? Set a notification event + * on a netmap slot that will be cleaned in the future. + * No doublecheck is performed, since txsync() will be + * called twice by netmap_poll(). + */ + generic_set_tx_event(kring, nm_i); + } + ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); + + generic_netmap_tx_clean(kring); + + nm_txsync_finalize(kring); + + return 0; +} + + +/* + * This handler is registered (through netmap_catch_rx()) + * within the attached network interface + * in the RX subsystem, so that every mbuf passed up by + * the driver can be stolen to the network stack. + * Stolen packets are put in a queue where the + * generic_netmap_rxsync() callback can extract them. + */ +void +generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_adapter *na = NA(ifp); + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + u_int work_done; + u_int rr = MBUF_RXQ(m); // receive ring number + + if (rr >= na->num_rx_rings) { + rr = rr % na->num_rx_rings; // XXX expensive... + } + + /* limit the size of the queue */ + if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + m_freem(m); + } else { + mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + } + + if (netmap_generic_mit < 32768) { + /* no rx mitigation, pass notification up */ + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + } else { + /* same as send combining, filter notification if there is a + * pending timer, otherwise pass it up and start a timer. + */ + if (likely(netmap_mitigation_active(&gna->mit[rr]))) { + /* Record that there is some pending work. */ + gna->mit[rr].mit_pending = 1; + } else { + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + netmap_mitigation_start(&gna->mit[rr]); + } + } +} + +/* + * generic_netmap_rxsync() extracts mbufs from the queue filled by + * generic_netmap_rx_handler() and puts their content in the netmap + * receive ring. + * Access must be protected because the rx handler is asynchronous, + */ +static int +generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ //j, + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + if (head > lim) + return netmap_ring_reinit(kring); + + /* + * First part: import newly received packets. 
+ */ + if (netmap_no_pendintr || force_update) { + /* extract buffers from the rx queue, stop at most one + * slot before nr_hwcur (stop_i) + */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int stop_i = nm_prev(kring->nr_hwcur, lim); + + nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ + for (n = 0; nm_i != stop_i; n++) { + int len; + void *addr = NMB(&ring->slot[nm_i]); + struct mbuf *m; + + /* we only check the address here on generic rx rings */ + if (addr == netmap_buffer_base) { /* Bad buffer */ + return netmap_ring_reinit(kring); + } + /* + * Call the locked version of the function. + * XXX Ideally we could grab a batch of mbufs at once + * and save some locking overhead. + */ + m = mbq_safe_dequeue(&kring->rx_queue); + if (!m) /* no more data */ + break; + len = MBUF_LEN(m); + m_copydata(m, 0, len, addr); + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; + m_freem(m); + nm_i = nm_next(nm_i, lim); + } + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; + } + + // XXX should we invert the order ? + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* Userspace has released some packets. */ + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + /* tell userspace that there might be new packets. */ + nm_rxsync_finalize(kring); + IFRATE(rate_ctx.new.rxsync++); + + return 0; +} + +static void +generic_netmap_dtor(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; + struct netmap_adapter *prev_na = gna->prev; + + if (prev_na != NULL) { + D("Released generic NA %p", gna); + if_rele(na->ifp); + netmap_adapter_put(prev_na); + } + if (ifp != NULL) { + WNA(ifp) = prev_na; + D("Restored native NA %p", prev_na); + na->ifp = NULL; + } +} + +/* + * generic_netmap_attach() makes it possible to use netmap on + * a device without native netmap support. + * This is less performant than native support but potentially + * faster than raw sockets or similar schemes. + * + * In this "emulated" mode, netmap rings do not necessarily + * have the same size as those in the NIC. We use a default + * value and possibly override it if the OS has ways to fetch the + * actual configuration. 
+ */ +int +generic_netmap_attach(struct ifnet *ifp) +{ + struct netmap_adapter *na; + struct netmap_generic_adapter *gna; + int retval; + u_int num_tx_desc, num_rx_desc; + + num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ + + generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); + ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); + + gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (gna == NULL) { + D("no memory on attach, give up"); + return ENOMEM; + } + na = (struct netmap_adapter *)gna; + na->ifp = ifp; + na->num_tx_desc = num_tx_desc; + na->num_rx_desc = num_rx_desc; + na->nm_register = &generic_netmap_register; + na->nm_txsync = &generic_netmap_txsync; + na->nm_rxsync = &generic_netmap_rxsync; + na->nm_dtor = &generic_netmap_dtor; + /* when using generic, IFCAP_NETMAP is set so we force + * NAF_SKIP_INTR to use the regular interrupt handler + */ + na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS; + + ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", + ifp->num_tx_queues, ifp->real_num_tx_queues, + ifp->tx_queue_len); + ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", + ifp->num_rx_queues, ifp->real_num_rx_queues); + + generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + + retval = netmap_attach_common(na); + if (retval) { + free(gna, M_DEVBUF); + } + + return retval; +} diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c new file mode 100644 index 0000000..2606b13 --- /dev/null +++ b/sys/dev/netmap/netmap_mbq.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * $FreeBSD$ + */ + + +#ifdef linux +#include "bsd_glue.h" +#else /* __FreeBSD__ */ +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#endif /* __FreeBSD__ */ + +#include "netmap_mbq.h" + + +static inline void __mbq_init(struct mbq *q) +{ + q->head = q->tail = NULL; + q->count = 0; +} + + +void mbq_safe_init(struct mbq *q) +{ + mtx_init(&q->lock, "mbq", NULL, MTX_SPIN); + __mbq_init(q); +} + + +void mbq_init(struct mbq *q) +{ + __mbq_init(q); +} + + +static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) +{ + m->m_nextpkt = NULL; + if (q->tail) { + q->tail->m_nextpkt = m; + q->tail = m; + } else { + q->head = q->tail = m; + } + q->count++; +} + + +void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) +{ + mtx_lock(&q->lock); + __mbq_enqueue(q, m); + mtx_unlock(&q->lock); +} + + +void mbq_enqueue(struct mbq *q, struct mbuf *m) +{ + __mbq_enqueue(q, m); +} + + +static inline struct mbuf *__mbq_dequeue(struct mbq *q) +{ + struct mbuf *ret = NULL; + + if (q->head) { + ret = q->head; + q->head = ret->m_nextpkt; + if (q->head == NULL) { + q->tail = NULL; + } + q->count--; + ret->m_nextpkt = NULL; + } + + return ret; +} + + +struct mbuf *mbq_safe_dequeue(struct mbq *q) +{ + struct mbuf *ret; + + mtx_lock(&q->lock); + ret = __mbq_dequeue(q); + mtx_unlock(&q->lock); + + return ret; +} + + +struct mbuf *mbq_dequeue(struct mbq *q) +{ + return __mbq_dequeue(q); +} + + +/* XXX seems pointless to have a generic purge */ +static void __mbq_purge(struct mbq *q, int safe) +{ + struct mbuf *m; + + for (;;) { + m = safe ? mbq_safe_dequeue(q) : mbq_dequeue(q); + if (m) { + m_freem(m); + } else { + break; + } + } +} + + +void mbq_purge(struct mbq *q) +{ + __mbq_purge(q, 0); +} + + +void mbq_safe_purge(struct mbq *q) +{ + __mbq_purge(q, 1); +} + + +void mbq_safe_destroy(struct mbq *q) +{ + mtx_destroy(&q->lock); +} + + +void mbq_destroy(struct mbq *q) +{ +} diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h new file mode 100644 index 0000000..d273d8a --- /dev/null +++ b/sys/dev/netmap/netmap_mbq.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * $FreeBSD$ + */ + + +#ifndef __NETMAP_MBQ_H__ +#define __NETMAP_MBQ_H__ + +/* + * These function implement an mbuf tailq with an optional lock. + * The base functions act ONLY ON THE QUEUE, whereas the "safe" + * variants (mbq_safe_*) also handle the lock. + */ + +/* XXX probably rely on a previous definition of SPINLOCK_T */ +#ifdef linux +#define SPINLOCK_T safe_spinlock_t +#else +#define SPINLOCK_T struct mtx +#endif + +/* A FIFO queue of mbufs with an optional lock. */ +struct mbq { + struct mbuf *head; + struct mbuf *tail; + int count; + SPINLOCK_T lock; +}; + +/* XXX "destroy" does not match "init" as a name. + * We should also clarify whether init can be used while + * holding a lock, and whether mbq_safe_destroy() is a NOP. + */ +void mbq_init(struct mbq *q); +void mbq_destroy(struct mbq *q); +void mbq_enqueue(struct mbq *q, struct mbuf *m); +struct mbuf *mbq_dequeue(struct mbq *q); +void mbq_purge(struct mbq *q); + +/* XXX missing mbq_lock() and mbq_unlock */ + +void mbq_safe_init(struct mbq *q); +void mbq_safe_destroy(struct mbq *q); +void mbq_safe_enqueue(struct mbq *q, struct mbuf *m); +struct mbuf *mbq_safe_dequeue(struct mbq *q); +void mbq_safe_purge(struct mbq *q); + +static inline unsigned int mbq_len(struct mbq *q) +{ + return q->count; +} + +#endif /* __NETMAP_MBQ_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h new file mode 100644 index 0000000..e83616a --- /dev/null +++ b/sys/dev/netmap/netmap_mem2.h @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * + * (New) memory allocator for netmap + */ + +/* + * This allocator creates three memory pools: + * nm_if_pool for the struct netmap_if + * nm_ring_pool for the struct netmap_ring + * nm_buf_pool for the packet buffers. + * + * that contain netmap objects. Each pool is made of a number of clusters, + * multiple of a page size, each containing an integer number of objects. + * The clusters are contiguous in user space but not in the kernel. + * Only nm_buf_pool needs to be dma-able, + * but for convenience use the same type of allocator for all. 
+ * + * Once mapped, the three pools are exported to userspace + * as a contiguous block, starting from nm_if_pool. Each + * cluster (and pool) is an integral number of pages. + * [ . . . ][ . . . . . .][ . . . . . . . . . .] + * nm_if nm_ring nm_buf + * + * The userspace areas contain offsets of the objects in userspace. + * When (at init time) we write these offsets, we find out the index + * of the object, and from there locate the offset from the beginning + * of the region. + * + * The invididual allocators manage a pool of memory for objects of + * the same size. + * The pool is split into smaller clusters, whose size is a + * multiple of the page size. The cluster size is chosen + * to minimize the waste for a given max cluster size + * (we do it by brute force, as we have relatively few objects + * per cluster). + * + * Objects are aligned to the cache line (64 bytes) rounding up object + * sizes when needed. A bitmap contains the state of each object. + * Allocation scans the bitmap; this is done only on attach, so we are not + * too worried about performance + * + * For each allocator we can define (thorugh sysctl) the size and + * number of each object. Memory is allocated at the first use of a + * netmap file descriptor, and can be freed when all such descriptors + * have been released (including unmapping the memory). + * If memory is scarce, the system tries to get as much as possible + * and the sysctl values reflect the actual allocation. + * Together with desired values, the sysctl export also absolute + * min and maximum values that cannot be overridden. + * + * struct netmap_if: + * variable size, max 16 bytes per ring pair plus some fixed amount. + * 1024 bytes should be large enough in practice. + * + * In the worst case we have one netmap_if per ring in the system. + * + * struct netmap_ring + * variable size, 8 byte per slot plus some fixed amount. + * Rings can be large (e.g. 4k slots, or >32Kbytes). + * We default to 36 KB (9 pages), and a few hundred rings. + * + * struct netmap_buffer + * The more the better, both because fast interfaces tend to have + * many slots, and because we may want to use buffers to store + * packets in userspace avoiding copies. + * Must contain a full frame (eg 1518, or more for vlans, jumbo + * frames etc.) plus be nicely aligned, plus some NICs restrict + * the size to multiple of 1K or so. Default to 2K + */ +#ifndef _NET_NETMAP_MEM2_H_ +#define _NET_NETMAP_MEM2_H_ + + +#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */ + +#define NETMAP_POOL_MAX_NAMSZ 32 + + +enum { + NETMAP_IF_POOL = 0, + NETMAP_RING_POOL, + NETMAP_BUF_POOL, + NETMAP_POOLS_NR +}; + + +struct netmap_obj_params { + u_int size; + u_int num; +}; +struct netmap_obj_pool { + char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */ + + /* ---------------------------------------------------*/ + /* these are only meaningful if the pool is finalized */ + /* (see 'finalized' field in netmap_mem_d) */ + u_int objtotal; /* actual total number of objects. */ + u_int memtotal; /* actual total memory space */ + u_int numclusters; /* actual number of clusters */ + + u_int objfree; /* number of free objects. 
*/ + + struct lut_entry *lut; /* virt,phys addresses, objtotal entries */ + uint32_t *bitmap; /* one bit per buffer, 1 means free */ + uint32_t bitmap_slots; /* number of uint32 entries in bitmap */ + /* ---------------------------------------------------*/ + + /* limits */ + u_int objminsize; /* minimum object size */ + u_int objmaxsize; /* maximum object size */ + u_int nummin; /* minimum number of objects */ + u_int nummax; /* maximum number of objects */ + + /* these are changed only by config */ + u_int _objtotal; /* total number of objects */ + u_int _objsize; /* object size */ + u_int _clustsize; /* cluster size */ + u_int _clustentries; /* objects per cluster */ + u_int _numclusters; /* number of clusters */ + + /* requested values */ + u_int r_objtotal; + u_int r_objsize; +}; + +#ifdef linux +// XXX a mtx would suffice here 20130415 lr +#define NMA_LOCK_T struct semaphore +#else /* !linux */ +#define NMA_LOCK_T struct mtx +#endif /* linux */ + +typedef int (*netmap_mem_config_t)(struct netmap_mem_d*); +typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*); +typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*); + +typedef uint16_t nm_memid_t; + +/* We implement two kinds of netmap_mem_d structures: + * + * - global: used by hardware NICS; + * + * - private: used by VALE ports. + * + * In both cases, the netmap_mem_d structure has the same lifetime as the + * netmap_adapter of the corresponding NIC or port. It is the responsibility of + * the client code to delete the private allocator when the associated + * netmap_adapter is freed (this is implemented by the NAF_MEM_OWNER flag in + * netmap.c). The 'refcount' field counts the number of active users of the + * structure. The global allocator uses this information to prevent/allow + * reconfiguration. The private allocators release all their memory when there + * are no active users. By 'active user' we mean an existing netmap_priv + * structure holding a reference to the allocator. 
+ */ +struct netmap_mem_d { + NMA_LOCK_T nm_mtx; /* protect the allocator */ + u_int nm_totalsize; /* shorthand */ + + u_int flags; +#define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */ +#define NETMAP_MEM_PRIVATE 0x2 /* uses private address space */ + int lasterr; /* last error for curr config */ + int refcount; /* existing priv structures */ + /* the three allocators */ + struct netmap_obj_pool pools[NETMAP_POOLS_NR]; + + netmap_mem_config_t config; + netmap_mem_finalize_t finalize; + netmap_mem_deref_t deref; + + nm_memid_t nm_id; /* allocator identifier */ + + /* list of all existing allocators, sorted by nm_id */ + struct netmap_mem_d *prev, *next; +}; + +extern struct netmap_mem_d nm_mem; + +vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); +int netmap_mem_finalize(struct netmap_mem_d *); +int netmap_mem_init(void); +void netmap_mem_fini(void); +struct netmap_if * + netmap_mem_if_new(const char *, struct netmap_adapter *); +void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); +int netmap_mem_rings_create(struct netmap_adapter *); +void netmap_mem_rings_delete(struct netmap_adapter *); +void netmap_mem_deref(struct netmap_mem_d *); +int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); +ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); +struct netmap_mem_d* netmap_mem_private_new(const char *name, + u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, + int* error); +void netmap_mem_private_delete(struct netmap_mem_d *); + +#define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) + +uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); + + +#endif diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c new file mode 100644 index 0000000..a776a24 --- /dev/null +++ b/sys/dev/netmap/netmap_offloadings.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/sockio.h> +#include <sys/socketvar.h> /* struct socket */ +#include <sys/socket.h> /* sockaddrs */ +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/endian.h> + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> + + + +/* This routine is called by bdg_mismatch_datapath() when it finishes + * accumulating bytes for a segment, in order to fix some fields in the + * segment headers (which still contain the same content as the header + * of the original GSO packet). 'buf' points to the beginning (e.g. + * the ethernet header) of the segment, and 'len' is its length. + */ +static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, + u_int segmented_bytes, u_int last_segment, + u_int tcp, u_int iphlen) +{ + struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + uint16_t *check = NULL; + uint8_t *check_data = NULL; + + if (iphlen == 20) { + /* Set the IPv4 "Total Length" field. */ + iph->tot_len = htobe16(len-14); + ND("ip total length %u", be16toh(ip->tot_len)); + + /* Set the IPv4 "Identification" field. */ + iph->id = htobe16(be16toh(iph->id) + idx); + ND("ip identification %u", be16toh(iph->id)); + + /* Compute and insert the IPv4 header checksum. */ + iph->check = 0; + iph->check = nm_csum_ipv4(iph); + ND("IP csum %x", be16toh(iph->check)); + } else {/* if (iphlen == 40) */ + /* Set the IPv6 "Payload Len" field. */ + ip6h->payload_len = htobe16(len-14-iphlen); + } + + if (tcp) { + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + + /* Set the TCP sequence number. */ + tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); + ND("tcp seq %u", be32toh(tcph->seq)); + + /* Zero the PSH and FIN TCP flags if this is not the last + segment. */ + if (!last_segment) + tcph->flags &= ~(0x8 | 0x1); + ND("last_segment %u", last_segment); + + check = &tcph->check; + check_data = (uint8_t *)tcph; + } else { /* UDP */ + struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + + /* Set the UDP 'Length' field. */ + udph->len = htobe16(len-14-iphlen); + + check = &udph->check; + check_data = (uint8_t *)udph; + } + + /* Compute and insert TCP/UDP checksum. */ + *check = 0; + if (iphlen == 20) + nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + else + nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + + ND("TCP/UDP csum %x", be16toh(*check)); +} + + +/* The VALE mismatch datapath implementation. */ +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany) +{ + struct netmap_slot *slot = NULL; + struct nm_vnet_hdr *vh = NULL; + /* Number of source slots to process. */ + u_int frags = ft_p->ft_frags; + struct nm_bdg_fwd *ft_end = ft_p + frags; + + /* Source and destination pointers. 
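gso_fix_segment() adjusts, for every segment it finalizes, the IPv4 Total Length and Identification, the TCP sequence number (or the UDP length) and the PSH/FIN flags, and then recomputes the checksums. The fragment below redoes only the arithmetic part on plain integers; struct seg_params and seg_hdr_fixup are names invented for the illustration and no checksums are computed here.

#include <stdint.h>
#include <stdio.h>

struct seg_params {
        uint16_t ip_tot_len;    /* IPv4 Total Length for this segment */
        uint16_t ip_id;         /* IPv4 Identification for this segment */
        uint32_t tcp_seq;       /* TCP sequence number for this segment */
        int      clear_psh_fin; /* nonzero if PSH/FIN must be cleared */
};

static void
seg_hdr_fixup(struct seg_params *sp, uint16_t base_id, uint32_t base_seq,
        uint32_t seg_len, uint32_t seg_idx, uint32_t segmented_bytes,
        int last_segment, uint32_t iphlen)
{
        /* seg_len is the frame length starting at the Ethernet header */
        sp->ip_tot_len = (uint16_t)(seg_len - 14);      /* strip Ethernet */
        sp->ip_id = (uint16_t)(base_id + seg_idx);      /* bump per segment */
        sp->tcp_seq = base_seq + segmented_bytes;       /* payload sent so far */
        sp->clear_psh_fin = !last_segment;              /* only the last keeps them */
        (void)iphlen;   /* an IPv6 header would instead set payload_len = seg_len - 14 - iphlen */
}

int
main(void)
{
        struct seg_params sp;

        /* second segment (idx 1) of a stream: 1514-byte frame, 1460 payload bytes already carried by segment 0 */
        seg_hdr_fixup(&sp, 0x1234, 1000000, 1514, 1, 1460, 0, 20);
        printf("tot_len %u id 0x%x seq %u clear_psh_fin %d\n",
            sp.ip_tot_len, sp.ip_id, sp.tcp_seq, sp.clear_psh_fin);
        return 0;
}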
*/ + uint8_t *dst, *src; + size_t src_len, dst_len; + + u_int j_start = *j; + u_int dst_slots = 0; + + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->virt_hdr_len && !dst_na->virt_hdr_len) { + vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + } + + /* Init source and dest pointers. */ + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + dst_len = src_len; + + /* We are processing the first input slot and there is a mismatch + * between source and destination virt_hdr_len (SHL and DHL). + * When the a client is using virtio-net headers, the header length + * can be: + * - 10: the header corresponds to the struct nm_vnet_hdr + * - 12: the first 10 bytes correspond to the struct + * virtio_net_hdr, and the last 2 bytes store the + * "mergeable buffers" info, which is an optional + * hint that can be zeroed for compability + * + * The destination header is therefore built according to the + * following table: + * + * SHL | DHL | destination header + * ----------------------------- + * 0 | 10 | zero + * 0 | 12 | zero + * 10 | 0 | doesn't exist + * 10 | 12 | first 10 bytes are copied from source header, last 2 are zero + * 12 | 0 | doesn't exist + * 12 | 10 | copied from the first 10 bytes of source header + */ + bzero(dst, dst_na->virt_hdr_len); + if (na->virt_hdr_len && dst_na->virt_hdr_len) + memcpy(dst, src, sizeof(struct nm_vnet_hdr)); + /* Skip the virtio-net headers. */ + src += na->virt_hdr_len; + src_len -= na->virt_hdr_len; + dst += dst_na->virt_hdr_len; + dst_len = dst_na->virt_hdr_len + src_len; + + /* Here it could be dst_len == 0 (which implies src_len == 0), + * so we avoid passing a zero length fragment. + */ + if (dst_len == 0) { + ft_p++; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + dst_len = src_len; + } + + if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + u_int gso_bytes = 0; + /* Length of the GSO packet header. */ + u_int gso_hdr_len = 0; + /* Pointer to the GSO packet header. Assume it is in a single fragment. */ + uint8_t *gso_hdr = NULL; + /* Index of the current segment. */ + u_int gso_idx = 0; + /* Payload data bytes segmented so far (e.g. TCP data bytes). */ + u_int segmented_bytes = 0; + /* Length of the IP header (20 if IPv4, 40 if IPv6). */ + u_int iphlen = 0; + /* Is this a TCP or an UDP GSO packet? */ + u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) + == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; + + /* Segment the GSO packet contained into the input slots (frags). */ + while (ft_p != ft_end) { + size_t copy; + + /* Grab the GSO header if we don't have it. */ + if (!gso_hdr) { + uint16_t ethertype; + + gso_hdr = src; + + /* Look at the 'Ethertype' field to see if this packet + * is IPv4 or IPv6. + */ + ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); + if (ethertype == 0x0800) + iphlen = 20; + else /* if (ethertype == 0x86DD) */ + iphlen = 40; + ND(3, "type=%04x", ethertype); + + /* Compute gso_hdr_len. For TCP we need to read the + * content of the 'Data Offset' field. + */ + if (tcp) { + struct nm_tcphdr *tcph = + (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + + gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); + } else + gso_hdr_len = 14 + iphlen + 8; /* UDP */ + + ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, + dst_na->mfs); + + /* Advance source pointers. 
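The SHL/DHL table above condenses into a couple of operations: zero the destination header, then copy the common 10 bytes only when both ends use a header. translate_vnet_hdr() below is a standalone rendition of that rule with an invented name, not the code used in the datapath, which writes straight into the destination slot.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define VNET_HDR_CORE   10      /* size of struct nm_vnet_hdr */

/* shl/dhl are the source/destination virtio-net header lengths (0, 10 or 12) */
static void
translate_vnet_hdr(const uint8_t *src_hdr, unsigned shl,
        uint8_t *dst_hdr, unsigned dhl)
{
        memset(dst_hdr, 0, dhl);        /* covers the SHL == 0 rows and the 2 optional bytes */
        if (shl && dhl)                 /* both ends use a header: copy the common 10 bytes */
                memcpy(dst_hdr, src_hdr, VNET_HDR_CORE);
}

int
main(void)
{
        uint8_t src[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
        uint8_t dst[12];

        translate_vnet_hdr(src, 12, dst, 10);   /* SHL 12, DHL 10: first 10 bytes copied */
        printf("dst[0]=%u dst[9]=%u\n", dst[0], dst[9]);
        translate_vnet_hdr(src, 0, dst, 12);    /* SHL 0, DHL 12: all zero */
        printf("dst[0]=%u dst[11]=%u\n", dst[0], dst[11]);
        return 0;
}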
*/ + src += gso_hdr_len; + src_len -= gso_hdr_len; + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + continue; + } + } + + /* Fill in the header of the current segment. */ + if (gso_bytes == 0) { + memcpy(dst, gso_hdr, gso_hdr_len); + gso_bytes = gso_hdr_len; + } + + /* Fill in data and update source and dest pointers. */ + copy = src_len; + if (gso_bytes + copy > dst_na->mfs) + copy = dst_na->mfs - gso_bytes; + memcpy(dst + gso_bytes, src, copy); + gso_bytes += copy; + src += copy; + src_len -= copy; + + /* A segment is complete or we have processed all the + the GSO payload bytes. */ + if (gso_bytes >= dst_na->mfs || + (src_len == 0 && ft_p + 1 == ft_end)) { + /* After raw segmentation, we must fix some header + * fields and compute checksums, in a protocol dependent + * way. */ + gso_fix_segment(dst, gso_bytes, gso_idx, + segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end, + tcp, iphlen); + + ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); + slot->len = gso_bytes; + slot->flags = 0; + segmented_bytes += gso_bytes - gso_hdr_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + gso_bytes = 0; + gso_idx++; + } + + /* Next input slot. */ + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + } + } + ND(3, "%d bytes segmented", segmented_bytes); + + } else { + /* Address of a checksum field into a destination slot. */ + uint16_t *check = NULL; + /* Accumulator for an unfolded checksum. */ + rawsum_t csum = 0; + + /* Process a non-GSO packet. */ + + /* Init 'check' if necessary. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (unlikely(vh->csum_offset + vh->csum_start > src_len)) + D("invalid checksum request"); + else + check = (uint16_t *)(dst + vh->csum_start + + vh->csum_offset); + } + + while (ft_p != ft_end) { + /* Init/update the packet checksum if needed. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (!dst_slots) + csum = nm_csum_raw(src + vh->csum_start, + src_len - vh->csum_start, 0); + else + csum = nm_csum_raw(src, src_len, csum); + } + + /* Round to a multiple of 64 */ + src_len = (src_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, src_len)) { + /* Invalid user pointer, pretend len is 0. */ + dst_len = 0; + } + } else { + memcpy(dst, src, (int)src_len); + } + slot->len = dst_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + /* Next source slot. */ + ft_p++; + src = ft_p->ft_buf; + dst_len = src_len = ft_p->ft_len; + + } + + /* Finalize (fold) the checksum if needed. */ + if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + *check = nm_csum_fold(csum); + } + ND(3, "using %u dst_slots", dst_slots); + + /* A second pass on the desitations slots to set the slot flags, + * using the right number of destination slots. + */ + while (j_start != *j) { + slot = &ring->slot[j_start]; + slot->flags = (dst_slots << 8)| NS_MOREFRAG; + j_start = nm_next(j_start, lim); + } + /* Clear NS_MOREFRAG flag on last entry. */ + slot->flags = (dst_slots << 8); + } + + /* Update howmany. 
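The GSO branch above cuts the payload into frames that carry at most (mfs - gso_hdr_len) payload bytes each, tracking segmented_bytes and the segment index as it goes. The loop below reproduces just that bookkeeping with made-up sizes; the real code additionally walks the input fragments, copies the data and fixes each segment's headers.

#include <stdio.h>

int
main(void)
{
        unsigned int mfs = 1514;                        /* destination frame size */
        unsigned int gso_hdr_len = 14 + 20 + 20;        /* Ethernet + IPv4 + TCP, no options */
        unsigned int payload_len = 10000;               /* payload carried by the GSO packet */
        unsigned int per_seg = mfs - gso_hdr_len;
        unsigned int segmented_bytes = 0, gso_idx = 0;

        while (segmented_bytes < payload_len) {
                unsigned int chunk = payload_len - segmented_bytes;

                if (chunk > per_seg)
                        chunk = per_seg;
                /* each iteration fills one destination slot of gso_hdr_len + chunk bytes */
                printf("segment %u: %u header + %u payload bytes%s\n",
                    gso_idx, gso_hdr_len, chunk,
                    segmented_bytes + chunk == payload_len ? " (last)" : "");
                segmented_bytes += chunk;
                gso_idx++;
        }
        printf("%u segments, %u payload bytes\n", gso_idx, segmented_bytes);
        return 0;
}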
*/ + if (unlikely(dst_slots > *howmany)) { + dst_slots = *howmany; + D("Slot allocation error: Should never happen"); + } + *howmany -= dst_slots; +} diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c new file mode 100644 index 0000000..f8f29fa --- /dev/null +++ b/sys/dev/netmap/netmap_pipe.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <sys/socket.h> /* sockaddrs */ +#include <net/if.h> +#include <net/if_var.h> +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/refcount.h> + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#ifdef WITH_PIPES + +#define NM_PIPE_MAXSLOTS 4096 + +int netmap_default_pipes = 0; /* default number of pipes for each nic */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); + +/* allocate the pipe array in the parent adapter */ +int +netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr) +{ + size_t len; + int mode = nmr->nr_flags & NR_REG_MASK; + u_int npipes; + + if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) { + /* this is for our parent, not for us */ + return 0; + } + + /* TODO: we can resize the array if the new + * request can accomodate the already existing pipes + */ + if (na->na_pipes) { + nmr->nr_arg1 = na->na_max_pipes; + return 0; + } + + npipes = nmr->nr_arg1; + if (npipes == 0) + npipes = netmap_default_pipes; + nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL); + + if (npipes == 0) { + /* really zero, nothing to 
alloc */ + goto out; + } + + len = sizeof(struct netmap_pipe_adapter *) * npipes; + na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->na_pipes == NULL) + return ENOMEM; + + na->na_max_pipes = npipes; + na->na_next_pipe = 0; + +out: + nmr->nr_arg1 = npipes; + + return 0; +} + +/* deallocate the parent array in the parent adapter */ +void +netmap_pipe_dealloc(struct netmap_adapter *na) +{ + if (na->na_pipes) { + ND("freeing pipes for %s", NM_IFPNAME(na->ifp)); + free(na->na_pipes, M_DEVBUF); + na->na_pipes = NULL; + na->na_max_pipes = 0; + na->na_next_pipe = 0; + } +} + +/* find a pipe endpoint with the given id among the parent's pipes */ +static struct netmap_pipe_adapter * +netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id) +{ + int i; + struct netmap_pipe_adapter *na; + + for (i = 0; i < parent->na_next_pipe; i++) { + na = parent->na_pipes[i]; + if (na->id == pipe_id) { + return na; + } + } + return NULL; +} + +/* add a new pipe endpoint to the parent array */ +static int +netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + if (parent->na_next_pipe >= parent->na_max_pipes) { + D("%s: no space left for pipes", NM_IFPNAME(parent->ifp)); + return ENOMEM; + } + + parent->na_pipes[parent->na_next_pipe] = na; + na->parent_slot = parent->na_next_pipe; + parent->na_next_pipe++; + return 0; +} + +/* remove the given pipe endpoint from the parent array */ +static void +netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + u_int n; + n = --parent->na_next_pipe; + if (n != na->parent_slot) { + parent->na_pipes[na->parent_slot] = + parent->na_pipes[n]; + } + parent->na_pipes[n] = NULL; +} + +static int +netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *txkring = na->tx_rings + ring_nr, + *rxkring = txkring->pipe; + u_int limit; /* slots to transfer */ + u_int j, k, lim_tx = txkring->nkr_num_slots - 1, + lim_rx = rxkring->nkr_num_slots - 1; + int m, busy; + + ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name); + ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail); + + j = rxkring->nr_hwtail; /* RX */ + k = txkring->nr_hwcur; /* TX */ + m = txkring->rhead - txkring->nr_hwcur; /* new slots */ + if (m < 0) + m += txkring->nkr_num_slots; + limit = m; + m = rxkring->nkr_num_slots - 1; /* max avail space on destination */ + busy = j - rxkring->nr_hwcur; /* busy slots */ + if (busy < 0) + busy += txkring->nkr_num_slots; + m -= busy; /* subtract busy slots */ + ND(2, "m %d limit %d", m, limit); + if (m < limit) + limit = m; + + if (limit == 0) { + /* either the rxring is full, or nothing to send */ + nm_txsync_finalize(txkring); /* actually useless */ + return 0; + } + + while (limit-- > 0) { + struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *ts = &txkring->ring->slot[k]; + struct netmap_slot tmp; + + /* swap the slots */ + tmp = *rs; + *rs = *ts; + *ts = tmp; + + /* no need to report the buffer change */ + + j = nm_next(j, lim_rx); + k = nm_next(k, lim_tx); + } + + wmb(); /* make sure the slots are updated before publishing them */ + rxkring->nr_hwtail = j; + txkring->nr_hwcur = k; + txkring->nr_hwtail = nm_prev(k, lim_tx); + + nm_txsync_finalize(txkring); + ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail, j); + + wmb(); /* make 
sure rxkring->nr_hwtail is updated before notifying */ + rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0); + + return 0; +} + +static int +netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *rxkring = na->rx_rings + ring_nr, + *txkring = rxkring->pipe; + uint32_t oldhwcur = rxkring->nr_hwcur; + + ND("%s %x <- %s", rxkring->name, flags, txkring->name); + rxkring->nr_hwcur = rxkring->rhead; /* recover user-relased slots */ + ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail, + rxkring->rcur, rxkring->rhead, rxkring->rtail); + rmb(); /* paired with the first wmb() in txsync */ + nm_rxsync_finalize(rxkring); + + if (oldhwcur != rxkring->nr_hwcur) { + /* we have released some slots, notify the other end */ + wmb(); /* make sure nr_hwcur is updated before notifying */ + txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0); + } + return 0; +} + +/* Pipe endpoints are created and destroyed together, so that endopoints do not + * have to check for the existence of their peer at each ?xsync. + * + * To play well with the existing netmap infrastructure (refcounts etc.), we + * adopt the following strategy: + * + * 1) The first endpoint that is created also creates the other endpoint and + * grabs a reference to it. + * + * state A) user1 --> endpoint1 --> endpoint2 + * + * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives + * its reference to the user: + * + * state B) user1 --> endpoint1 endpoint2 <--- user2 + * + * 3) Assume that, starting from state B endpoint2 is closed. In the unregister + * callback endpoint2 notes that endpoint1 is still active and adds a reference + * from endpoint1 to itself. When user2 then releases her own reference, + * endpoint2 is not destroyed and we are back to state A. A symmetrical state + * would be reached if endpoint1 were released instead. + * + * 4) If, starting from state A, endpoint1 is closed, the destructor notes that + * it owns a reference to endpoint2 and releases it. + * + * Something similar goes on for the creation and destruction of the krings. + */ + + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. We have to create both sets + * of krings. + * + * 2) state is + * + * usr1 --> e1 --> e2 + * + * and we are e2. e1 is certainly registered and our + * krings already exist, but they may be hidden. + */ +static int +netmap_pipe_krings_create(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int error = 0; + if (pna->peer_ref) { + int i; + + /* case 1) above */ + D("%p: case 1, create everything", na); + error = netmap_krings_create(na, 0); + if (error) + goto err; + + /* we also create all the rings, since we need to + * update the save_ring pointers. 
+ * netmap_mem_rings_create (called by our caller) + * will not create the rings again + */ + + error = netmap_mem_rings_create(na); + if (error) + goto del_krings1; + + /* update our hidden ring pointers */ + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].save_ring = na->tx_rings[i].ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].save_ring = na->rx_rings[i].ring; + + /* now, create krings and rings of the other end */ + error = netmap_krings_create(ona, 0); + if (error) + goto del_rings1; + + error = netmap_mem_rings_create(ona); + if (error) + goto del_krings2; + + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].save_ring = ona->tx_rings[i].ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].save_ring = ona->rx_rings[i].ring; + + /* cross link the krings */ + for (i = 0; i < na->num_tx_rings; i++) { + na->tx_rings[i].pipe = pna->peer->up.rx_rings + i; + na->rx_rings[i].pipe = pna->peer->up.tx_rings + i; + pna->peer->up.tx_rings[i].pipe = na->rx_rings + i; + pna->peer->up.rx_rings[i].pipe = na->tx_rings + i; + } + } else { + int i; + /* case 2) above */ + /* recover the hidden rings */ + ND("%p: case 2, hidden rings", na); + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].ring = na->tx_rings[i].save_ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].ring = na->rx_rings[i].save_ring; + } + return 0; + +del_krings2: + netmap_krings_delete(ona); +del_rings1: + netmap_mem_rings_delete(na); +del_krings1: + netmap_krings_delete(na); +err: + return error; +} + +/* netmap_pipe_reg. + * + * There are two cases on registration (onoff==1) + * + * 1.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do. + * + * 1.b) state is + * + * usr1 --> e1 --> e2 <-- usr2 + * + * and we are e2. Drop the ref e1 is holding. + * + * There are two additional cases on unregister (onoff==0) + * + * 2.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do, e2 will + * be cleaned up by the destructor of e1. + * + * 2.b) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. Add a ref from the + * other end and hide our rings. + */ +static int +netmap_pipe_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct ifnet *ifp = na->ifp; + ND("%p: onoff %d", na, onoff); + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + } else { + ifp->if_capenable &= ~IFCAP_NETMAP; + } + if (pna->peer_ref) { + ND("%p: case 1.a or 2.a, nothing to do", na); + return 0; + } + if (onoff) { + ND("%p: case 1.b, drop peer", na); + pna->peer->peer_ref = 0; + netmap_adapter_put(na); + } else { + int i; + ND("%p: case 2.b, grab peer", na); + netmap_adapter_get(na); + pna->peer->peer_ref = 1; + /* hide our rings from netmap_mem_rings_delete */ + for (i = 0; i < na->num_tx_rings + 1; i++) { + na->tx_rings[i].ring = NULL; + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + na->rx_rings[i].ring = NULL; + } + } + return 0; +} + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1 (e2 is not registered, so krings_delete cannot be + * called on it); + * + * 2) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. + * + * In the former case we have to also delete the krings of e2; + * in the latter case we do nothing (note that our krings + * have already been hidden in the unregister callback). 
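The register/unregister transitions between states A and B described earlier can be modelled with a refcount and a peer_ref flag per endpoint. The toy code below (struct toy_ep, ep_register, ep_unregister and friends are all invented) follows the same rules: a registering user takes over the reference its peer was holding, an unregistering endpoint is kept alive by its peer while the peer still has users, and the destructor releases whatever peer reference it still owns.

#include <stdio.h>

struct toy_ep {
        const char *name;
        int refcount;           /* references held on this endpoint */
        int peer_ref;           /* nonzero if we hold a reference to our peer */
        struct toy_ep *peer;
};

static void ep_get(struct toy_ep *e) { e->refcount++; }

static void
ep_put(struct toy_ep *e)
{
        if (--e->refcount > 0)
                return;
        printf("%s destroyed\n", e->name);
        if (e->peer_ref) {              /* destructor releases the peer it was keeping alive */
                e->peer_ref = 0;
                ep_put(e->peer);
        }
}

/* a user registers endpoint e: if its peer was keeping it alive, that ref is dropped */
static void
ep_register(struct toy_ep *e)
{
        ep_get(e);                      /* reference now held by the user */
        if (e->peer->peer_ref) {
                e->peer->peer_ref = 0;
                ep_put(e);              /* drop the ref the peer was holding */
        }
}

/* a user unregisters endpoint e: if the peer is still in use, it keeps us alive */
static void
ep_unregister(struct toy_ep *e)
{
        if (!e->peer_ref && e->peer->refcount > 0) {
                e->peer->peer_ref = 1;  /* the peer now holds a ref to us */
                ep_get(e);
        }
        ep_put(e);                      /* drop the user's reference */
}

int
main(void)
{
        struct toy_ep e1 = { "e1", 0, 0, 0 }, e2 = { "e2", 0, 0, 0 };

        e1.peer = &e2; e2.peer = &e1;
        /* creation: e1 is created first and grabs a reference to e2 (state A) */
        ep_get(&e1); e1.peer_ref = 1; ep_get(&e2);
        ep_register(&e2);       /* state B: e1's reference is replaced by user2's */
        ep_unregister(&e2);     /* back to state A, e1 keeps e2 alive */
        ep_unregister(&e1);     /* last user gone: both endpoints go away */
        return 0;
}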
+ */ +static void +netmap_pipe_krings_delete(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona; /* na of the other end */ + int i; + + if (!pna->peer_ref) { + ND("%p: case 2, kept alive by peer", na); + return; + } + /* case 1) above */ + ND("%p: case 1, deleting everyhing", na); + netmap_krings_delete(na); /* also zeroes tx_rings etc. */ + /* restore the ring to be deleted on the peer */ + ona = &pna->peer->up; + if (ona->tx_rings == NULL) { + /* already deleted, we must be on an + * cleanup-after-error path */ + return; + } + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].ring = ona->tx_rings[i].save_ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].ring = ona->rx_rings[i].save_ring; + netmap_mem_rings_delete(ona); + netmap_krings_delete(ona); +} + + +static void +netmap_pipe_dtor(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + ND("%p", na); + if (pna->peer_ref) { + ND("%p: clean up peer", na); + pna->peer_ref = 0; + netmap_adapter_put(&pna->peer->up); + } + if (pna->role == NR_REG_PIPE_MASTER) + netmap_pipe_remove(pna->parent, pna); + netmap_adapter_put(pna->parent); + free(na->ifp, M_DEVBUF); + na->ifp = NULL; + pna->parent = NULL; +} + +int +netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + struct nmreq pnmr; + struct netmap_adapter *pna; /* parent adapter */ + struct netmap_pipe_adapter *mna, *sna, *req; + struct ifnet *ifp, *ifp2; + u_int pipe_id; + int role = nmr->nr_flags & NR_REG_MASK; + int error; + + ND("flags %x", nmr->nr_flags); + + if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) { + ND("not a pipe"); + return 0; + } + role = nmr->nr_flags & NR_REG_MASK; + + /* first, try to find the parent adapter */ + bzero(&pnmr, sizeof(pnmr)); + memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ); + /* pass to parent the requested number of pipes */ + pnmr.nr_arg1 = nmr->nr_arg1; + error = netmap_get_na(&pnmr, &pna, create); + if (error) { + ND("parent lookup failed: %d", error); + return error; + } + ND("found parent: %s", NM_IFPNAME(pna->ifp)); + + if (NETMAP_OWNED_BY_KERN(pna)) { + ND("parent busy"); + error = EBUSY; + goto put_out; + } + + /* next, lookup the pipe id in the parent list */ + req = NULL; + pipe_id = nmr->nr_ringid & NETMAP_RING_MASK; + mna = netmap_pipe_find(pna, pipe_id); + if (mna) { + if (mna->role == role) { + ND("found %d directly at %d", pipe_id, mna->parent_slot); + req = mna; + } else { + ND("found %d indirectly at %d", pipe_id, mna->parent_slot); + req = mna->peer; + } + /* the pipe we have found already holds a ref to the parent, + * so we need to drop the one we got from netmap_get_na() + */ + netmap_adapter_put(pna); + goto found; + } + ND("pipe %d not found, create %d", pipe_id, create); + if (!create) { + error = ENODEV; + goto put_out; + } + /* we create both master and slave. + * The endpoint we were asked for holds a reference to + * the other one. 
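A few lines further down, the requested ring sizes are passed through nm_bound_var() with the parent's values as defaults and NM_PIPE_MAXSLOTS as the ceiling. toy_bound() below only sketches that kind of clamping; it is a stand-in, not the real helper, whose exact corner cases may differ.

#include <stdio.h>

static unsigned int
toy_bound(unsigned int req, unsigned int dflt, unsigned int lo, unsigned int hi)
{
        if (req < lo)
                return dflt;    /* too small (e.g. zero): fall back to the default */
        if (req > hi)
                return hi;      /* too large: clamp to the ceiling */
        return req;
}

int
main(void)
{
        /* say the parent NIC has 1024 descriptors per ring and the pipe cap is 4096 */
        printf("req 0    -> %u\n", toy_bound(0, 1024, 1, 4096));
        printf("req 256  -> %u\n", toy_bound(256, 1024, 1, 4096));
        printf("req 9999 -> %u\n", toy_bound(9999, 1024, 1, 4096));
        return 0;
}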
+ */ + ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) { + error = ENOMEM; + goto put_out; + } + strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp)); + + mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (mna == NULL) { + error = ENOMEM; + goto free_ifp; + } + mna->up.ifp = ifp; + + mna->id = pipe_id; + mna->role = NR_REG_PIPE_MASTER; + mna->parent = pna; + + mna->up.nm_txsync = netmap_pipe_txsync; + mna->up.nm_rxsync = netmap_pipe_rxsync; + mna->up.nm_register = netmap_pipe_reg; + mna->up.nm_dtor = netmap_pipe_dtor; + mna->up.nm_krings_create = netmap_pipe_krings_create; + mna->up.nm_krings_delete = netmap_pipe_krings_delete; + mna->up.nm_mem = pna->nm_mem; + mna->up.na_lut = pna->na_lut; + mna->up.na_lut_objtotal = pna->na_lut_objtotal; + + mna->up.num_tx_rings = 1; + mna->up.num_rx_rings = 1; + mna->up.num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc, + 1, NM_PIPE_MAXSLOTS, NULL); + mna->up.num_rx_desc = nmr->nr_rx_slots; + nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc, + 1, NM_PIPE_MAXSLOTS, NULL); + error = netmap_attach_common(&mna->up); + if (error) + goto free_ifp; + /* register the master with the parent */ + error = netmap_pipe_add(pna, mna); + if (error) + goto free_mna; + + /* create the slave */ + ifp2 = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) { + error = ENOMEM; + goto free_mna; + } + strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp)); + + sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (sna == NULL) { + error = ENOMEM; + goto free_ifp2; + } + /* most fields are the same, copy from master and then fix */ + *sna = *mna; + sna->up.ifp = ifp2; + sna->role = NR_REG_PIPE_SLAVE; + error = netmap_attach_common(&sna->up); + if (error) + goto free_sna; + + /* join the two endpoints */ + mna->peer = sna; + sna->peer = mna; + + /* we already have a reference to the parent, but we + * need another one for the other endpoint we created + */ + netmap_adapter_get(pna); + + if (role == NR_REG_PIPE_MASTER) { + req = mna; + mna->peer_ref = 1; + netmap_adapter_get(&sna->up); + } else { + req = sna; + sna->peer_ref = 1; + netmap_adapter_get(&mna->up); + } + ND("created master %p and slave %p", mna, sna); +found: + + ND("pipe %d %s at %p", pipe_id, + (req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req); + *na = &req->up; + netmap_adapter_get(*na); + + /* write the configuration back */ + nmr->nr_tx_rings = req->up.num_tx_rings; + nmr->nr_rx_rings = req->up.num_rx_rings; + nmr->nr_tx_slots = req->up.num_tx_desc; + nmr->nr_rx_slots = req->up.num_rx_desc; + + /* keep the reference to the parent. + * It will be released by the req destructor + */ + + return 0; + +free_sna: + free(sna, M_DEVBUF); +free_ifp2: + free(ifp2, M_DEVBUF); +free_mna: + free(mna, M_DEVBUF); +free_ifp: + free(ifp, M_DEVBUF); +put_out: + netmap_adapter_put(pna); + return error; +} + + +#endif /* WITH_PIPES */ diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c new file mode 100644 index 0000000..34e3912 --- /dev/null +++ b/sys/dev/netmap/netmap_vale.c @@ -0,0 +1,2103 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +/* + * This module implements the VALE switch for netmap + +--- VALE SWITCH --- + +NMG_LOCK() serializes all modifications to switches and ports. +A switch cannot be deleted until all ports are gone. + +For each switch, an SX lock (RWlock on linux) protects +deletion of ports. When configuring or deleting a new port, the +lock is acquired in exclusive mode (after holding NMG_LOCK). +When forwarding, the lock is acquired in shared mode (without NMG_LOCK). +The lock is held throughout the entire forwarding cycle, +during which the thread may incur in a page fault. +Hence it is important that sleepable shared locks are used. + +On the rx ring, the per-port lock is grabbed initially to reserve +a number of slot in the ring, then the lock is released, +packets are copied from source to destination, and then +the lock is acquired again and the receive ring is updated. +(A similar thing is done on the tx ring for NIC and host stack +ports attached to the switch) + + */ + +/* + * OS-specific code that is used only within this file. 
+ * Other OS-specific code that must be accessed by drivers + * is present in netmap_kern.h + */ + +#if defined(__FreeBSD__) +#include <sys/cdefs.h> /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/conf.h> /* cdevsw struct, UID, GID */ +#include <sys/sockio.h> +#include <sys/socketvar.h> /* struct socket */ +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/rwlock.h> +#include <sys/socket.h> /* sockaddrs */ +#include <sys/selinfo.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/bpf.h> /* BIOCIMMEDIATE */ +#include <machine/bus.h> /* bus_dmamap_* */ +#include <sys/endian.h> +#include <sys/refcount.h> + + +#define BDG_RWLOCK_T struct rwlock // struct rwlock + +#define BDG_RWINIT(b) \ + rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) +#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) +#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) +#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) +#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) +#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) +#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include <net/netmap.h> +#include <dev/netmap/netmap_kern.h> +#include <dev/netmap/netmap_mem2.h> + +#ifdef WITH_VALE + +/* + * system parameters (most of them in netmap_kern.h) + * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_MAXPORTS number of ports + * NM_BRIDGES max number of switches in the system. + * XXX should become a sysctl or tunable + * + * Switch ports are named valeX:Y where X is the switch name and Y + * is the port. If Y matches a physical interface name, the port is + * connected to a physical device. + * + * Unlike physical interfaces, switch ports use their own memory region + * for rings and buffers. + * The virtual interfaces use per-queue lock instead of core lock. + * In the tx loop, we aggregate traffic in batches to make all operations + * faster. The batch size is bridge_batch. + */ +#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ +#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ +#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ +#define NM_BDG_HASH 1024 /* forwarding table entries */ +#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_MULTISEG 64 /* max size of a chain of bufs */ +/* actual size of the tables */ +#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) +/* NM_FT_NULL terminates a list of slots in the ft */ +#define NM_FT_NULL NM_BDG_BATCH_MAX +#define NM_BRIDGES 8 /* number of bridges */ + + +/* + * bridge_batch is set via sysctl to the max batch size to be + * used in the bridge. The actual value may be larger as the + * last packet in the block may overflow the size. 
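This overflow is also why the work area is sized NM_BDG_BATCH_MAX = NM_BDG_BATCH + NM_MULTISEG: a batch is only flushed on whole-packet boundaries, so a multi-fragment packet that starts just under the threshold can spill past it by at most NM_MULTISEG - 1 entries. The simulation below uses local stand-ins for the constants to show that the spill always stays within the table.

#include <stdio.h>

#define BATCH           1024    /* mirrors NM_BDG_BATCH */
#define MAXFRAGS        64      /* mirrors NM_MULTISEG */
#define TABLE_MAX       (BATCH + MAXFRAGS)

int
main(void)
{
        unsigned int ft_i = 0, flushes = 0, pkt;

        for (pkt = 0; pkt < 2000; pkt++) {
                unsigned int frags = (pkt % 50 == 0) ? MAXFRAGS : 1;

                ft_i += frags;          /* all fragments of a packet stay in one batch */
                if (ft_i > TABLE_MAX)
                        printf("overflow! (cannot happen with this sizing)\n");
                if (ft_i >= BATCH) {    /* flush only once a whole packet is in */
                        flushes++;
                        ft_i = 0;
                }
        }
        printf("%u flushes, %u entries left over\n", flushes, ft_i);
        return 0;
}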
+ */ +int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); + + +static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp); +static int bdg_netmap_reg(struct netmap_adapter *na, int onoff); +static int netmap_bwrap_attach(struct ifnet *, struct ifnet *); +static int netmap_bwrap_register(struct netmap_adapter *, int onoff); +int kern_netmap_regif(struct nmreq *nmr); + +/* + * For each output interface, nm_bdg_q is used to construct a list. + * bq_len is the number of output buffers (we can have coalescing + * during the copy). + */ +struct nm_bdg_q { + uint16_t bq_head; + uint16_t bq_tail; + uint32_t bq_len; /* number of buffers */ +}; + +/* XXX revise this */ +struct nm_hash_ent { + uint64_t mac; /* the top 2 bytes are the epoch */ + uint64_t ports; +}; + +/* + * nm_bridge is a descriptor for a VALE switch. + * Interfaces for a bridge are all in bdg_ports[]. + * The array has fixed size, an empty entry does not terminate + * the search, but lookups only occur on attach/detach so we + * don't mind if they are slow. + * + * The bridge is non blocking on the transmit ports: excess + * packets are dropped if there is no room on the output port. + * + * bdg_lock protects accesses to the bdg_ports array. + * This is a rw lock (or equivalent). + */ +struct nm_bridge { + /* XXX what is the proper alignment/layout ? */ + BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ + int bdg_namelen; + uint32_t bdg_active_ports; /* 0 means free */ + char bdg_basename[IFNAMSIZ]; + + /* Indexes of active ports (up to active_ports) + * and all other remaining ports. + */ + uint8_t bdg_port_index[NM_BDG_MAXPORTS]; + + struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS]; + + + /* + * The function to decide the destination port. + * It returns either of an index of the destination port, + * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to + * forward this packet. ring_nr is the source ring index, and the + * function may overwrite this value to forward this packet to a + * different ring index. + * This function must be set by netmap_bdgctl(). + */ + bdg_lookup_fn_t nm_bdg_lookup; + + /* the forwarding table, MAC+ports. + * XXX should be changed to an argument to be passed to + * the lookup function, and allocated on attach + */ + struct nm_hash_ent ht[NM_BDG_HASH]; +}; + + +/* + * XXX in principle nm_bridges could be created dynamically + * Right now we have a static array and deletions are protected + * by an exclusive lock. + */ +struct nm_bridge nm_bridges[NM_BRIDGES]; + + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +pkt_copy(void *_src, void *_dst, int l) +{ + uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + +/* + * locate a bridge among the existing ones. + * MUST BE CALLED WITH NMG_LOCK() + * + * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. + * We assume that this is called with a name of at least NM_NAME chars. 
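As described above, everything up to the first ':' after the NM_NAME prefix is the bridge name. bridge_name_len() below is a simplified standalone take on that convention, not the exact scanning and IFNAMSIZ truncation done in nm_find_bridge() right below.

#include <stdio.h>
#include <string.h>

#define NM_NAME         "vale"  /* prefix for VALE port names */

/* return the length of the bridge-name prefix, or 0 if this is not a VALE name */
static size_t
bridge_name_len(const char *name)
{
        size_t base = strlen(NM_NAME);
        const char *colon;

        if (strncmp(name, NM_NAME, base) != 0)
                return 0;                       /* not a VALE port at all */
        colon = strchr(name + base, ':');
        return colon ? (size_t)(colon - name) : strlen(name);
}

int
main(void)
{
        const char *names[] = { "vale1:eth0", "vale-a:3", "em0" };
        size_t i;

        for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
                size_t l = bridge_name_len(names[i]);

                printf("%-12s -> bridge '%.*s'\n", names[i], (int)l, names[i]);
        }
        return 0;
}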
+ */ +static struct nm_bridge * +nm_find_bridge(const char *name, int create) +{ + int i, l, namelen; + struct nm_bridge *b = NULL; + + NMG_LOCK_ASSERT(); + + namelen = strlen(NM_NAME); /* base length */ + l = name ? strlen(name) : 0; /* actual length */ + if (l < namelen) { + D("invalid bridge name %s", name ? name : NULL); + return NULL; + } + for (i = namelen + 1; i < l; i++) { + if (name[i] == ':') { + namelen = i; + break; + } + } + if (namelen >= IFNAMSIZ) + namelen = IFNAMSIZ; + ND("--- prefix is '%.*s' ---", namelen, name); + + /* lookup the name, remember empty slot if there is one */ + for (i = 0; i < NM_BRIDGES; i++) { + struct nm_bridge *x = nm_bridges + i; + + if (x->bdg_active_ports == 0) { + if (create && b == NULL) + b = x; /* record empty slot */ + } else if (x->bdg_namelen != namelen) { + continue; + } else if (strncmp(name, x->bdg_basename, namelen) == 0) { + ND("found '%.*s' at %d", namelen, name, i); + b = x; + break; + } + } + if (i == NM_BRIDGES && b) { /* name not found, can create entry */ + /* initialize the bridge */ + strncpy(b->bdg_basename, name, namelen); + ND("create new bridge %s with ports %d", b->bdg_basename, + b->bdg_active_ports); + b->bdg_namelen = namelen; + b->bdg_active_ports = 0; + for (i = 0; i < NM_BDG_MAXPORTS; i++) + b->bdg_port_index[i] = i; + /* set the default function */ + b->nm_bdg_lookup = netmap_bdg_learning; + /* reset the MAC address table */ + bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); + } + return b; +} + + +/* + * Free the forwarding tables for rings attached to switch ports. + */ +static void +nm_free_bdgfwd(struct netmap_adapter *na) +{ + int nrings, i; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + nrings = na->num_tx_rings; + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + if (kring[i].nkr_ft) { + free(kring[i].nkr_ft, M_DEVBUF); + kring[i].nkr_ft = NULL; /* protect from freeing twice */ + } + } +} + + +/* + * Allocate the forwarding tables for the rings attached to the bridge ports. + */ +static int +nm_alloc_bdgfwd(struct netmap_adapter *na) +{ + int nrings, l, i, num_dstq; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + /* all port:rings + broadcast */ + num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; + l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; + l += sizeof(struct nm_bdg_q) * num_dstq; + l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; + + nrings = netmap_real_tx_rings(na); + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + struct nm_bdg_fwd *ft; + struct nm_bdg_q *dstq; + int j; + + ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ft) { + nm_free_bdgfwd(na); + return ENOMEM; + } + dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + for (j = 0; j < num_dstq; j++) { + dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; + dstq[j].bq_len = 0; + } + kring[i].nkr_ft = ft; + } + return 0; +} + + +static void +netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) +{ + int s_hw = hw, s_sw = sw; + int i, lim =b->bdg_active_ports; + uint8_t tmp[NM_BDG_MAXPORTS]; + + /* + New algorithm: + make a copy of bdg_port_index; + lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port + in the array of bdg_port_index, replacing them with + entries from the bottom of the array; + decrement bdg_active_ports; + acquire BDG_WLOCK() and copy back the array. + */ + + if (netmap_verbose) + D("detach %d and %d (lim %d)", hw, sw, lim); + /* make a copy of the list of active ports, update it, + * and then copy back within BDG_WLOCK(). 
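The removal algorithm sketched in the comment boils down to a swap-with-last-active on the port index array, so the active ports always occupy the front of it. detach_port() below shows that step alone, on plain arrays, without the copy-back under BDG_WLOCK(); the names are invented.

#include <stdio.h>

/* remove 'port' from the first 'active' entries of idx[], return the new active count */
static unsigned int
detach_port(unsigned char *idx, unsigned int active, unsigned char port)
{
        unsigned int i;

        for (i = 0; i < active; i++) {
                if (idx[i] == port) {
                        active--;               /* index of the last active slot */
                        idx[i] = idx[active];   /* move it into the hole */
                        idx[active] = port;     /* park the detached port after the active ones */
                        break;
                }
        }
        return active;
}

int
main(void)
{
        unsigned char idx[] = { 0, 1, 2, 3, 4, 5 };
        unsigned int active = 6, i;

        active = detach_port(idx, active, 2);
        for (i = 0; i < sizeof(idx); i++)
                printf("%u%s", idx[i], i + 1 == active ? " | " : " ");
        printf("(%u active)\n", active);
        return 0;
}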
+ */ + memcpy(tmp, b->bdg_port_index, sizeof(tmp)); + for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { + if (hw >= 0 && tmp[i] == hw) { + ND("detach hw %d at %d", hw, i); + lim--; /* point to last active port */ + tmp[i] = tmp[lim]; /* swap with i */ + tmp[lim] = hw; /* now this is inactive */ + hw = -1; + } else if (sw >= 0 && tmp[i] == sw) { + ND("detach sw %d at %d", sw, i); + lim--; + tmp[i] = tmp[lim]; + tmp[lim] = sw; + sw = -1; + } else { + i++; + } + } + if (hw >= 0 || sw >= 0) { + D("XXX delete failed hw %d sw %d, should panic...", hw, sw); + } + + BDG_WLOCK(b); + b->bdg_ports[s_hw] = NULL; + if (s_sw >= 0) { + b->bdg_ports[s_sw] = NULL; + } + memcpy(b->bdg_port_index, tmp, sizeof(tmp)); + b->bdg_active_ports = lim; + BDG_WUNLOCK(b); + + ND("now %d active ports", lim); + if (lim == 0) { + ND("marking bridge %s as free", b->bdg_basename); + b->nm_bdg_lookup = NULL; + } +} + + +static void +netmap_adapter_vp_dtor(struct netmap_adapter *na) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + struct nm_bridge *b = vpna->na_bdg; + struct ifnet *ifp = na->ifp; + + ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount); + + if (b) { + netmap_bdg_detach_common(b, vpna->bdg_port, -1); + } + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; +} + + +/* Try to get a reference to a netmap adapter attached to a VALE switch. + * If the adapter is found (or is created), this function returns 0, a + * non NULL pointer is returned into *na, and the caller holds a + * reference to the adapter. + * If an adapter is not found, then no reference is grabbed and the + * function returns an error code, or 0 if there is just a VALE prefix + * mismatch. Therefore the caller holds a reference when + * (*na != NULL && return == 0). + */ +int +netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + const char *name = nmr->nr_name; + struct ifnet *ifp; + int error = 0; + struct netmap_adapter *ret; + struct netmap_vp_adapter *vpna; + struct nm_bridge *b; + int i, j, cand = -1, cand2 = -1; + int needed; + + *na = NULL; /* default return value */ + + /* first try to see if this is a bridge port. */ + NMG_LOCK_ASSERT(); + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { + return 0; /* no error, but no VALE prefix */ + } + + b = nm_find_bridge(name, create); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (create ? ENOMEM : ENXIO); + } + + /* Now we are sure that name starts with the bridge's name, + * lookup the port in the bridge. We need to scan the entire + * list. It is not important to hold a WLOCK on the bridge + * during the search because NMG_LOCK already guarantees + * that there are no other possible writers. + */ + + /* lookup in the local list of ports */ + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + // KASSERT(na != NULL); + ifp = vpna->up.ifp; + /* XXX make sure the name only contains one : */ + if (!strcmp(NM_IFPNAME(ifp), name)) { + netmap_adapter_get(&vpna->up); + ND("found existing if %s refs %d", name, + vpna->na_bdg_refcount); + *na = (struct netmap_adapter *)vpna; + return 0; + } + } + /* not found, should we create it? 
*/ + if (!create) + return ENXIO; + /* yes we should, see if we have space to attach entries */ + needed = 2; /* in some cases we only need 1 */ + if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { + D("bridge full %d, cannot create new port", b->bdg_active_ports); + return ENOMEM; + } + /* record the next two ports available, but do not allocate yet */ + cand = b->bdg_port_index[b->bdg_active_ports]; + cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; + ND("+++ bridge %s port %s used %d avail %d %d", + b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + + /* + * try see if there is a matching NIC with this name + * (after the bridge's name) + */ + ifp = ifunit_ref(name + b->bdg_namelen + 1); + if (!ifp) { /* this is a virtual port */ + if (nmr->nr_cmd) { + /* nr_cmd must be 0 for a virtual port */ + return EINVAL; + } + + /* create a struct ifnet for the new port. + * need M_NOWAIT as we are under nma_lock + */ + ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) + return ENOMEM; + + strcpy(ifp->if_xname, name); + /* bdg_netmap_attach creates a struct netmap_adapter */ + error = bdg_netmap_attach(nmr, ifp); + if (error) { + D("error %d", error); + free(ifp, M_DEVBUF); + return error; + } + ret = NA(ifp); + cand2 = -1; /* only need one port */ + } else { /* this is a NIC */ + struct ifnet *fake_ifp; + + error = netmap_get_hw_na(ifp, &ret); + if (error || ret == NULL) + goto out; + + /* make sure the NIC is not already in use */ + if (NETMAP_OWNED_BY_ANY(ret)) { + D("NIC %s busy, cannot attach to bridge", + NM_IFPNAME(ifp)); + error = EBUSY; + goto out; + } + /* create a fake interface */ + fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!fake_ifp) { + error = ENOMEM; + goto out; + } + strcpy(fake_ifp->if_xname, name); + error = netmap_bwrap_attach(fake_ifp, ifp); + if (error) { + free(fake_ifp, M_DEVBUF); + goto out; + } + ret = NA(fake_ifp); + if (nmr->nr_arg1 != NETMAP_BDG_HOST) + cand2 = -1; /* only need one port */ + if_rele(ifp); + } + vpna = (struct netmap_vp_adapter *)ret; + + BDG_WLOCK(b); + vpna->bdg_port = cand; + ND("NIC %p to bridge port %d", vpna, cand); + /* bind the port to the bridge (virtual ports are not active) */ + b->bdg_ports[cand] = vpna; + vpna->na_bdg = b; + b->bdg_active_ports++; + if (cand2 >= 0) { + struct netmap_vp_adapter *hostna = vpna + 1; + /* also bind the host stack to the bridge */ + b->bdg_ports[cand2] = hostna; + hostna->bdg_port = cand2; + hostna->na_bdg = b; + b->bdg_active_ports++; + ND("host %p to bridge port %d", hostna, cand2); + } + ND("if %s refs %d", name, vpna->up.na_refcount); + BDG_WUNLOCK(b); + *na = ret; + netmap_adapter_get(ret); + return 0; + +out: + if_rele(ifp); + + return error; +} + + +/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ +static int +nm_bdg_attach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + struct netmap_if *nifp; + struct netmap_priv_d *npriv; + struct netmap_bwrap_adapter *bna; + int error; + + npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + if (npriv == NULL) + return ENOMEM; + + NMG_LOCK(); + + error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */); + if (error) /* no device, or another bridge or user owns the device */ + goto unlock_exit; + + if (na == NULL) { /* VALE prefix missing */ + error = EINVAL; + goto unlock_exit; + } + + if (na->active_fds > 0) { /* already registered */ + error = EBUSY; + goto unref_exit; + } + + nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error); + if (!nifp) { + goto 
unref_exit; + } + + bna = (struct netmap_bwrap_adapter*)na; + bna->na_kpriv = npriv; + NMG_UNLOCK(); + ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp)); + return 0; + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + return error; +} + + +static int +nm_bdg_detach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + int error; + struct netmap_bwrap_adapter *bna; + int last_instance; + + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); + if (error) { /* no device, or another bridge or user owns the device */ + goto unlock_exit; + } + + if (na == NULL) { /* VALE prefix missing */ + error = EINVAL; + goto unlock_exit; + } + + bna = (struct netmap_bwrap_adapter *)na; + + if (na->active_fds == 0) { /* not registered */ + error = EINVAL; + goto unref_exit; + } + + last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */ + if (!last_instance) { + D("--- error, trying to detach an entry with active mmaps"); + error = EINVAL; + } else { + struct netmap_priv_d *npriv = bna->na_kpriv; + + bna->na_kpriv = NULL; + D("deleting priv"); + + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + } + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + return error; + +} + + +/* exported to kernel callers, e.g. OVS ? + * Entry point. + * Called without NMG_LOCK. + */ +int +netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) +{ + struct nm_bridge *b; + struct netmap_adapter *na; + struct netmap_vp_adapter *vpna; + struct ifnet *iter; + char *name = nmr->nr_name; + int cmd = nmr->nr_cmd, namelen = strlen(name); + int error = 0, i, j; + + switch (cmd) { + case NETMAP_BDG_ATTACH: + error = nm_bdg_attach(nmr); + break; + + case NETMAP_BDG_DETACH: + error = nm_bdg_detach(nmr); + break; + + case NETMAP_BDG_LIST: + /* this is used to enumerate bridges and ports */ + if (namelen) { /* look up indexes of bridge and port */ + if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = ENOENT; + NMG_UNLOCK(); + break; + } + + error = ENOENT; + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + if (vpna == NULL) { + D("---AAAAAAAAARGH-------"); + continue; + } + iter = vpna->up.ifp; + /* the former and the latter identify a + * virtual port and a NIC, respectively + */ + if (!strcmp(iter->if_xname, name)) { + /* bridge index */ + nmr->nr_arg1 = b - nm_bridges; + nmr->nr_arg2 = i; /* port index */ + error = 0; + break; + } + } + NMG_UNLOCK(); + } else { + /* return the first non-empty entry starting from + * bridge nr_arg1 and port nr_arg2. + * + * Users can detect the end of the same bridge by + * seeing the new and old value of nr_arg1, and can + * detect the end of all the bridge by error != 0 + */ + i = nmr->nr_arg1; + j = nmr->nr_arg2; + + NMG_LOCK(); + for (error = ENOENT; i < NM_BRIDGES; i++) { + b = nm_bridges + i; + if (j >= b->bdg_active_ports) { + j = 0; /* following bridges scan from 0 */ + continue; + } + nmr->nr_arg1 = i; + nmr->nr_arg2 = j; + j = b->bdg_port_index[j]; + vpna = b->bdg_ports[j]; + iter = vpna->up.ifp; + strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); + error = 0; + break; + } + NMG_UNLOCK(); + } + break; + + case NETMAP_BDG_LOOKUP_REG: + /* register a lookup function to the given bridge. + * nmr->nr_name may be just bridge's name (including ':' + * if it is not just NM_NAME). 
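The NETMAP_BDG_LIST branch above resumes its scan from the (nr_arg1, nr_arg2) pair supplied by the caller and reports the next occupied port, resetting the port index to 0 whenever it moves on to the next bridge. next_port() below is the same iteration pattern over a fake occupancy table; names and counts are made up.

#include <stdio.h>

#define NBRIDGES        4

static const int active_ports[NBRIDGES] = { 2, 0, 3, 0 };       /* fake occupancy */

/* advance (*bi, *pi) to the next existing port; return 0 on success, -1 at the end */
static int
next_port(int *bi, int *pi)
{
        int i = *bi, j = *pi;

        for (; i < NBRIDGES; i++) {
                if (j >= active_ports[i]) {
                        j = 0;          /* following bridges are scanned from port 0 */
                        continue;
                }
                *bi = i;
                *pi = j;
                return 0;
        }
        return -1;
}

int
main(void)
{
        int bi = 0, pi = 0;

        while (next_port(&bi, &pi) == 0) {
                printf("bridge %d port %d\n", bi, pi);
                pi++;                   /* ask for the entry after the one just returned */
        }
        return 0;
}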
+ */ + if (!func) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = EINVAL; + } else { + b->nm_bdg_lookup = func; + } + NMG_UNLOCK(); + break; + + case NETMAP_BDG_VNET_HDR: + /* Valid lengths for the virtio-net header are 0 (no header), + 10 and 12. */ + if (nmr->nr_arg1 != 0 && + nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && + nmr->nr_arg1 != 12) { + error = EINVAL; + break; + } + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0); + if (na && !error) { + vpna = (struct netmap_vp_adapter *)na; + vpna->virt_hdr_len = nmr->nr_arg1; + if (vpna->virt_hdr_len) + vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem); + D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); + netmap_adapter_put(na); + } + NMG_UNLOCK(); + break; + + default: + D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); + error = EINVAL; + break; + } + return error; +} + +static int +netmap_vp_krings_create(struct netmap_adapter *na) +{ + u_int tailroom; + int error, i; + uint32_t *leases; + u_int nrx = netmap_real_rx_rings(na); + + /* + * Leases are attached to RX rings on vale ports + */ + tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; + + error = netmap_krings_create(na, tailroom); + if (error) + return error; + + leases = na->tailroom; + + for (i = 0; i < nrx; i++) { /* Receive rings */ + na->rx_rings[i].nkr_leases = leases; + leases += na->num_rx_desc; + } + + error = nm_alloc_bdgfwd(na); + if (error) { + netmap_krings_delete(na); + return error; + } + + return 0; +} + + +static void +netmap_vp_krings_delete(struct netmap_adapter *na) +{ + nm_free_bdgfwd(na); + netmap_krings_delete(na); +} + + +static int +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, + struct netmap_vp_adapter *na, u_int ring_nr); + + +/* + * Grab packets from a kring, move them into the ft structure + * associated to the tx (input) port. Max one instance per port, + * filtered on input (ioctl, poll or XXX). + * Returns the next position in the ring. + */ +static int +nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, + struct netmap_kring *kring, u_int end) +{ + struct netmap_ring *ring = kring->ring; + struct nm_bdg_fwd *ft; + u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; + u_int ft_i = 0; /* start from 0 */ + u_int frags = 1; /* how many frags ? */ + struct nm_bridge *b = na->na_bdg; + + /* To protect against modifications to the bridge we acquire a + * shared lock, waiting if we can sleep (if the source port is + * attached to a user process) or with a trylock otherwise (NICs). + */ + ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); + if (na->up.na_flags & NAF_BDG_MAYSLEEP) + BDG_RLOCK(b); + else if (!BDG_RTRYLOCK(b)) + return 0; + ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); + ft = kring->nkr_ft; + + for (; likely(j != end); j = nm_next(j, lim)) { + struct netmap_slot *slot = &ring->slot[j]; + char *buf; + + ft[ft_i].ft_len = slot->len; + ft[ft_i].ft_flags = slot->flags; + + ND("flags is 0x%x", slot->flags); + /* this slot goes into a list so initialize the link field */ + ft[ft_i].ft_next = NM_FT_NULL; + buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 
+ (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot); + __builtin_prefetch(buf); + ++ft_i; + if (slot->flags & NS_MOREFRAG) { + frags++; + continue; + } + if (unlikely(netmap_verbose && frags > 1)) + RD(5, "%d frags at %d", frags, ft_i - frags); + ft[ft_i - frags].ft_frags = frags; + frags = 1; + if (unlikely((int)ft_i >= bridge_batch)) + ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); + } + if (frags > 1) { + D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); + // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG + ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; + ft[ft_i - frags].ft_frags = frags - 1; + } + if (ft_i) + ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); + BDG_RUNLOCK(b); + return j; +} + + +/* ----- FreeBSD if_bridge hash function ------- */ + +/* + * The following hash function is adapted from "Hash Functions" by Bob Jenkins + * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * + * http://www.burtleburtle.net/bob/hash/spooky.html + */ +#define mix(a, b, c) \ +do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ +} while (/*CONSTCOND*/0) + + +static __inline uint32_t +nm_bridge_rthash(const uint8_t *addr) +{ + uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key + + b += addr[5] << 8; + b += addr[4]; + a += addr[3] << 24; + a += addr[2] << 16; + a += addr[1] << 8; + a += addr[0]; + + mix(a, b, c); +#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) + return (c & BRIDGE_RTHASH_MASK); +} + +#undef mix + + +static int +bdg_netmap_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_vp_adapter *vpna = + (struct netmap_vp_adapter*)na; + struct ifnet *ifp = na->ifp; + + /* the interface is already attached to the bridge, + * so we only need to toggle IFCAP_NETMAP. + */ + BDG_WLOCK(vpna->na_bdg); + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + } else { + ifp->if_capenable &= ~IFCAP_NETMAP; + } + BDG_WUNLOCK(vpna->na_bdg); + return 0; +} + + +/* + * Lookup function for a learning bridge. + * Update the hash table with the source address, + * and then returns the destination port index, and the + * ring in *dst_ring (at the moment, always use ring 0) + */ +u_int +netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, + struct netmap_vp_adapter *na) +{ + struct nm_hash_ent *ht = na->na_bdg->ht; + uint32_t sh, dh; + u_int dst, mysrc = na->bdg_port; + uint64_t smac, dmac; + + if (buf_len < 14) { + D("invalid buf length %d", buf_len); + return NM_BDG_NOPORT; + } + dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; + smac = le64toh(*(uint64_t *)(buf + 4)); + smac >>= 16; + + /* + * The hash is somewhat expensive, there might be some + * worthwhile optimizations here. + */ + if ((buf[6] & 1) == 0) { /* valid src */ + uint8_t *s = buf+6; + sh = nm_bridge_rthash(s); // XXX hash of source + /* update source port forwarding entry */ + ht[sh].mac = smac; /* XXX expire ? */ + ht[sh].ports = mysrc; + if (netmap_verbose) + D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", + s[0], s[1], s[2], s[3], s[4], s[5], mysrc); + } + dst = NM_BDG_BROADCAST; + if ((buf[0] & 1) == 0) { /* unicast */ + dh = nm_bridge_rthash(buf); // XXX hash of dst + if (ht[dh].mac == dmac) { /* found dst */ + dst = ht[dh].ports; + } + /* XXX otherwise return NM_BDG_UNKNOWN ? 
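netmap_bdg_learning() above learns the source MAC into the hash table and then looks up the destination, falling back to broadcast for multicast or unknown addresses. The toy table below follows the same learn-then-lookup flow with a deliberately simpler hash than the Jenkins-derived one; names, sizes and the BROADCAST value are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HASH_SIZE       256
#define BROADCAST       0xff    /* stand-in for NM_BDG_BROADCAST */

struct toy_ent {
        uint8_t mac[6];
        uint8_t port;
        int valid;
};

static struct toy_ent table[HASH_SIZE];

static unsigned int
toy_hash(const uint8_t *mac)
{
        unsigned int h = 0, i;

        for (i = 0; i < 6; i++)
                h = h * 31 + mac[i];
        return h & (HASH_SIZE - 1);
}

/* learn the source address, then return the output port for the destination */
static uint8_t
toy_forward(const uint8_t *dst, const uint8_t *src, uint8_t src_port)
{
        struct toy_ent *e;

        if ((src[0] & 1) == 0) {                /* ignore multicast sources */
                e = &table[toy_hash(src)];
                memcpy(e->mac, src, 6);
                e->port = src_port;
                e->valid = 1;
        }
        if (dst[0] & 1)                         /* multicast/broadcast destination */
                return BROADCAST;
        e = &table[toy_hash(dst)];
        if (e->valid && memcmp(e->mac, dst, 6) == 0)
                return e->port;
        return BROADCAST;                       /* unknown unicast: flood */
}

int
main(void)
{
        uint8_t a[6] = { 2, 0, 0, 0, 0, 1 }, b[6] = { 2, 0, 0, 0, 0, 2 };

        printf("b->a before learning a: port %u\n", toy_forward(a, b, 5));
        printf("a->b after learning b:  port %u\n", toy_forward(b, a, 3));
        printf("b->a now:               port %u\n", toy_forward(a, b, 5));
        return 0;
}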
*/ + } + *dst_ring = 0; + return dst; +} + + +/* + * Available space in the ring. Only used in VALE code + * and only with is_rx = 1 + */ +static inline uint32_t +nm_kr_space(struct netmap_kring *k, int is_rx) +{ + int space; + + if (is_rx) { + int busy = k->nkr_hwlease - k->nr_hwcur; + if (busy < 0) + busy += k->nkr_num_slots; + space = k->nkr_num_slots - 1 - busy; + } else { + /* XXX never used in this branch */ + space = k->nr_hwtail - k->nkr_hwlease; + if (space < 0) + space += k->nkr_num_slots; + } +#if 0 + // sanity check + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_tail >= k->nkr_num_slots || + busy < 0 || + busy >= k->nkr_num_slots) { + D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return space; +} + + + + +/* make a lease on the kring for N positions. return the + * lease index + * XXX only used in VALE code and with is_rx = 1 + */ +static inline uint32_t +nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +{ + uint32_t lim = k->nkr_num_slots - 1; + uint32_t lease_idx = k->nkr_lease_idx; + + k->nkr_leases[lease_idx] = NR_NOSLOT; + k->nkr_lease_idx = nm_next(lease_idx, lim); + + if (n > nm_kr_space(k, is_rx)) { + D("invalid request for %d slots", n); + panic("x"); + } + /* XXX verify that there are n slots */ + k->nkr_hwlease += n; + if (k->nkr_hwlease > lim) + k->nkr_hwlease -= lim + 1; + + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwtail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", + k->na->ifp->if_xname, + k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } + return lease_idx; +} + +/* + * This flush routine supports only unicast and broadcast but a large + * number of ports, and lets us replace the learn and dispatch functions. + */ +int +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, + u_int ring_nr) +{ + struct nm_bdg_q *dst_ents, *brddst; + uint16_t num_dsts = 0, *dsts; + struct nm_bridge *b = na->na_bdg; + u_int i, j, me = na->bdg_port; + + /* + * The work area (pointed by ft) is followed by an array of + * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS + * queues per port plus one for the broadcast traffic. + * Then we have an array of destination indexes. + */ + dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); + + /* first pass: find a destination for each packet in the batch */ + for (i = 0; likely(i < n); i += ft[i].ft_frags) { + uint8_t dst_ring = ring_nr; /* default, same ring as origin */ + uint16_t dst_port, d_i; + struct nm_bdg_q *d; + uint8_t *buf = ft[i].ft_buf; + u_int len = ft[i].ft_len; + + ND("slot %d frags %d", i, ft[i].ft_frags); + /* Drop the packet if the virtio-net header is not into the first + fragment nor at the very beginning of the second. 
*/ + if (unlikely(na->virt_hdr_len > len)) + continue; + if (len == na->virt_hdr_len) { + buf = ft[i+1].ft_buf; + len = ft[i+1].ft_len; + } else { + buf += na->virt_hdr_len; + len -= na->virt_hdr_len; + } + dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na); + if (netmap_verbose > 255) + RD(5, "slot %d port %d -> %d", i, me, dst_port); + if (dst_port == NM_BDG_NOPORT) + continue; /* this packet is identified to be dropped */ + else if (unlikely(dst_port > NM_BDG_MAXPORTS)) + continue; + else if (dst_port == NM_BDG_BROADCAST) + dst_ring = 0; /* broadcasts always go to ring 0 */ + else if (unlikely(dst_port == me || + !b->bdg_ports[dst_port])) + continue; + + /* get a position in the scratch pad */ + d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; + d = dst_ents + d_i; + + /* append the first fragment to the list */ + if (d->bq_head == NM_FT_NULL) { /* new destination */ + d->bq_head = d->bq_tail = i; + /* remember this position to be scanned later */ + if (dst_port != NM_BDG_BROADCAST) + dsts[num_dsts++] = d_i; + } else { + ft[d->bq_tail].ft_next = i; + d->bq_tail = i; + } + d->bq_len += ft[i].ft_frags; + } + + /* + * Broadcast traffic goes to ring 0 on all destinations. + * So we need to add these rings to the list of ports to scan. + * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is + * expensive. We should keep a compact list of active destinations + * so we could shorten this loop. + */ + brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; + if (brddst->bq_head != NM_FT_NULL) { + for (j = 0; likely(j < b->bdg_active_ports); j++) { + uint16_t d_i; + i = b->bdg_port_index[j]; + if (unlikely(i == me)) + continue; + d_i = i * NM_BDG_MAXRINGS; + if (dst_ents[d_i].bq_head == NM_FT_NULL) + dsts[num_dsts++] = d_i; + } + } + + ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); + /* second pass: scan destinations (XXX will be modular somehow) */ + for (i = 0; i < num_dsts; i++) { + struct ifnet *dst_ifp; + struct netmap_vp_adapter *dst_na; + struct netmap_kring *kring; + struct netmap_ring *ring; + u_int dst_nr, lim, j, d_i, next, brd_next; + u_int needed, howmany; + int retry = netmap_txsync_retry; + struct nm_bdg_q *d; + uint32_t my_start = 0, lease_idx = 0; + int nrings; + int virt_hdr_mismatch = 0; + + d_i = dsts[i]; + ND("second pass %d port %d", i, d_i); + d = dst_ents + d_i; + // XXX fix the division + dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; + /* protect from the lookup function returning an inactive + * destination port + */ + if (unlikely(dst_na == NULL)) + goto cleanup; + if (dst_na->up.na_flags & NAF_SW_ONLY) + goto cleanup; + dst_ifp = dst_na->up.ifp; + /* + * The interface may be in !netmap mode in two cases: + * - when na is attached but not activated yet; + * - when na is being deactivated but is still attached. + */ + if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { + ND("not in netmap mode!"); + goto cleanup; + } + + /* there is at least one either unicast or broadcast packet */ + brd_next = brddst->bq_head; + next = d->bq_head; + /* we need to reserve this many slots. If fewer are + * available, some packets will be dropped. + * Packets may have multiple fragments, so we may not use + * there is a chance that we may not use all of the slots + * we have claimed, so we will need to handle the leftover + * ones when we regain the lock. + */ + needed = d->bq_len + brddst->bq_len; + + if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { + /* There is a virtio-net header/offloadings mismatch between + * source and destination. 
The slower mismatch datapath will + * be used to cope with all the mismatches. + */ + virt_hdr_mismatch = 1; + if (dst_na->mfs < na->mfs) { + /* We may need to do segmentation offloadings, and so + * we may need a number of destination slots greater + * than the number of input slots ('needed'). + * We look for the smallest integer 'x' which satisfies: + * needed * na->mfs + x * H <= x * na->mfs + * where 'H' is the length of the longest header that may + * be replicated in the segmentation process (e.g. for + * TCPv4 we must account for ethernet header, IP header + * and TCPv4 header). + */ + needed = (needed * na->mfs) / + (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; + ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); + } + } + + ND(5, "pass 2 dst %d is %x %s", + i, d_i, is_vp ? "virtual" : "nic/host"); + dst_nr = d_i & (NM_BDG_MAXRINGS-1); + nrings = dst_na->up.num_rx_rings; + if (dst_nr >= nrings) + dst_nr = dst_nr % nrings; + kring = &dst_na->up.rx_rings[dst_nr]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + +retry: + + if (dst_na->retry && retry) { + /* try to get some free slot from the previous run */ + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + } + /* reserve the buffers in the queue and an entry + * to report completion, and drop lock. + * XXX this might become a helper function. + */ + mtx_lock(&kring->q_lock); + if (kring->nkr_stopped) { + mtx_unlock(&kring->q_lock); + goto cleanup; + } + my_start = j = kring->nkr_hwlease; + howmany = nm_kr_space(kring, 1); + if (needed < howmany) + howmany = needed; + lease_idx = nm_kr_lease(kring, howmany, 1); + mtx_unlock(&kring->q_lock); + + /* only retry if we need more than available slots */ + if (retry && needed <= howmany) + retry = 0; + + /* copy to the destination queue */ + while (howmany > 0) { + struct netmap_slot *slot; + struct nm_bdg_fwd *ft_p, *ft_end; + u_int cnt; + + /* find the queue from which we pick next packet. + * NM_FT_NULL is always higher than valid indexes + * so we never dereference it if the other list + * has packets (and if both are empty we never + * get here). + */ + if (next < brd_next) { + ft_p = ft + next; + next = ft_p->ft_next; + } else { /* insert broadcast */ + ft_p = ft + brd_next; + brd_next = ft_p->ft_next; + } + cnt = ft_p->ft_frags; // cnt > 0 + if (unlikely(cnt > howmany)) + break; /* no more space */ + if (netmap_verbose && cnt > 1) + RD(5, "rx %d frags to %d", cnt, j); + ft_end = ft_p + cnt; + if (unlikely(virt_hdr_mismatch)) { + bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); + } else { + howmany -= cnt; + do { + char *dst, *src = ft_p->ft_buf; + size_t copy_len = ft_p->ft_len, dst_len = copy_len; + + slot = &ring->slot[j]; + dst = BDG_NMB(&dst_na->up, slot); + + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); + /* round to a multiple of 64 */ + copy_len = (copy_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, copy_len)) { + // invalid user pointer, pretend len is 0 + dst_len = 0; + } + } else { + //memcpy(dst, src, copy_len); + pkt_copy(src, dst, (int)copy_len); + } + slot->len = dst_len; + slot->flags = (cnt << 8)| NS_MOREFRAG; + j = nm_next(j, lim); + needed--; + ft_p++; + } while (ft_p != ft_end); + slot->flags = (cnt << 8); /* clear flag on last entry */ + } + /* are we done ? 
*/ + if (next == NM_FT_NULL && brd_next == NM_FT_NULL) + break; + } + { + /* current position */ + uint32_t *p = kring->nkr_leases; /* shorthand */ + uint32_t update_pos; + int still_locked = 1; + + mtx_lock(&kring->q_lock); + if (unlikely(howmany > 0)) { + /* not used all bufs. If i am the last one + * i can recover the slots, otherwise must + * fill them with 0 to mark empty packets. + */ + ND("leftover %d bufs", howmany); + if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { + /* yes i am the last one */ + ND("roll back nkr_hwlease to %d", j); + kring->nkr_hwlease = j; + } else { + while (howmany-- > 0) { + ring->slot[j].len = 0; + ring->slot[j].flags = 0; + j = nm_next(j, lim); + } + } + } + p[lease_idx] = j; /* report I am done */ + + update_pos = kring->nr_hwtail; + + if (my_start == update_pos) { + /* all slots before my_start have been reported, + * so scan subsequent leases to see if other ranges + * have been completed, and to a selwakeup or txsync. + */ + while (lease_idx != kring->nkr_lease_idx && + p[lease_idx] != NR_NOSLOT) { + j = p[lease_idx]; + p[lease_idx] = NR_NOSLOT; + lease_idx = nm_next(lease_idx, lim); + } + /* j is the new 'write' position. j != my_start + * means there are new buffers to report + */ + if (likely(j != my_start)) { + kring->nr_hwtail = j; + still_locked = 0; + mtx_unlock(&kring->q_lock); + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + if (dst_na->retry && retry--) + goto retry; + } + } + if (still_locked) + mtx_unlock(&kring->q_lock); + } +cleanup: + d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ + d->bq_len = 0; + } + brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ + brddst->bq_len = 0; + return 0; +} + + +static int +netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; + u_int done; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = kring->rcur; + + if (bridge_batch <= 0) { /* testing only */ + done = cur; // used all + goto done; + } + if (bridge_batch > NM_BDG_BATCH) + bridge_batch = NM_BDG_BATCH; + + done = nm_bdg_preflush(na, ring_nr, kring, cur); +done: + if (done != cur) + D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail); + /* + * packets between 'done' and 'cur' are left unsent. + */ + kring->nr_hwcur = done; + kring->nr_hwtail = nm_prev(done, lim); + nm_txsync_finalize(kring); + if (netmap_verbose) + D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); + return 0; +} + + +/* + * main dispatch routine for the bridge. + * We already know that only one thread is running this. + * we must run nm_bdg_preflush without lock. + */ +static int +bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + return netmap_vp_txsync(vpna, ring_nr, flags); +} + +static int +netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i, lim = kring->nkr_num_slots - 1; + u_int head = nm_rxsync_prologue(kring); + int n; + + if (head > lim) { + D("ouch dangerous reset!!!"); + n = netmap_ring_reinit(kring); + goto done; + } + + /* First part, import newly received packets. */ + /* actually nothing to do here, they are already in the kring */ + + /* Second part, skip past packets that userspace has released. 
*/ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* consistency check, but nothing really important here */ + for (n = 0; likely(nm_i != head); n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + void *addr = BDG_NMB(na, slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + D("bad buffer index %d, ignore ?", + slot->buf_idx); + } + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + + /* tell userspace that there are new packets */ + nm_rxsync_finalize(kring); + n = 0; +done: + return n; +} + +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ +static int +bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + int n; + + mtx_lock(&kring->q_lock); + n = netmap_vp_rxsync(na, ring_nr, flags); + mtx_unlock(&kring->q_lock); + return n; +} + + +static int +bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) +{ + struct netmap_vp_adapter *vpna; + struct netmap_adapter *na; + int error; + u_int npipes = 0; + + vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (vpna == NULL) + return ENOMEM; + + na = &vpna->up; + + na->ifp = ifp; + + /* bound checking */ + na->num_tx_rings = nmr->nr_tx_rings; + nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_tx_rings = na->num_tx_rings; // write back + na->num_rx_rings = nmr->nr_rx_rings; + nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_rx_rings = na->num_rx_rings; // write back + nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + na->num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + /* validate number of pipes. We want at least 1, + * but probably can do with some more. + * So let's use 2 as default (when 0 is supplied) + */ + npipes = nmr->nr_arg1; + nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); + nmr->nr_arg1 = npipes; /* write back */ + /* validate extra bufs */ + nm_bound_var(&nmr->nr_arg3, 0, 0, + 128*NM_BDG_MAXSLOTS, NULL); + na->num_rx_desc = nmr->nr_rx_slots; + vpna->virt_hdr_len = 0; + vpna->mfs = 1514; + /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
+ vpna->mfs = netmap_buf_size; */ + if (netmap_verbose) + D("max frame size %u", vpna->mfs); + + na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; + na->nm_txsync = bdg_netmap_txsync; + na->nm_rxsync = bdg_netmap_rxsync; + na->nm_register = bdg_netmap_reg; + na->nm_dtor = netmap_adapter_vp_dtor; + na->nm_krings_create = netmap_vp_krings_create; + na->nm_krings_delete = netmap_vp_krings_delete; + na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc, + nmr->nr_arg3, npipes, &error); + if (na->nm_mem == NULL) + goto err; + /* other nmd fields are set in the common routine */ + error = netmap_attach_common(na); + if (error) + goto err; + return 0; + +err: + if (na->nm_mem != NULL) + netmap_mem_private_delete(na->nm_mem); + free(vpna, M_DEVBUF); + return error; +} + + +static void +netmap_bwrap_dtor(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; + struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + struct ifnet *ifp = na->ifp; + + ND("na %p", na); + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } + + hwna->na_private = NULL; + netmap_adapter_put(hwna); + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; + +} + + +/* + * Intr callback for NICs connected to a bridge. + * Simply ignore tx interrupts (maybe we could try to recover space ?) + * and pass received packets from nic to the bridge. + * + * XXX TODO check locking: this is called from the interrupt + * handler so we should make sure that the interface is not + * disconnected while passing down an interrupt. + * + * Note, no user process can access this NIC or the host stack. + * The only part of the ring that is significant are the slots, + * and head/cur/tail are set from the kring as needed + * (part as a receive ring, part as a transmit ring). + * + * callback that overwrites the hwna notify callback. + * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * The bridge wrapper then sends the packets through the bridge. + */ +static int +netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags) +{ + struct ifnet *ifp = na->ifp; + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_vp_adapter *hostna = &bna->host; + struct netmap_kring *kring, *bkring; + struct netmap_ring *ring; + int is_host_ring = ring_nr == na->num_rx_rings; + struct netmap_vp_adapter *vpna = &bna->up; + int error = 0; + + if (netmap_verbose) + D("%s %s%d 0x%x", NM_IFPNAME(ifp), + (tx == NR_TX ? "TX" : "RX"), ring_nr, flags); + + if (flags & NAF_DISABLE_NOTIFY) { + kring = tx == NR_TX ? na->tx_rings : na->rx_rings; + bkring = tx == NR_TX ? 
vpna->up.rx_rings : vpna->up.tx_rings; + if (kring[ring_nr].nkr_stopped) + netmap_disable_ring(&bkring[ring_nr]); + else + bkring[ring_nr].nkr_stopped = 0; + return 0; + } + + if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) + return 0; + + /* we only care about receive interrupts */ + if (tx == NR_TX) + return 0; + + kring = &na->rx_rings[ring_nr]; + ring = kring->ring; + + /* make sure the ring is not disabled */ + if (nm_kr_tryget(kring)) + return 0; + + if (is_host_ring && hostna->na_bdg == NULL) { + error = bna->save_notify(na, ring_nr, tx, flags); + goto put_out; + } + + /* Here we expect ring->head = ring->cur = ring->tail + * because everything has been released from the previous round. + * However the ring is shared and we might have info from + * the wrong side (the tx ring). Hence we overwrite with + * the info from the rx kring. + */ + if (netmap_verbose) + D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp), + ring->head, ring->cur, ring->tail, + kring->rhead, kring->rcur, kring->rtail); + + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + + if (is_host_ring) { + vpna = hostna; + ring_nr = 0; + } + /* simulate a user wakeup on the rx ring */ + /* fetch packets that have arrived. + * XXX maybe do this in a loop ? + */ + error = kring->nm_sync(kring, 0); + if (error) + goto put_out; + if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { + D("how strange, interrupt with no packets on %s", + NM_IFPNAME(ifp)); + goto put_out; + } + + /* new packets are ring->cur to ring->tail, and the bkring + * had hwcur == ring->cur. So advance ring->cur to ring->tail + * to push all packets out. + */ + ring->head = ring->cur = ring->tail; + + /* also set tail to what the bwrap expects */ + bkring = &vpna->up.tx_rings[ring_nr]; + ring->tail = bkring->nr_hwtail; // rtail too ? + + /* pass packets to the switch */ + nm_txsync_prologue(bkring); // XXX error checking ? + netmap_vp_txsync(vpna, ring_nr, flags); + + /* mark all buffers as released on this ring */ + ring->head = ring->cur = kring->nr_hwtail; + ring->tail = kring->rtail; + /* another call to actually release the buffers */ + if (!is_host_ring) { + error = kring->nm_sync(kring, 0); + } else { + /* mark all packets as released, as in the + * second part of netmap_rxsync_from_host() + */ + kring->nr_hwcur = kring->nr_hwtail; + nm_rxsync_finalize(kring); + } + +put_out: + nm_kr_put(kring); + return error; +} + + +static int +netmap_bwrap_register(struct netmap_adapter *na, int onoff) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_vp_adapter *hostna = &bna->host; + int error; + + ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off"); + + if (onoff) { + int i; + + hwna->na_lut = na->na_lut; + hwna->na_lut_objtotal = na->na_lut_objtotal; + + if (hostna->na_bdg) { + hostna->up.na_lut = na->na_lut; + hostna->up.na_lut_objtotal = na->na_lut_objtotal; + } + + /* cross-link the netmap rings + * The original number of rings comes from hwna, + * rx rings on one side equals tx rings on the other. 
+ */ + for (i = 0; i < na->num_rx_rings + 1; i++) { + hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots; + hwna->tx_rings[i].ring = na->rx_rings[i].ring; + } + for (i = 0; i < na->num_tx_rings + 1; i++) { + hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots; + hwna->rx_rings[i].ring = na->tx_rings[i].ring; + } + } + + if (hwna->ifp) { + error = hwna->nm_register(hwna, onoff); + if (error) + return error; + } + + bdg_netmap_reg(na, onoff); + + if (onoff) { + bna->save_notify = hwna->nm_notify; + hwna->nm_notify = netmap_bwrap_intr_notify; + } else { + hwna->nm_notify = bna->save_notify; + hwna->na_lut = NULL; + hwna->na_lut_objtotal = 0; + } + + return 0; +} + + +static int +netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, + u_int *rxr, u_int *rxd) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + + /* forward the request */ + netmap_update_config(hwna); + /* swap the results */ + *txr = hwna->num_rx_rings; + *txd = hwna->num_rx_desc; + *rxr = hwna->num_tx_rings; + *rxd = hwna->num_rx_desc; + + return 0; +} + + +static int +netmap_bwrap_krings_create(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_adapter *hostna = &bna->host.up; + int error; + + ND("%s", NM_IFPNAME(na->ifp)); + + error = netmap_vp_krings_create(na); + if (error) + return error; + + error = hwna->nm_krings_create(hwna); + if (error) { + netmap_vp_krings_delete(na); + return error; + } + + if (na->na_flags & NAF_HOST_RINGS) { + hostna->tx_rings = na->tx_rings + na->num_tx_rings; + hostna->rx_rings = na->rx_rings + na->num_rx_rings; + } + + return 0; +} + + +static void +netmap_bwrap_krings_delete(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + + ND("%s", NM_IFPNAME(na->ifp)); + + hwna->nm_krings_delete(hwna); + netmap_vp_krings_delete(na); +} + + +/* notify method for the bridge-->hwna direction */ +static int +netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_kring *kring, *hw_kring; + struct netmap_ring *ring; + u_int lim; + int error = 0; + + if (tx == NR_TX) + return EINVAL; + + kring = &na->rx_rings[ring_n]; + hw_kring = &hwna->tx_rings[ring_n]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + + if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) + return 0; + mtx_lock(&kring->q_lock); + /* first step: simulate a user wakeup on the rx ring */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); + /* second step: the simulated user consumes all new packets */ + ring->head = ring->cur = ring->tail; + + /* third step: the new packets are sent on the tx ring + * (which is actually the same ring) + */ + /* set tail to what the hw expects */ + ring->tail = hw_kring->rtail; + nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? 
+ error = hw_kring->nm_sync(hw_kring, flags); + + /* fourth step: now we are back the rx ring */ + /* claim ownership on all hw owned bufs */ + ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */ + ring->tail = kring->rtail; /* restore saved value of tail, for safety */ + + /* fifth step: the user goes to sleep again, causing another rxsync */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); + mtx_unlock(&kring->q_lock); + return error; +} + + +static int +netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_adapter *port_na = &bna->up.up; + if (tx == NR_TX || ring_n != 0) + return EINVAL; + return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); +} + + +/* attach a bridge wrapper to the 'real' device */ +static int +netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) +{ + struct netmap_bwrap_adapter *bna; + struct netmap_adapter *na; + struct netmap_adapter *hwna = NA(real); + struct netmap_adapter *hostna; + int error; + + + bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (bna == NULL) + return ENOMEM; + + na = &bna->up.up; + na->ifp = fake; + /* fill the ring data for the bwrap adapter with rx/tx meanings + * swapped. The real cross-linking will be done during register, + * when all the krings will have been created. + */ + na->num_rx_rings = hwna->num_tx_rings; + na->num_tx_rings = hwna->num_rx_rings; + na->num_tx_desc = hwna->num_rx_desc; + na->num_rx_desc = hwna->num_tx_desc; + na->nm_dtor = netmap_bwrap_dtor; + na->nm_register = netmap_bwrap_register; + // na->nm_txsync = netmap_bwrap_txsync; + // na->nm_rxsync = netmap_bwrap_rxsync; + na->nm_config = netmap_bwrap_config; + na->nm_krings_create = netmap_bwrap_krings_create; + na->nm_krings_delete = netmap_bwrap_krings_delete; + na->nm_notify = netmap_bwrap_notify; + na->nm_mem = hwna->nm_mem; + na->na_private = na; /* prevent NIOCREGIF */ + bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ + + bna->hwna = hwna; + netmap_adapter_get(hwna); + hwna->na_private = bna; /* weak reference */ + + if (hwna->na_flags & NAF_HOST_RINGS) { + na->na_flags |= NAF_HOST_RINGS; + hostna = &bna->host.up; + hostna->ifp = hwna->ifp; + hostna->num_tx_rings = 1; + hostna->num_tx_desc = hwna->num_rx_desc; + hostna->num_rx_rings = 1; + hostna->num_rx_desc = hwna->num_tx_desc; + // hostna->nm_txsync = netmap_bwrap_host_txsync; + // hostna->nm_rxsync = netmap_bwrap_host_rxsync; + hostna->nm_notify = netmap_bwrap_host_notify; + hostna->nm_mem = na->nm_mem; + hostna->na_private = bna; + } + + ND("%s<->%s txr %d txd %d rxr %d rxd %d", + fake->if_xname, real->if_xname, + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + + error = netmap_attach_common(na); + if (error) { + netmap_adapter_put(hwna); + free(bna, M_DEVBUF); + return error; + } + return 0; +} + + +void +netmap_init_bridges(void) +{ + int i; + bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ + for (i = 0; i < NM_BRIDGES; i++) + BDG_RWINIT(&nm_bridges[i]); +} +#endif /* WITH_VALE */ |
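
The learning-bridge path above (nm_bridge_rthash() and netmap_bdg_learning()) indexes a fixed-size forwarding table with a Jenkins-style hash of the 6-byte MAC address, overwriting on collision and falling back to broadcast when the destination is unknown. The standalone sketch below reproduces that scheme in user space; NM_BDG_HASH, struct hash_ent, learn() and lookup() are local stand-ins for illustration, not the kernel definitions.

/* Illustrative user-space sketch of the hash-based MAC learning used by
 * the VALE switch above.  Constants and struct names mirror the kernel
 * code but are redefined locally; this is not the in-kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NM_BDG_HASH	1024	/* table size, must be a power of 2 */

struct hash_ent {
	uint64_t mac;	/* 48-bit MAC in the low bytes */
	uint16_t port;
};

static struct hash_ent ht[NM_BDG_HASH];

#define mix(a, b, c)                                  \
do {                                                  \
	a -= b; a -= c; a ^= (c >> 13);               \
	b -= c; b -= a; b ^= (a << 8);                \
	c -= a; c -= b; c ^= (b >> 13);               \
	a -= b; a -= c; a ^= (c >> 12);               \
	b -= c; b -= a; b ^= (a << 16);               \
	c -= a; c -= b; c ^= (b >> 5);                \
	a -= b; a -= c; a ^= (c >> 3);                \
	b -= c; b -= a; b ^= (a << 10);               \
	c -= a; c -= b; c ^= (b >> 15);               \
} while (0)

static uint32_t
rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; /* hash key */

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];
	mix(a, b, c);
	return c & (NM_BDG_HASH - 1);
}

/* learn: remember on which port a source MAC was seen */
static void
learn(const uint8_t *src, uint16_t port)
{
	uint32_t h = rthash(src);
	uint64_t mac = 0;

	memcpy(&mac, src, 6);	/* same byte folding for learn and lookup */
	ht[h].mac = mac;	/* collisions simply overwrite the entry */
	ht[h].port = port;
}

/* lookup: -1 means "unknown destination, flood to all ports" */
static int
lookup(const uint8_t *dst)
{
	uint32_t h = rthash(dst);
	uint64_t mac = 0;

	memcpy(&mac, dst, 6);
	return (ht[h].mac == mac) ? (int)ht[h].port : -1;
}

int
main(void)
{
	uint8_t m[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };

	learn(m, 3);
	printf("port for mac: %d\n", lookup(m));	/* prints 3 */
	return 0;
}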
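
The lease mechanism used by nm_kr_space() and nm_kr_lease() lets several senders reserve disjoint ranges of a destination RX ring under the queue lock and copy into them after dropping it. The sketch below mirrors only the modular arithmetic (the busy span between hwcur and hwlease, with one slot always left empty); struct ring_state, rx_space() and rx_lease() are illustrative names, not kernel interfaces.

/* Minimal sketch of the ring arithmetic behind nm_kr_space()/nm_kr_lease():
 * "busy" slots are those between hwcur (not yet released by the consumer)
 * and hwlease (already promised to writers); one slot is always kept empty.
 * This mirrors the logic only; it is not the kernel data structure.
 */
#include <assert.h>
#include <stdio.h>

struct ring_state {
	unsigned num_slots;	/* ring size */
	unsigned hwcur;		/* first slot not yet released by the reader */
	unsigned hwlease;	/* first slot not yet promised to a writer */
};

static unsigned
rx_space(const struct ring_state *k)
{
	int busy = (int)k->hwlease - (int)k->hwcur;

	if (busy < 0)
		busy += k->num_slots;	/* wraparound */
	return k->num_slots - 1 - busy;	/* keep one slot empty */
}

/* reserve n slots, returning the start of the reserved range */
static unsigned
rx_lease(struct ring_state *k, unsigned n)
{
	unsigned start = k->hwlease;

	assert(n <= rx_space(k));
	k->hwlease = (k->hwlease + n) % k->num_slots;
	return start;
}

int
main(void)
{
	struct ring_state k = { .num_slots = 8, .hwcur = 6, .hwlease = 2 };

	printf("space %u\n", rx_space(&k));		/* busy = 4, space = 3 */
	printf("lease starts at %u\n", rx_lease(&k, 3));	/* 2, hwlease -> 5 */
	return 0;
}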
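
When a destination port has a smaller maximum frame size than the source, nm_bdg_flush() enlarges the slot reservation using the bound derived from needed * src_mfs + x * H <= x * dst_mfs, i.e. the smallest x no smaller than needed * src_mfs / (dst_mfs - H). The snippet below only re-derives that arithmetic; gso_slots() and the 54-byte WORST_CASE_GSO_HEADER value (Ethernet + IPv4 + TCP headers) are assumptions for the example, not the kernel constant.

/* Illustrative recomputation of the destination-slot bound used in
 * nm_bdg_flush() when the destination MFS is smaller than the source MFS.
 * WORST_CASE_GSO_HEADER here is a placeholder value; the kernel defines
 * its own constant.
 */
#include <stdio.h>

#define WORST_CASE_GSO_HEADER	(14 + 20 + 20)	/* eth + IPv4 + TCP, assumed */

/* smallest x with needed*src_mfs + x*hdr <= x*dst_mfs, rounded up */
static unsigned
gso_slots(unsigned needed, unsigned src_mfs, unsigned dst_mfs)
{
	return (needed * src_mfs) / (dst_mfs - WORST_CASE_GSO_HEADER) + 1;
}

int
main(void)
{
	/* e.g. 4 source slots of 9000 bytes re-segmented into 1514-byte frames */
	printf("%u\n", gso_slots(4, 9000, 1514));	/* prints 25 */
	return 0;
}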
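
In the copy loop of nm_bdg_flush(), the per-destination unicast list and the broadcast list are both threaded through the ft array and drained by always taking the list whose head index is smaller; because NM_FT_NULL is larger than any valid index, an exhausted list is never picked while the other still has entries. A minimal model of that merge, with FT_NULL and struct entry as local stand-ins:

/* Sketch of the two-list merge used in the nm_bdg_flush() copy loop:
 * both lists are threaded through the same array via "next" indexes and
 * terminated by a sentinel larger than any valid index, so picking the
 * smaller head drains both lists in index order.
 */
#include <stdio.h>
#include <stdint.h>

#define FT_NULL	UINT32_MAX	/* sentinel, plays the role of NM_FT_NULL */

struct entry {
	uint32_t next;	/* index of next entry in the same list, or FT_NULL */
	char tag;	/* 'u' unicast, 'b' broadcast, for printing only */
};

int
main(void)
{
	/* entries 0,2,4 form the unicast list; 1,3 the broadcast list */
	struct entry ft[5] = {
		{ 2, 'u' }, { 3, 'b' }, { 4, 'u' },
		{ FT_NULL, 'b' }, { FT_NULL, 'u' },
	};
	uint32_t next = 0;	/* head of the unicast list */
	uint32_t brd_next = 1;	/* head of the broadcast list */

	while (next != FT_NULL || brd_next != FT_NULL) {
		uint32_t i;

		if (next < brd_next) {	/* FT_NULL compares larger than any index */
			i = next;
			next = ft[i].next;
		} else {
			i = brd_next;
			brd_next = ft[i].next;
		}
		printf("deliver slot %u (%c)\n", (unsigned)i, ft[i].tag);
	}
	return 0;
}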