author    luigi <luigi@FreeBSD.org>    2014-02-18 05:46:19 +0000
committer luigi <luigi@FreeBSD.org>    2014-02-18 05:46:19 +0000
commit    c9f2fff1da752248a19cc8a978f43fd521639708 (patch)
tree      c085b4844c52900cd273b45d3fe39706e0fc4bed /sys/dev/netmap
parent    5bacc3bb87b954978543b0d82a4d5705e33f5c06 (diff)
missing files from previous commit...
Diffstat (limited to 'sys/dev/netmap')
-rw-r--r--  sys/dev/netmap/netmap_freebsd.c       655
-rw-r--r--  sys/dev/netmap/netmap_generic.c       806
-rw-r--r--  sys/dev/netmap/netmap_mbq.c           163
-rw-r--r--  sys/dev/netmap/netmap_mbq.h            78
-rw-r--r--  sys/dev/netmap/netmap_mem2.h          227
-rw-r--r--  sys/dev/netmap/netmap_offloadings.c   401
-rw-r--r--  sys/dev/netmap/netmap_pipe.c          711
-rw-r--r--  sys/dev/netmap/netmap_vale.c         2103
8 files changed, 5144 insertions, 0 deletions
diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
new file mode 100644
index 0000000..a8e287c
--- /dev/null
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#include <sys/types.h>
+#include <sys/module.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/poll.h> /* POLLIN, POLLOUT */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/conf.h> /* DEV_MODULE */
+#include <sys/endian.h>
+
+#include <sys/rwlock.h>
+
+#include <vm/vm.h> /* vtophys */
+#include <vm/pmap.h> /* vtophys */
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/uma.h>
+
+
+#include <sys/malloc.h>
+#include <sys/socket.h> /* sockaddrs */
+#include <sys/selinfo.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <netinet/in.h> /* in6_cksum_pseudo() */
+#include <machine/in_cksum.h> /* in_pseudo(), in_cksum_hdr() */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+
+/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
+
+rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum)
+{
+ /* TODO XXX please use the FreeBSD implementation for this. */
+ uint16_t *words = (uint16_t *)data;
+ int nw = len / 2;
+ int i;
+
+ for (i = 0; i < nw; i++)
+ cur_sum += be16toh(words[i]);
+
+ if (len & 1)
+ cur_sum += (data[len-1] << 8);
+
+ return cur_sum;
+}
+
+/* Fold a raw checksum: 'cur_sum' is in host byte order, while the
+ * return value is in network byte order.
+ */
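+/* Illustrative example (not part of the original sources): folding
+ * cur_sum = 0x23456 gives 0x3456 + 0x2 = 0x3458, which has no carry left,
+ * so the function returns htobe16(~0x3458 & 0xFFFF) = htobe16(0xcba7),
+ * i.e. the one's complement of the folded sum in network byte order.
+ */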
+uint16_t nm_csum_fold(rawsum_t cur_sum)
+{
+ /* TODO XXX please use the FreeBSD implementation for this. */
+ while (cur_sum >> 16)
+ cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16);
+
+ return htobe16((~cur_sum) & 0xFFFF);
+}
+
+uint16_t nm_csum_ipv4(struct nm_iphdr *iph)
+{
+#if 0
+ return in_cksum_hdr((void *)iph);
+#else
+ return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0));
+#endif
+}
+
+void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data,
+ size_t datalen, uint16_t *check)
+{
+ uint16_t pseudolen = datalen + iph->protocol;
+
+ /* Compute and insert the pseudo-header checksum. */
+ *check = in_pseudo(iph->saddr, iph->daddr,
+ htobe16(pseudolen));
+ /* Compute the checksum on TCP/UDP header + payload
+ * (includes the pseudo-header).
+ */
+ *check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+}
+
+void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data,
+ size_t datalen, uint16_t *check)
+{
+#ifdef INET6
+ *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0);
+ *check = nm_csum_fold(nm_csum_raw(data, datalen, 0));
+#else
+ static int notsupported = 0;
+ if (!notsupported) {
+ notsupported = 1;
+ D("inet6 segmentation not supported");
+ }
+#endif
+}
+
+
+/*
+ * Intercept the rx routine in the standard device driver.
+ * Second argument is non-zero to intercept, 0 to restore
+ */
+int
+netmap_catch_rx(struct netmap_adapter *na, int intercept)
+{
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct ifnet *ifp = na->ifp;
+
+ if (intercept) {
+ if (gna->save_if_input) {
+ D("cannot intercept again");
+ return EINVAL; /* already set */
+ }
+ gna->save_if_input = ifp->if_input;
+ ifp->if_input = generic_rx_handler;
+ } else {
+ if (!gna->save_if_input){
+ D("cannot restore");
+ return EINVAL; /* not saved */
+ }
+ ifp->if_input = gna->save_if_input;
+ gna->save_if_input = NULL;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Intercept the packet steering routine in the tx path,
+ * so that we can decide which queue is used for an mbuf.
+ * Second argument is non-zero to intercept, 0 to restore.
+ * On freebsd we just intercept if_transmit.
+ */
+void
+netmap_catch_tx(struct netmap_generic_adapter *gna, int enable)
+{
+ struct netmap_adapter *na = &gna->up.up;
+ struct ifnet *ifp = na->ifp;
+
+ if (enable) {
+ na->if_transmit = ifp->if_transmit;
+ ifp->if_transmit = netmap_transmit;
+ } else {
+ ifp->if_transmit = na->if_transmit;
+ }
+}
+
+
+/*
+ * Transmit routine used by generic_netmap_txsync(). Returns 0 on success
+ * and non-zero on error (which may be packet drops or other errors).
+ * addr and len identify the netmap buffer, m is the (preallocated)
+ * mbuf to use for transmissions.
+ *
+ * We should add a reference to the mbuf so the m_freem() at the end
+ * of the transmission does not consume resources.
+ *
+ * On FreeBSD, and on multiqueue cards, we can force the queue using
+ * if ((m->m_flags & M_FLOWID) != 0)
+ * i = m->m_pkthdr.flowid % adapter->num_queues;
+ * else
+ * i = curcpu % adapter->num_queues;
+ *
+ */
+int
+generic_xmit_frame(struct ifnet *ifp, struct mbuf *m,
+ void *addr, u_int len, u_int ring_nr)
+{
+ int ret;
+
+ m->m_len = m->m_pkthdr.len = 0;
+
+ // copy data to the mbuf
+ m_copyback(m, 0, len, addr);
+ // inc refcount. We are alone, so we can skip the atomic
+ atomic_fetchadd_int(m->m_ext.ref_cnt, 1);
+ m->m_flags |= M_FLOWID;
+ m->m_pkthdr.flowid = ring_nr;
+ m->m_pkthdr.rcvif = ifp; /* used for tx notification */
+ ret = NA(ifp)->if_transmit(ifp, m);
+ return ret;
+}
+
+
+/*
+ * The following two functions are empty until we have a generic
+ * way to extract the info from the ifp
+ */
+int
+generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx)
+{
+ D("called");
+ return 0;
+}
+
+
+void
+generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq)
+{
+ D("called");
+ *txq = netmap_generic_rings;
+ *rxq = netmap_generic_rings;
+}
+
+
+void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na)
+{
+ ND("called");
+ mit->mit_pending = 0;
+ mit->mit_na = na;
+}
+
+
+void netmap_mitigation_start(struct nm_generic_mit *mit)
+{
+ ND("called");
+}
+
+
+void netmap_mitigation_restart(struct nm_generic_mit *mit)
+{
+ ND("called");
+}
+
+
+int netmap_mitigation_active(struct nm_generic_mit *mit)
+{
+ ND("called");
+ return 0;
+}
+
+
+void netmap_mitigation_cleanup(struct nm_generic_mit *mit)
+{
+ ND("called");
+}
+
+
+/*
+ * In order to track whether pages are still mapped, we hook into
+ * the standard cdev_pager and intercept the constructor and
+ * destructor.
+ */
+
+struct netmap_vm_handle_t {
+ struct cdev *dev;
+ struct netmap_priv_d *priv;
+};
+
+
+static int
+netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred, u_short *color)
+{
+ struct netmap_vm_handle_t *vmh = handle;
+
+ if (netmap_verbose)
+ D("handle %p size %jd prot %d foff %jd",
+ handle, (intmax_t)size, prot, (intmax_t)foff);
+ dev_ref(vmh->dev);
+ return 0;
+}
+
+
+static void
+netmap_dev_pager_dtor(void *handle)
+{
+ struct netmap_vm_handle_t *vmh = handle;
+ struct cdev *dev = vmh->dev;
+ struct netmap_priv_d *priv = vmh->priv;
+
+ if (netmap_verbose)
+ D("handle %p", handle);
+ netmap_dtor(priv);
+ free(vmh, M_DEVBUF);
+ dev_rel(dev);
+}
+
+
+static int
+netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
+ int prot, vm_page_t *mres)
+{
+ struct netmap_vm_handle_t *vmh = object->handle;
+ struct netmap_priv_d *priv = vmh->priv;
+ vm_paddr_t paddr;
+ vm_page_t page;
+ vm_memattr_t memattr;
+ vm_pindex_t pidx;
+
+ ND("object %p offset %jd prot %d mres %p",
+ object, (intmax_t)offset, prot, mres);
+ memattr = object->memattr;
+ pidx = OFF_TO_IDX(offset);
+ paddr = netmap_mem_ofstophys(priv->np_mref, offset);
+ if (paddr == 0)
+ return VM_PAGER_FAIL;
+
+ if (((*mres)->flags & PG_FICTITIOUS) != 0) {
+ /*
+ * If the passed in result page is a fake page, update it with
+ * the new physical address.
+ */
+ page = *mres;
+ vm_page_updatefake(page, paddr, memattr);
+ } else {
+ /*
+ * Replace the passed in reqpage page with our own fake page and
+ * free up all of the original pages.
+ */
+#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */
+#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK
+#define VM_OBJECT_WLOCK VM_OBJECT_LOCK
+#endif /* VM_OBJECT_WUNLOCK */
+
+ VM_OBJECT_WUNLOCK(object);
+ page = vm_page_getfake(paddr, memattr);
+ VM_OBJECT_WLOCK(object);
+ vm_page_lock(*mres);
+ vm_page_free(*mres);
+ vm_page_unlock(*mres);
+ *mres = page;
+ vm_page_insert(page, object, pidx);
+ }
+ page->valid = VM_PAGE_BITS_ALL;
+ return (VM_PAGER_OK);
+}
+
+
+static struct cdev_pager_ops netmap_cdev_pager_ops = {
+ .cdev_pg_ctor = netmap_dev_pager_ctor,
+ .cdev_pg_dtor = netmap_dev_pager_dtor,
+ .cdev_pg_fault = netmap_dev_pager_fault,
+};
+
+
+static int
+netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
+ vm_size_t objsize, vm_object_t *objp, int prot)
+{
+ int error;
+ struct netmap_vm_handle_t *vmh;
+ struct netmap_priv_d *priv;
+ vm_object_t obj;
+
+ if (netmap_verbose)
+ D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
+ (intmax_t )*foff, (intmax_t )objsize, objp, prot);
+
+ vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (vmh == NULL)
+ return ENOMEM;
+ vmh->dev = cdev;
+
+ NMG_LOCK();
+ error = devfs_get_cdevpriv((void**)&priv);
+ if (error)
+ goto err_unlock;
+ vmh->priv = priv;
+ priv->np_refcount++;
+ NMG_UNLOCK();
+
+ error = netmap_get_memory(priv);
+ if (error)
+ goto err_deref;
+
+ obj = cdev_pager_allocate(vmh, OBJT_DEVICE,
+ &netmap_cdev_pager_ops, objsize, prot,
+ *foff, NULL);
+ if (obj == NULL) {
+ D("cdev_pager_allocate failed");
+ error = EINVAL;
+ goto err_deref;
+ }
+
+ *objp = obj;
+ return 0;
+
+err_deref:
+ NMG_LOCK();
+ priv->np_refcount--;
+err_unlock:
+ NMG_UNLOCK();
+// err:
+ free(vmh, M_DEVBUF);
+ return error;
+}
+
+
+// XXX can we remove this ?
+static int
+netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ if (netmap_verbose)
+ D("dev %p fflag 0x%x devtype %d td %p",
+ dev, fflag, devtype, td);
+ return 0;
+}
+
+
+static int
+netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct netmap_priv_d *priv;
+ int error;
+
+ (void)dev;
+ (void)oflags;
+ (void)devtype;
+ (void)td;
+
+ // XXX wait or nowait ?
+ priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (priv == NULL)
+ return ENOMEM;
+
+ error = devfs_set_cdevpriv(priv, netmap_dtor);
+ if (error)
+ return error;
+
+ priv->np_refcount = 1;
+
+ return 0;
+}
+
+/******************** kqueue support ****************/
+
+/*
+ * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED.
+ * We use a non-zero argument to distinguish the call from the one
+ * in kevent_scan() which instead also needs to run netmap_poll().
+ * The knote uses a global mutex for the time being. We might
+ * try to reuse the one in the si, but it is not allocated
+ * permanently so it might be a bit tricky.
+ *
+ * The *kqfilter function registers one or another f_event
+ * depending on read or write mode.
+ * In the call to f_event() td_fpop is NULL so any child function
+ * calling devfs_get_cdevpriv() would fail - and we need it in
+ * netmap_poll(). As a workaround we store priv into kn->kn_hook
+ * and pass it as first argument to netmap_poll(), which then
+ * uses the failure to tell that we are called from f_event()
+ * and do not need the selrecord().
+ */
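+/*
+ * Illustrative userspace sketch (an assumption for clarity, not part of
+ * this file): a process typically registers a netmap fd with kqueue as
+ *
+ *	struct kevent ev;
+ *	EV_SET(&ev, netmap_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &ev, 1, NULL, 0, NULL);	// ends up in netmap_kqfilter()
+ *	kevent(kq, NULL, 0, &ev, 1, NULL);	// woken up via freebsd_selwakeup()
+ */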
+
+void freebsd_selwakeup(struct selinfo *si, int pri);
+
+void
+freebsd_selwakeup(struct selinfo *si, int pri)
+{
+ if (netmap_verbose)
+ D("on knote %p", &si->si_note);
+ selwakeuppri(si, pri);
+ /* use a non-zero hint to distinguish this notification from the
+ * call done in kqueue_scan(), which uses 0
+ */
+ KNOTE_UNLOCKED(&si->si_note, 0x100 /* notification */);
+}
+
+static void
+netmap_knrdetach(struct knote *kn)
+{
+ struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook;
+ struct selinfo *si = priv->np_rxsi;
+
+ D("remove selinfo %p", si);
+ knlist_remove(&si->si_note, kn, 0);
+}
+
+static void
+netmap_knwdetach(struct knote *kn)
+{
+ struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook;
+ struct selinfo *si = priv->np_txsi;
+
+ D("remove selinfo %p", si);
+ knlist_remove(&si->si_note, kn, 0);
+}
+
+/*
+ * callback invoked both from external notifications and from our own
+ * calls to kevent(). For the former we just return 1 (ready),
+ * since we do not know better.
+ * For the latter we call netmap_poll() and return 0/1 accordingly.
+ */
+static int
+netmap_knrw(struct knote *kn, long hint, int events)
+{
+ struct netmap_priv_d *priv;
+ int revents;
+
+ if (hint != 0) {
+ ND(5, "call from notify");
+ return 1; /* assume we are ready */
+ }
+ priv = kn->kn_hook;
+ /* the notification may come from an external thread,
+ * in which case we do not want to run netmap_poll().
+ * This should be filtered above, but check just in case.
+ */
+ if (curthread != priv->np_td) { /* should not happen */
+ RD(5, "curthread changed %p %p", curthread, priv->np_td);
+ return 1;
+ } else {
+ revents = netmap_poll((void *)priv, events, curthread);
+ return (events & revents) ? 1 : 0;
+ }
+}
+
+static int
+netmap_knread(struct knote *kn, long hint)
+{
+ return netmap_knrw(kn, hint, POLLIN);
+}
+
+static int
+netmap_knwrite(struct knote *kn, long hint)
+{
+ return netmap_knrw(kn, hint, POLLOUT);
+}
+
+static struct filterops netmap_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = netmap_knrdetach,
+ .f_event = netmap_knread,
+};
+
+static struct filterops netmap_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = netmap_knwdetach,
+ .f_event = netmap_knwrite,
+};
+
+
+/*
+ * This is called when a thread invokes kevent() to record
+ * a change in the configuration of the kqueue().
+ * The 'priv' should be the same as in the netmap device.
+ */
+static int
+netmap_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct netmap_priv_d *priv;
+ int error;
+ struct netmap_adapter *na;
+ struct selinfo *si;
+ int ev = kn->kn_filter;
+
+ if (ev != EVFILT_READ && ev != EVFILT_WRITE) {
+ D("bad filter request %d", ev);
+ return 1;
+ }
+ error = devfs_get_cdevpriv((void**)&priv);
+ if (error) {
+ D("device not yet setup");
+ return 1;
+ }
+ na = priv->np_na;
+ if (na == NULL) {
+ D("no netmap adapter for this file descriptor");
+ return 1;
+ }
+ /* the si is indicated in the priv */
+ si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi;
+ // XXX lock(priv) ?
+ kn->kn_fop = (ev == EVFILT_WRITE) ?
+ &netmap_wfiltops : &netmap_rfiltops;
+ kn->kn_hook = priv;
+ knlist_add(&si->si_note, kn, 1);
+ // XXX unlock(priv)
+ ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s",
+ na, na->ifp->if_xname, curthread, priv, kn,
+ priv->np_nifp,
+ kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH");
+ return 0;
+}
+
+struct cdevsw netmap_cdevsw = {
+ .d_version = D_VERSION,
+ .d_name = "netmap",
+ .d_open = netmap_open,
+ .d_mmap_single = netmap_mmap_single,
+ .d_ioctl = netmap_ioctl,
+ .d_poll = netmap_poll,
+ .d_kqfilter = netmap_kqfilter,
+ .d_close = netmap_close,
+};
+/*--- end of kqueue support ----*/
+
+/*
+ * Kernel entry point.
+ *
+ * Initialize/finalize the module and return.
+ *
+ * Return 0 on success, errno on failure.
+ */
+static int
+netmap_loader(__unused struct module *module, int event, __unused void *arg)
+{
+ int error = 0;
+
+ switch (event) {
+ case MOD_LOAD:
+ error = netmap_init();
+ break;
+
+ case MOD_UNLOAD:
+ netmap_fini();
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+
+DEV_MODULE(netmap, netmap_loader, NULL);
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
new file mode 100644
index 0000000..63253b6
--- /dev/null
+++ b/sys/dev/netmap/netmap_generic.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module implements netmap support on top of standard,
+ * unmodified device drivers.
+ *
+ * A NIOCREGIF request is handled here if the device does not
+ * have native support. TX and RX rings are emulated as follows:
+ *
+ * NIOCREGIF
+ * We preallocate a block of TX mbufs (roughly as many as
+ * tx descriptors; the number is not critical) to speed up
+ * operation during transmissions. The refcount on most of
+ * these buffers is artificially bumped up so we can recycle
+ * them more easily. Also, the destructor is intercepted
+ * so we use it as an interrupt notification to wake up
+ * processes blocked on a poll().
+ *
+ * For each receive ring we allocate one "struct mbq"
+ * (an mbuf tailq plus a spinlock). We intercept packets
+ * (through if_input)
+ * on the receive path and put them in the mbq from which
+ * netmap receive routines can grab them.
+ *
+ * TX:
+ * in the generic_txsync() routine, netmap buffers are copied
+ * (or linked, in a future) to the preallocated mbufs
+ * and pushed to the transmit queue. Some of these mbufs
+ * (those with NS_REPORT, or otherwise every half ring)
+ * have the refcount=1, others have refcount=2.
+ * When the destructor is invoked, we take that as
+ * a notification that all mbufs up to that one in
+ * the specific ring have been completed, and generate
+ * the equivalent of a transmit interrupt.
+ *
+ * RX:
+ *
+ */
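+/*
+ * Illustrative sketch of the userspace side (assumptions for clarity,
+ * not part of this file): an application uses an emulated adapter
+ * exactly like a native one, e.g.
+ *
+ *	struct nmreq req = { .nr_version = NETMAP_API };
+ *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));  // hypothetical NIC
+ *	fd = open("/dev/netmap", O_RDWR);
+ *	ioctl(fd, NIOCREGIF, &req);	// falls back to this module
+ *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
+ *	           MAP_SHARED, fd, 0);
+ *	// then poll() on fd and use the rings as usual
+ */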
+
+#ifdef __FreeBSD__
+
+#include <sys/cdefs.h> /* prerequisite */
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/lock.h> /* PROT_EXEC */
+#include <sys/rwlock.h>
+#include <sys/socket.h> /* sockaddrs */
+#include <sys/selinfo.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* in netmap_kern.h */
+
+// XXX temporary - D() defined here
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+#define rtnl_lock() D("rtnl_lock called");
+#define rtnl_unlock() D("rtnl_unlock called");
+#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid)
+#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid)
+#define smp_mb()
+
+/*
+ * mbuf wrappers
+ */
+
+/*
+ * we allocate an EXT_PACKET
+ */
+#define netmap_get_mbuf(len) m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR|M_NOFREE)
+
+/* mbuf destructor, also need to change the type to EXT_EXTREF,
+ * add an M_NOFREE flag, and then clear the flag and
+ * chain into uma_zfree(zone_pack, mf)
+ * (or reinstall the buffer ?)
+ */
+#define SET_MBUF_DESTRUCTOR(m, fn) do { \
+ (m)->m_ext.ext_free = (void *)fn; \
+ (m)->m_ext.ext_type = EXT_EXTREF; \
+} while (0)
+
+
+#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1)
+
+
+
+#else /* linux */
+
+#include "bsd_glue.h"
+
+#include <linux/rtnetlink.h> /* rtnl_[un]lock() */
+#include <linux/ethtool.h> /* struct ethtool_ops, get_ringparam */
+#include <linux/hrtimer.h>
+
+//#define RATE /* Enables communication statistics. */
+
+//#define REG_RESET
+
+#endif /* linux */
+
+
+/* Common headers. */
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+
+
+/* ======================== usage stats =========================== */
+
+#ifdef RATE
+#define IFRATE(x) x
+struct rate_stats {
+ unsigned long txpkt;
+ unsigned long txsync;
+ unsigned long txirq;
+ unsigned long rxpkt;
+ unsigned long rxirq;
+ unsigned long rxsync;
+};
+
+struct rate_context {
+ unsigned refcount;
+ struct timer_list timer;
+ struct rate_stats new;
+ struct rate_stats old;
+};
+
+#define RATE_PRINTK(_NAME_) \
+ printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD);
+#define RATE_PERIOD 2
+static void rate_callback(unsigned long arg)
+{
+ struct rate_context * ctx = (struct rate_context *)arg;
+ struct rate_stats cur = ctx->new;
+ int r;
+
+ RATE_PRINTK(txpkt);
+ RATE_PRINTK(txsync);
+ RATE_PRINTK(txirq);
+ RATE_PRINTK(rxpkt);
+ RATE_PRINTK(rxsync);
+ RATE_PRINTK(rxirq);
+ printk("\n");
+
+ ctx->old = cur;
+ r = mod_timer(&ctx->timer, jiffies +
+ msecs_to_jiffies(RATE_PERIOD * 1000));
+ if (unlikely(r))
+ D("[v1000] Error: mod_timer()");
+}
+
+static struct rate_context rate_ctx;
+
+#else /* !RATE */
+#define IFRATE(x)
+#endif /* !RATE */
+
+
+/* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */
+#define GENERIC_BUF_SIZE netmap_buf_size /* Size of the mbufs in the Tx pool. */
+
+/*
+ * Wrapper used by the generic adapter layer to notify
+ * the poller threads. Differently from netmap_rx_irq(), we check
+ * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq.
+ */
+static void
+netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done)
+{
+ if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP)))
+ return;
+
+ netmap_common_irq(ifp, q, work_done);
+}
+
+
+/* Enable/disable netmap mode for a generic network interface. */
+static int
+generic_netmap_register(struct netmap_adapter *na, int enable)
+{
+ struct ifnet *ifp = na->ifp;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ struct mbuf *m;
+ int error;
+ int i, r;
+
+ if (!na)
+ return EINVAL;
+
+#ifdef REG_RESET
+ error = ifp->netdev_ops->ndo_stop(ifp);
+ if (error) {
+ return error;
+ }
+#endif /* REG_RESET */
+
+ if (enable) { /* Enable netmap mode. */
+ /* Init the mitigation support. */
+ gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!gna->mit) {
+ D("mitigation allocation failed");
+ error = ENOMEM;
+ goto out;
+ }
+ for (r=0; r<na->num_rx_rings; r++)
+ netmap_mitigation_init(&gna->mit[r], na);
+
+ /* Initialize the rx queue, as generic_rx_handler() can
+ * be called as soon as netmap_catch_rx() returns.
+ */
+ for (r=0; r<na->num_rx_rings; r++) {
+ mbq_safe_init(&na->rx_rings[r].rx_queue);
+ }
+
+ /*
+ * Preallocate packet buffers for the tx rings.
+ */
+ for (r=0; r<na->num_tx_rings; r++)
+ na->tx_rings[r].tx_pool = NULL;
+ for (r=0; r<na->num_tx_rings; r++) {
+ na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *),
+ M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!na->tx_rings[r].tx_pool) {
+ D("tx_pool allocation failed");
+ error = ENOMEM;
+ goto free_tx_pools;
+ }
+ for (i=0; i<na->num_tx_desc; i++)
+ na->tx_rings[r].tx_pool[i] = NULL;
+ for (i=0; i<na->num_tx_desc; i++) {
+ m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ if (!m) {
+ D("tx_pool[%d] allocation failed", i);
+ error = ENOMEM;
+ goto free_tx_pools;
+ }
+ na->tx_rings[r].tx_pool[i] = m;
+ }
+ }
+ rtnl_lock();
+ /* Prepare to intercept incoming traffic. */
+ error = netmap_catch_rx(na, 1);
+ if (error) {
+ D("netdev_rx_handler_register() failed (%d)", error);
+ goto register_handler;
+ }
+ ifp->if_capenable |= IFCAP_NETMAP;
+
+ /* Make netmap control the packet steering. */
+ netmap_catch_tx(gna, 1);
+
+ rtnl_unlock();
+
+#ifdef RATE
+ if (rate_ctx.refcount == 0) {
+ D("setup_timer()");
+ memset(&rate_ctx, 0, sizeof(rate_ctx));
+ setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx);
+ if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) {
+ D("Error: mod_timer()");
+ }
+ }
+ rate_ctx.refcount++;
+#endif /* RATE */
+
+ } else if (na->tx_rings[0].tx_pool) {
+ /* Disable netmap mode. We enter here only if the previous
+ generic_netmap_register(na, 1) was successful.
+ If it was not, na->tx_rings[0].tx_pool was set to NULL by the
+ error handling code below. */
+ rtnl_lock();
+
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+
+ /* Release packet steering control. */
+ netmap_catch_tx(gna, 0);
+
+ /* Do not intercept packets on the rx path. */
+ netmap_catch_rx(na, 0);
+
+ rtnl_unlock();
+
+ /* Free the mbufs going to the netmap rings */
+ for (r=0; r<na->num_rx_rings; r++) {
+ mbq_safe_purge(&na->rx_rings[r].rx_queue);
+ mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+ }
+
+ for (r=0; r<na->num_rx_rings; r++)
+ netmap_mitigation_cleanup(&gna->mit[r]);
+ free(gna->mit, M_DEVBUF);
+
+ for (r=0; r<na->num_tx_rings; r++) {
+ for (i=0; i<na->num_tx_desc; i++) {
+ m_freem(na->tx_rings[r].tx_pool[i]);
+ }
+ free(na->tx_rings[r].tx_pool, M_DEVBUF);
+ }
+
+#ifdef RATE
+ if (--rate_ctx.refcount == 0) {
+ D("del_timer()");
+ del_timer(&rate_ctx.timer);
+ }
+#endif
+ }
+
+#ifdef REG_RESET
+ error = ifp->netdev_ops->ndo_open(ifp);
+ if (error) {
+ goto free_tx_pools;
+ }
+#endif
+
+ return 0;
+
+register_handler:
+ rtnl_unlock();
+free_tx_pools:
+ for (r=0; r<na->num_tx_rings; r++) {
+ if (na->tx_rings[r].tx_pool == NULL)
+ continue;
+ for (i=0; i<na->num_tx_desc; i++)
+ if (na->tx_rings[r].tx_pool[i])
+ m_freem(na->tx_rings[r].tx_pool[i]);
+ free(na->tx_rings[r].tx_pool, M_DEVBUF);
+ na->tx_rings[r].tx_pool = NULL;
+ }
+ for (r=0; r<na->num_rx_rings; r++) {
+ netmap_mitigation_cleanup(&gna->mit[r]);
+ mbq_safe_destroy(&na->rx_rings[r].rx_queue);
+ }
+ free(gna->mit, M_DEVBUF);
+out:
+
+ return error;
+}
+
+/*
+ * Callback invoked when the device driver frees an mbuf used
+ * by netmap to transmit a packet. This usually happens when
+ * the NIC notifies the driver that transmission is completed.
+ */
+static void
+generic_mbuf_destructor(struct mbuf *m)
+{
+ if (netmap_verbose)
+ D("Tx irq (%p) queue %d", m, MBUF_TXQ(m));
+ netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL);
+#ifdef __FreeBSD__
+ m->m_ext.ext_type = EXT_PACKET;
+ m->m_ext.ext_free = NULL;
+ if (*(m->m_ext.ref_cnt) == 0)
+ *(m->m_ext.ref_cnt) = 1;
+ uma_zfree(zone_pack, m);
+#endif /* __FreeBSD__ */
+ IFRATE(rate_ctx.new.txirq++);
+}
+
+/* Record completed transmissions and update hwtail.
+ *
+ * The oldest tx buffer not yet completed is at nr_hwtail + 1,
+ * nr_hwcur is the first unsent buffer.
+ */
+static u_int
+generic_netmap_tx_clean(struct netmap_kring *kring)
+{
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int nm_i = nm_next(kring->nr_hwtail, lim);
+ u_int hwcur = kring->nr_hwcur;
+ u_int n = 0;
+ struct mbuf **tx_pool = kring->tx_pool;
+
+ while (nm_i != hwcur) { /* buffers not completed */
+ struct mbuf *m = tx_pool[nm_i];
+
+ if (unlikely(m == NULL)) {
+ /* this is done, try to replenish the entry */
+ tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ if (unlikely(m == NULL)) {
+ D("mbuf allocation failed, XXX error");
+ // XXX how do we proceed ? break ?
+ return -ENOMEM;
+ }
+ } else if (GET_MBUF_REFCNT(m) != 1) {
+ break; /* This mbuf is still busy: its refcnt is 2. */
+ }
+ n++;
+ nm_i = nm_next(nm_i, lim);
+ }
+ kring->nr_hwtail = nm_prev(nm_i, lim);
+ ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
+
+ return n;
+}
+
+
+/*
+ * We have pending packets in the driver between nr_hwtail +1 and hwcur.
+ * Compute a position in the middle, to be used to generate
+ * a notification.
+ */
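+/* Worked example (illustrative only): with 16 slots, nr_hwtail = 14 and
+ * hwcur = 5, ntc = 15 > hwcur, so we take the wrap-around branch:
+ * e = (5 + 16 + 15) / 2 = 18, reduced to e = 2, i.e. roughly halfway
+ * through the pending region [15, 0, 1, 2, 3, 4].
+ */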
+static inline u_int
+generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur)
+{
+ u_int n = kring->nkr_num_slots;
+ u_int ntc = nm_next(kring->nr_hwtail, n-1);
+ u_int e;
+
+ if (hwcur >= ntc) {
+ e = (hwcur + ntc) / 2;
+ } else { /* wrap around */
+ e = (hwcur + n + ntc) / 2;
+ if (e >= n) {
+ e -= n;
+ }
+ }
+
+ if (unlikely(e >= n)) {
+ D("This cannot happen");
+ e = 0;
+ }
+
+ return e;
+}
+
+/*
+ * We have pending packets in the driver between nr_hwtail+1 and hwcur.
+ * Schedule a notification approximately in the middle of the two.
+ * There is a race but this is only called within txsync which does
+ * a double check.
+ */
+static void
+generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
+{
+ struct mbuf *m;
+ u_int e;
+
+ if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) {
+ return; /* all buffers are free */
+ }
+ e = generic_tx_event_middle(kring, hwcur);
+
+ m = kring->tx_pool[e];
+ if (m == NULL) {
+ /* This can happen if there is already an event on the netmap
+ slot 'e': There is nothing to do. */
+ return;
+ }
+ ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m));
+ kring->tx_pool[e] = NULL;
+ SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+
+ // XXX wmb() ?
+ /* Decrement the refcount and free it if we have the last one. */
+ m_freem(m);
+ smp_mb();
+}
+
+
+/*
+ * generic_netmap_txsync() transforms netmap buffers into mbufs
+ * and passes them to the standard device driver
+ * (ndo_start_xmit() or ifp->if_transmit() ).
+ * On linux this is not done directly, but using dev_queue_xmit(),
+ * since it implements the TX flow control (and takes some locks).
+ */
+static int
+generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct ifnet *ifp = na->ifp;
+ struct netmap_kring *kring = &na->tx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ u_int nm_i; /* index into the netmap ring */ // j
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = kring->rhead;
+
+ IFRATE(rate_ctx.new.txsync++);
+
+ // TODO: handle the case of mbuf allocation failure
+
+ rmb();
+
+ /*
+ * First part: process new packets to send.
+ */
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) { /* we have new packets to send */
+ while (nm_i != head) {
+ struct netmap_slot *slot = &ring->slot[nm_i];
+ u_int len = slot->len;
+ void *addr = NMB(slot);
+
+ /* device-specific */
+ struct mbuf *m;
+ int tx_ret;
+
+ NM_CHECK_ADDR_LEN(addr, len);
+
+ /* Take an mbuf from the tx pool and copy in the user packet. */
+ m = kring->tx_pool[nm_i];
+ if (unlikely(!m)) {
+ RD(5, "This should never happen");
+ kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE);
+ if (unlikely(m == NULL)) {
+ D("mbuf allocation failed");
+ break;
+ }
+ }
+ /* XXX we should ask notifications when NS_REPORT is set,
+ * or roughly every half frame. We can optimize this
+ * by lazily requesting notifications only when a
+ * transmission fails. Probably the best way is to
+ * break on failures and set notifications when
+ * ring->cur == ring->tail || nm_i != cur
+ */
+ tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr);
+ if (unlikely(tx_ret)) {
+ RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]",
+ tx_ret, nm_i, head, kring->nr_hwtail);
+ /*
+ * No room for this mbuf in the device driver.
+ * Request a notification FOR A PREVIOUS MBUF,
+ * then call generic_netmap_tx_clean(kring) to do the
+ * double check and see if we can free more buffers.
+ * If there is space continue, else break;
+ * NOTE: the double check is necessary if the problem
+ * occurs in the txsync call after selrecord().
+ * Also, we need some way to tell the caller that not
+ * all buffers were queued onto the device (this was
+ * not a problem with native netmap driver where space
+ * is preallocated). The bridge has a similar problem
+ * and we solve it there by dropping the excess packets.
+ */
+ generic_set_tx_event(kring, nm_i);
+ if (generic_netmap_tx_clean(kring)) { /* space now available */
+ continue;
+ } else {
+ break;
+ }
+ }
+ slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
+ nm_i = nm_next(nm_i, lim);
+ IFRATE(rate_ctx.new.txpkt ++);
+ }
+
+ /* Update hwcur to the next slot to transmit. */
+ kring->nr_hwcur = nm_i; /* not head, we could break early */
+ }
+
+ /*
+ * Second, reclaim completed buffers
+ */
+ if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
+ /* No more available slots? Set a notification event
+ * on a netmap slot that will be cleaned in the future.
+ * No doublecheck is performed, since txsync() will be
+ * called twice by netmap_poll().
+ */
+ generic_set_tx_event(kring, nm_i);
+ }
+ ND("tx #%d, hwtail = %d", n, kring->nr_hwtail);
+
+ generic_netmap_tx_clean(kring);
+
+ nm_txsync_finalize(kring);
+
+ return 0;
+}
+
+
+/*
+ * This handler is registered (through netmap_catch_rx())
+ * within the attached network interface
+ * in the RX subsystem, so that every mbuf passed up by
+ * the driver can be stolen before it reaches the network stack.
+ * Stolen packets are put in a queue where the
+ * generic_netmap_rxsync() callback can extract them.
+ */
+void
+generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
+{
+ struct netmap_adapter *na = NA(ifp);
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
+ u_int work_done;
+ u_int rr = MBUF_RXQ(m); // receive ring number
+
+ if (rr >= na->num_rx_rings) {
+ rr = rr % na->num_rx_rings; // XXX expensive...
+ }
+
+ /* limit the size of the queue */
+ if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) {
+ m_freem(m);
+ } else {
+ mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m);
+ }
+
+ if (netmap_generic_mit < 32768) {
+ /* no rx mitigation, pass notification up */
+ netmap_generic_irq(na->ifp, rr, &work_done);
+ IFRATE(rate_ctx.new.rxirq++);
+ } else {
+ /* same as send combining, filter notification if there is a
+ * pending timer, otherwise pass it up and start a timer.
+ */
+ if (likely(netmap_mitigation_active(&gna->mit[rr]))) {
+ /* Record that there is some pending work. */
+ gna->mit[rr].mit_pending = 1;
+ } else {
+ netmap_generic_irq(na->ifp, rr, &work_done);
+ IFRATE(rate_ctx.new.rxirq++);
+ netmap_mitigation_start(&gna->mit[rr]);
+ }
+ }
+}
+
+/*
+ * generic_netmap_rxsync() extracts mbufs from the queue filled by
+ * generic_rx_handler() and puts their content in the netmap
+ * receive ring.
+ * Access must be protected because the rx handler is asynchronous,
+ */
+static int
+generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ u_int nm_i; /* index into the netmap ring */ //j,
+ u_int n;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const head = nm_rxsync_prologue(kring);
+ int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
+
+ if (head > lim)
+ return netmap_ring_reinit(kring);
+
+ /*
+ * First part: import newly received packets.
+ */
+ if (netmap_no_pendintr || force_update) {
+ /* extract buffers from the rx queue, stop at most one
+ * slot before nr_hwcur (stop_i)
+ */
+ uint16_t slot_flags = kring->nkr_slot_flags;
+ u_int stop_i = nm_prev(kring->nr_hwcur, lim);
+
+ nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */
+ for (n = 0; nm_i != stop_i; n++) {
+ int len;
+ void *addr = NMB(&ring->slot[nm_i]);
+ struct mbuf *m;
+
+ /* we only check the address here on generic rx rings */
+ if (addr == netmap_buffer_base) { /* Bad buffer */
+ return netmap_ring_reinit(kring);
+ }
+ /*
+ * Call the locked version of the function.
+ * XXX Ideally we could grab a batch of mbufs at once
+ * and save some locking overhead.
+ */
+ m = mbq_safe_dequeue(&kring->rx_queue);
+ if (!m) /* no more data */
+ break;
+ len = MBUF_LEN(m);
+ m_copydata(m, 0, len, addr);
+ ring->slot[nm_i].len = len;
+ ring->slot[nm_i].flags = slot_flags;
+ m_freem(m);
+ nm_i = nm_next(nm_i, lim);
+ }
+ if (n) {
+ kring->nr_hwtail = nm_i;
+ IFRATE(rate_ctx.new.rxpkt += n);
+ }
+ kring->nr_kflags &= ~NKR_PENDINTR;
+ }
+
+ // XXX should we invert the order ?
+ /*
+ * Second part: skip past packets that userspace has released.
+ */
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) {
+ /* Userspace has released some packets. */
+ for (n = 0; nm_i != head; n++) {
+ struct netmap_slot *slot = &ring->slot[nm_i];
+
+ slot->flags &= ~NS_BUF_CHANGED;
+ nm_i = nm_next(nm_i, lim);
+ }
+ kring->nr_hwcur = head;
+ }
+ /* tell userspace that there might be new packets. */
+ nm_rxsync_finalize(kring);
+ IFRATE(rate_ctx.new.rxsync++);
+
+ return 0;
+}
+
+static void
+generic_netmap_dtor(struct netmap_adapter *na)
+{
+ struct ifnet *ifp = na->ifp;
+ struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na;
+ struct netmap_adapter *prev_na = gna->prev;
+
+ if (prev_na != NULL) {
+ D("Released generic NA %p", gna);
+ if_rele(na->ifp);
+ netmap_adapter_put(prev_na);
+ }
+ if (ifp != NULL) {
+ WNA(ifp) = prev_na;
+ D("Restored native NA %p", prev_na);
+ na->ifp = NULL;
+ }
+}
+
+/*
+ * generic_netmap_attach() makes it possible to use netmap on
+ * a device without native netmap support.
+ * This is less performant than native support but potentially
+ * faster than raw sockets or similar schemes.
+ *
+ * In this "emulated" mode, netmap rings do not necessarily
+ * have the same size as those in the NIC. We use a default
+ * value and possibly override it if the OS has ways to fetch the
+ * actual configuration.
+ */
+int
+generic_netmap_attach(struct ifnet *ifp)
+{
+ struct netmap_adapter *na;
+ struct netmap_generic_adapter *gna;
+ int retval;
+ u_int num_tx_desc, num_rx_desc;
+
+ num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */
+
+ generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc);
+ ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc);
+
+ gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (gna == NULL) {
+ D("no memory on attach, give up");
+ return ENOMEM;
+ }
+ na = (struct netmap_adapter *)gna;
+ na->ifp = ifp;
+ na->num_tx_desc = num_tx_desc;
+ na->num_rx_desc = num_rx_desc;
+ na->nm_register = &generic_netmap_register;
+ na->nm_txsync = &generic_netmap_txsync;
+ na->nm_rxsync = &generic_netmap_rxsync;
+ na->nm_dtor = &generic_netmap_dtor;
+ /* when using generic, IFCAP_NETMAP is set so we force
+ * NAF_SKIP_INTR to use the regular interrupt handler
+ */
+ na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS;
+
+ ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)",
+ ifp->num_tx_queues, ifp->real_num_tx_queues,
+ ifp->tx_queue_len);
+ ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)",
+ ifp->num_rx_queues, ifp->real_num_rx_queues);
+
+ generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
+
+ retval = netmap_attach_common(na);
+ if (retval) {
+ free(gna, M_DEVBUF);
+ }
+
+ return retval;
+}
diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c
new file mode 100644
index 0000000..2606b13
--- /dev/null
+++ b/sys/dev/netmap/netmap_mbq.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+
+#ifdef linux
+#include "bsd_glue.h"
+#else /* __FreeBSD__ */
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#endif /* __FreeBSD__ */
+
+#include "netmap_mbq.h"
+
+
+static inline void __mbq_init(struct mbq *q)
+{
+ q->head = q->tail = NULL;
+ q->count = 0;
+}
+
+
+void mbq_safe_init(struct mbq *q)
+{
+ mtx_init(&q->lock, "mbq", NULL, MTX_SPIN);
+ __mbq_init(q);
+}
+
+
+void mbq_init(struct mbq *q)
+{
+ __mbq_init(q);
+}
+
+
+static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m)
+{
+ m->m_nextpkt = NULL;
+ if (q->tail) {
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ } else {
+ q->head = q->tail = m;
+ }
+ q->count++;
+}
+
+
+void mbq_safe_enqueue(struct mbq *q, struct mbuf *m)
+{
+ mtx_lock(&q->lock);
+ __mbq_enqueue(q, m);
+ mtx_unlock(&q->lock);
+}
+
+
+void mbq_enqueue(struct mbq *q, struct mbuf *m)
+{
+ __mbq_enqueue(q, m);
+}
+
+
+static inline struct mbuf *__mbq_dequeue(struct mbq *q)
+{
+ struct mbuf *ret = NULL;
+
+ if (q->head) {
+ ret = q->head;
+ q->head = ret->m_nextpkt;
+ if (q->head == NULL) {
+ q->tail = NULL;
+ }
+ q->count--;
+ ret->m_nextpkt = NULL;
+ }
+
+ return ret;
+}
+
+
+struct mbuf *mbq_safe_dequeue(struct mbq *q)
+{
+ struct mbuf *ret;
+
+ mtx_lock(&q->lock);
+ ret = __mbq_dequeue(q);
+ mtx_unlock(&q->lock);
+
+ return ret;
+}
+
+
+struct mbuf *mbq_dequeue(struct mbq *q)
+{
+ return __mbq_dequeue(q);
+}
+
+
+/* XXX seems pointless to have a generic purge */
+static void __mbq_purge(struct mbq *q, int safe)
+{
+ struct mbuf *m;
+
+ for (;;) {
+ m = safe ? mbq_safe_dequeue(q) : mbq_dequeue(q);
+ if (m) {
+ m_freem(m);
+ } else {
+ break;
+ }
+ }
+}
+
+
+void mbq_purge(struct mbq *q)
+{
+ __mbq_purge(q, 0);
+}
+
+
+void mbq_safe_purge(struct mbq *q)
+{
+ __mbq_purge(q, 1);
+}
+
+
+void mbq_safe_destroy(struct mbq *q)
+{
+ mtx_destroy(&q->lock);
+}
+
+
+void mbq_destroy(struct mbq *q)
+{
+}
diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h
new file mode 100644
index 0000000..d273d8a
--- /dev/null
+++ b/sys/dev/netmap/netmap_mbq.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+
+#ifndef __NETMAP_MBQ_H__
+#define __NETMAP_MBQ_H__
+
+/*
+ * These functions implement an mbuf tailq with an optional lock.
+ * The base functions act ONLY ON THE QUEUE, whereas the "safe"
+ * variants (mbq_safe_*) also handle the lock.
+ */
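+/*
+ * Typical usage of the locked variants (illustrative sketch only):
+ *
+ *	struct mbq q;
+ *	mbq_safe_init(&q);		// also initializes the spinlock
+ *	mbq_safe_enqueue(&q, m);	// producer, e.g. the rx interceptor
+ *	m = mbq_safe_dequeue(&q);	// consumer, e.g. rxsync
+ *	mbq_safe_purge(&q);		// free anything still queued
+ *	mbq_safe_destroy(&q);		// release the spinlock
+ */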
+
+/* XXX probably rely on a previous definition of SPINLOCK_T */
+#ifdef linux
+#define SPINLOCK_T safe_spinlock_t
+#else
+#define SPINLOCK_T struct mtx
+#endif
+
+/* A FIFO queue of mbufs with an optional lock. */
+struct mbq {
+ struct mbuf *head;
+ struct mbuf *tail;
+ int count;
+ SPINLOCK_T lock;
+};
+
+/* XXX "destroy" does not match "init" as a name.
+ * We should also clarify whether init can be used while
+ * holding a lock, and whether mbq_safe_destroy() is a NOP.
+ */
+void mbq_init(struct mbq *q);
+void mbq_destroy(struct mbq *q);
+void mbq_enqueue(struct mbq *q, struct mbuf *m);
+struct mbuf *mbq_dequeue(struct mbq *q);
+void mbq_purge(struct mbq *q);
+
+/* XXX missing mbq_lock() and mbq_unlock */
+
+void mbq_safe_init(struct mbq *q);
+void mbq_safe_destroy(struct mbq *q);
+void mbq_safe_enqueue(struct mbq *q, struct mbuf *m);
+struct mbuf *mbq_safe_dequeue(struct mbq *q);
+void mbq_safe_purge(struct mbq *q);
+
+static inline unsigned int mbq_len(struct mbq *q)
+{
+ return q->count;
+}
+
+#endif /* __NETMAP_MBQ_H__ */
diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h
new file mode 100644
index 0000000..e83616a
--- /dev/null
+++ b/sys/dev/netmap/netmap_mem2.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * (New) memory allocator for netmap
+ */
+
+/*
+ * This allocator creates three memory pools:
+ * nm_if_pool for the struct netmap_if
+ * nm_ring_pool for the struct netmap_ring
+ * nm_buf_pool for the packet buffers.
+ *
+ * Each pool contains netmap objects and is made of a number of clusters,
+ * multiple of a page size, each containing an integer number of objects.
+ * The clusters are contiguous in user space but not in the kernel.
+ * Only nm_buf_pool needs to be dma-able,
+ * but for convenience use the same type of allocator for all.
+ *
+ * Once mapped, the three pools are exported to userspace
+ * as a contiguous block, starting from nm_if_pool. Each
+ * cluster (and pool) is an integral number of pages.
+ * [ . . . ][ . . . . . .][ . . . . . . . . . .]
+ * nm_if nm_ring nm_buf
+ *
+ * The userspace areas contain offsets of the objects in userspace.
+ * When (at init time) we write these offsets, we find out the index
+ * of the object, and from there locate the offset from the beginning
+ * of the region.
+ *
+ * The individual allocators manage a pool of memory for objects of
+ * the same size.
+ * The pool is split into smaller clusters, whose size is a
+ * multiple of the page size. The cluster size is chosen
+ * to minimize the waste for a given max cluster size
+ * (we do it by brute force, as we have relatively few objects
+ * per cluster).
+ *
+ * Objects are aligned to the cache line (64 bytes) rounding up object
+ * sizes when needed. A bitmap contains the state of each object.
+ * Allocation scans the bitmap; this is done only on attach, so we are not
+ * too worried about performance
+ *
+ * For each allocator we can define (through sysctl) the size and
+ * number of each object. Memory is allocated at the first use of a
+ * netmap file descriptor, and can be freed when all such descriptors
+ * have been released (including unmapping the memory).
+ * If memory is scarce, the system tries to get as much as possible
+ * and the sysctl values reflect the actual allocation.
+ * Together with the desired values, the sysctls also export the
+ * absolute minimum and maximum values, which cannot be overridden.
+ *
+ * struct netmap_if:
+ * variable size, max 16 bytes per ring pair plus some fixed amount.
+ * 1024 bytes should be large enough in practice.
+ *
+ * In the worst case we have one netmap_if per ring in the system.
+ *
+ * struct netmap_ring
+ * variable size, 8 bytes per slot plus some fixed amount.
+ * Rings can be large (e.g. 4k slots, or >32Kbytes).
+ * We default to 36 KB (9 pages), and a few hundred rings.
+ *
+ * struct netmap_buffer
+ * The more the better, both because fast interfaces tend to have
+ * many slots, and because we may want to use buffers to store
+ * packets in userspace avoiding copies.
+ * Must contain a full frame (eg 1518, or more for vlans, jumbo
+ * frames etc.) plus be nicely aligned, plus some NICs restrict
+ * the size to multiple of 1K or so. Default to 2K
+ */
+#ifndef _NET_NETMAP_MEM2_H_
+#define _NET_NETMAP_MEM2_H_
+
+
+#define NETMAP_BUF_MAX_NUM 20*4096*2 /* large machine */
+
+#define NETMAP_POOL_MAX_NAMSZ 32
+
+
+enum {
+ NETMAP_IF_POOL = 0,
+ NETMAP_RING_POOL,
+ NETMAP_BUF_POOL,
+ NETMAP_POOLS_NR
+};
+
+
+struct netmap_obj_params {
+ u_int size;
+ u_int num;
+};
+struct netmap_obj_pool {
+ char name[NETMAP_POOL_MAX_NAMSZ]; /* name of the allocator */
+
+ /* ---------------------------------------------------*/
+ /* these are only meaningful if the pool is finalized */
+ /* (see 'finalized' field in netmap_mem_d) */
+ u_int objtotal; /* actual total number of objects. */
+ u_int memtotal; /* actual total memory space */
+ u_int numclusters; /* actual number of clusters */
+
+ u_int objfree; /* number of free objects. */
+
+ struct lut_entry *lut; /* virt,phys addresses, objtotal entries */
+ uint32_t *bitmap; /* one bit per buffer, 1 means free */
+ uint32_t bitmap_slots; /* number of uint32 entries in bitmap */
+ /* ---------------------------------------------------*/
+
+ /* limits */
+ u_int objminsize; /* minimum object size */
+ u_int objmaxsize; /* maximum object size */
+ u_int nummin; /* minimum number of objects */
+ u_int nummax; /* maximum number of objects */
+
+ /* these are changed only by config */
+ u_int _objtotal; /* total number of objects */
+ u_int _objsize; /* object size */
+ u_int _clustsize; /* cluster size */
+ u_int _clustentries; /* objects per cluster */
+ u_int _numclusters; /* number of clusters */
+
+ /* requested values */
+ u_int r_objtotal;
+ u_int r_objsize;
+};
+
+#ifdef linux
+// XXX a mtx would suffice here 20130415 lr
+#define NMA_LOCK_T struct semaphore
+#else /* !linux */
+#define NMA_LOCK_T struct mtx
+#endif /* linux */
+
+typedef int (*netmap_mem_config_t)(struct netmap_mem_d*);
+typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*);
+typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*);
+
+typedef uint16_t nm_memid_t;
+
+/* We implement two kinds of netmap_mem_d structures:
+ *
+ * - global: used by hardware NICs;
+ *
+ * - private: used by VALE ports.
+ *
+ * In both cases, the netmap_mem_d structure has the same lifetime as the
+ * netmap_adapter of the corresponding NIC or port. It is the responsibility of
+ * the client code to delete the private allocator when the associated
+ * netmap_adapter is freed (this is implemented by the NAF_MEM_OWNER flag in
+ * netmap.c). The 'refcount' field counts the number of active users of the
+ * structure. The global allocator uses this information to prevent/allow
+ * reconfiguration. The private allocators release all their memory when there
+ * are no active users. By 'active user' we mean an existing netmap_priv
+ * structure holding a reference to the allocator.
+ */
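+/*
+ * Illustrative use of a private allocator (sketch only, hypothetical
+ * values): the owner of a VALE port would typically do something like
+ *
+ *	nmd = netmap_mem_private_new(ifname, 1, 1024, 1, 1024, 0, 0, &err);
+ *	...
+ *	netmap_mem_private_delete(nmd);	// when the adapter goes away
+ */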
+struct netmap_mem_d {
+ NMA_LOCK_T nm_mtx; /* protect the allocator */
+ u_int nm_totalsize; /* shorthand */
+
+ u_int flags;
+#define NETMAP_MEM_FINALIZED 0x1 /* preallocation done */
+#define NETMAP_MEM_PRIVATE 0x2 /* uses private address space */
+ int lasterr; /* last error for curr config */
+ int refcount; /* existing priv structures */
+ /* the three allocators */
+ struct netmap_obj_pool pools[NETMAP_POOLS_NR];
+
+ netmap_mem_config_t config;
+ netmap_mem_finalize_t finalize;
+ netmap_mem_deref_t deref;
+
+ nm_memid_t nm_id; /* allocator identifier */
+
+ /* list of all existing allocators, sorted by nm_id */
+ struct netmap_mem_d *prev, *next;
+};
+
+extern struct netmap_mem_d nm_mem;
+
+vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t);
+int netmap_mem_finalize(struct netmap_mem_d *);
+int netmap_mem_init(void);
+void netmap_mem_fini(void);
+struct netmap_if *
+ netmap_mem_if_new(const char *, struct netmap_adapter *);
+void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *);
+int netmap_mem_rings_create(struct netmap_adapter *);
+void netmap_mem_rings_delete(struct netmap_adapter *);
+void netmap_mem_deref(struct netmap_mem_d *);
+int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id);
+ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr);
+struct netmap_mem_d* netmap_mem_private_new(const char *name,
+ u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes,
+ int* error);
+void netmap_mem_private_delete(struct netmap_mem_d *);
+
+#define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize)
+
+uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n);
+
+
+#endif
diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c
new file mode 100644
index 0000000..a776a24
--- /dev/null
+++ b/sys/dev/netmap/netmap_offloadings.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (C) 2014 Vincenzo Maffione. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h> /* prerequisite */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/sockio.h>
+#include <sys/socketvar.h> /* struct socket */
+#include <sys/socket.h> /* sockaddrs */
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <sys/endian.h>
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+
+
+
+/* This routine is called by bdg_mismatch_datapath() when it finishes
+ * accumulating bytes for a segment, in order to fix some fields in the
+ * segment headers (which still contain the same content as the header
+ * of the original GSO packet). 'buf' points to the beginning (e.g.
+ * the ethernet header) of the segment, and 'len' is its length.
+ */
+static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx,
+ u_int segmented_bytes, u_int last_segment,
+ u_int tcp, u_int iphlen)
+{
+ struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14);
+ struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14);
+ uint16_t *check = NULL;
+ uint8_t *check_data = NULL;
+
+ if (iphlen == 20) {
+ /* Set the IPv4 "Total Length" field. */
+ iph->tot_len = htobe16(len-14);
+		ND("ip total length %u", be16toh(iph->tot_len));
+
+ /* Set the IPv4 "Identification" field. */
+ iph->id = htobe16(be16toh(iph->id) + idx);
+ ND("ip identification %u", be16toh(iph->id));
+
+ /* Compute and insert the IPv4 header checksum. */
+ iph->check = 0;
+ iph->check = nm_csum_ipv4(iph);
+ ND("IP csum %x", be16toh(iph->check));
+ } else {/* if (iphlen == 40) */
+ /* Set the IPv6 "Payload Len" field. */
+ ip6h->payload_len = htobe16(len-14-iphlen);
+ }
+
+ if (tcp) {
+ struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen);
+
+ /* Set the TCP sequence number. */
+ tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes);
+ ND("tcp seq %u", be32toh(tcph->seq));
+
+ /* Zero the PSH and FIN TCP flags if this is not the last
+ segment. */
+ if (!last_segment)
+ tcph->flags &= ~(0x8 | 0x1);
+ ND("last_segment %u", last_segment);
+
+ check = &tcph->check;
+ check_data = (uint8_t *)tcph;
+ } else { /* UDP */
+ struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen);
+
+ /* Set the UDP 'Length' field. */
+ udph->len = htobe16(len-14-iphlen);
+
+ check = &udph->check;
+ check_data = (uint8_t *)udph;
+ }
+
+ /* Compute and insert TCP/UDP checksum. */
+ *check = 0;
+ if (iphlen == 20)
+ nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check);
+ else
+ nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check);
+
+ ND("TCP/UDP csum %x", be16toh(*check));
+}
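+/*
+ * Worked example of the fixups above (illustrative numbers): for an
+ * IPv4/TCP GSO packet segmented into full frames of len = 1514 bytes
+ * (14 Ethernet + 20 IP + 20 TCP with no options + 1460 payload), each
+ * segment gets tot_len = 1514 - 14 = 1500; the second segment
+ * (idx == 1, segmented_bytes == 1460) has its IP id incremented by 1
+ * and its TCP sequence number advanced by 1460, and PSH/FIN are
+ * cleared on every segment but the last.
+ */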
+
+
+/* The VALE mismatch datapath implementation. */
+void bdg_mismatch_datapath(struct netmap_vp_adapter *na,
+ struct netmap_vp_adapter *dst_na,
+ struct nm_bdg_fwd *ft_p, struct netmap_ring *ring,
+ u_int *j, u_int lim, u_int *howmany)
+{
+ struct netmap_slot *slot = NULL;
+ struct nm_vnet_hdr *vh = NULL;
+ /* Number of source slots to process. */
+ u_int frags = ft_p->ft_frags;
+ struct nm_bdg_fwd *ft_end = ft_p + frags;
+
+ /* Source and destination pointers. */
+ uint8_t *dst, *src;
+ size_t src_len, dst_len;
+
+ u_int j_start = *j;
+ u_int dst_slots = 0;
+
+	/* If the source port uses offloadings while the destination doesn't,
+ * we grab the source virtio-net header and do the offloadings here.
+ */
+ if (na->virt_hdr_len && !dst_na->virt_hdr_len) {
+ vh = (struct nm_vnet_hdr *)ft_p->ft_buf;
+ }
+
+ /* Init source and dest pointers. */
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ slot = &ring->slot[*j];
+ dst = BDG_NMB(&dst_na->up, slot);
+ dst_len = src_len;
+
+ /* We are processing the first input slot and there is a mismatch
+ * between source and destination virt_hdr_len (SHL and DHL).
+	 * When a client is using virtio-net headers, the header length
+ * can be:
+ * - 10: the header corresponds to the struct nm_vnet_hdr
+ * - 12: the first 10 bytes correspond to the struct
+ * virtio_net_hdr, and the last 2 bytes store the
+ * "mergeable buffers" info, which is an optional
+	 *       hint that can be zeroed for compatibility
+ *
+ * The destination header is therefore built according to the
+ * following table:
+ *
+ * SHL | DHL | destination header
+ * -----------------------------
+ * 0 | 10 | zero
+ * 0 | 12 | zero
+ * 10 | 0 | doesn't exist
+ * 10 | 12 | first 10 bytes are copied from source header, last 2 are zero
+ * 12 | 0 | doesn't exist
+ * 12 | 10 | copied from the first 10 bytes of source header
+ */
+ bzero(dst, dst_na->virt_hdr_len);
+ if (na->virt_hdr_len && dst_na->virt_hdr_len)
+ memcpy(dst, src, sizeof(struct nm_vnet_hdr));
+ /* Skip the virtio-net headers. */
+ src += na->virt_hdr_len;
+ src_len -= na->virt_hdr_len;
+ dst += dst_na->virt_hdr_len;
+ dst_len = dst_na->virt_hdr_len + src_len;
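+	/* Example (from the table above): with SHL == 12 and DHL == 10,
+	 * the first 10 bytes of the source header are copied, then src
+	 * advances by 12 and dst by 10; with SHL == 10 and DHL == 12,
+	 * the last 2 destination bytes stay zero from the bzero() above.
+	 */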
+
+	/* At this point dst_len may be 0 (which implies src_len == 0),
+	 * so we avoid passing a zero-length fragment.
+ */
+ if (dst_len == 0) {
+ ft_p++;
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ dst_len = src_len;
+ }
+
+ if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ u_int gso_bytes = 0;
+ /* Length of the GSO packet header. */
+ u_int gso_hdr_len = 0;
+ /* Pointer to the GSO packet header. Assume it is in a single fragment. */
+ uint8_t *gso_hdr = NULL;
+ /* Index of the current segment. */
+ u_int gso_idx = 0;
+ /* Payload data bytes segmented so far (e.g. TCP data bytes). */
+ u_int segmented_bytes = 0;
+ /* Length of the IP header (20 if IPv4, 40 if IPv6). */
+ u_int iphlen = 0;
+		/* Is this a TCP or a UDP GSO packet? */
+ u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN)
+ == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1;
+
+ /* Segment the GSO packet contained into the input slots (frags). */
+ while (ft_p != ft_end) {
+ size_t copy;
+
+ /* Grab the GSO header if we don't have it. */
+ if (!gso_hdr) {
+ uint16_t ethertype;
+
+ gso_hdr = src;
+
+ /* Look at the 'Ethertype' field to see if this packet
+ * is IPv4 or IPv6.
+ */
+ ethertype = be16toh(*((uint16_t *)(gso_hdr + 12)));
+ if (ethertype == 0x0800)
+ iphlen = 20;
+ else /* if (ethertype == 0x86DD) */
+ iphlen = 40;
+ ND(3, "type=%04x", ethertype);
+
+ /* Compute gso_hdr_len. For TCP we need to read the
+ * content of the 'Data Offset' field.
+ */
+ if (tcp) {
+ struct nm_tcphdr *tcph =
+ (struct nm_tcphdr *)&gso_hdr[14+iphlen];
+
+ gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4);
+ } else
+ gso_hdr_len = 14 + iphlen + 8; /* UDP */
+
+ ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len,
+ dst_na->mfs);
+
+ /* Advance source pointers. */
+ src += gso_hdr_len;
+ src_len -= gso_hdr_len;
+ if (src_len == 0) {
+ ft_p++;
+ if (ft_p == ft_end)
+ break;
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ continue;
+ }
+ }
+
+ /* Fill in the header of the current segment. */
+ if (gso_bytes == 0) {
+ memcpy(dst, gso_hdr, gso_hdr_len);
+ gso_bytes = gso_hdr_len;
+ }
+
+ /* Fill in data and update source and dest pointers. */
+ copy = src_len;
+ if (gso_bytes + copy > dst_na->mfs)
+ copy = dst_na->mfs - gso_bytes;
+ memcpy(dst + gso_bytes, src, copy);
+ gso_bytes += copy;
+ src += copy;
+ src_len -= copy;
+
+		/* A segment is complete or we have processed all the
+		   GSO payload bytes. */
+ if (gso_bytes >= dst_na->mfs ||
+ (src_len == 0 && ft_p + 1 == ft_end)) {
+ /* After raw segmentation, we must fix some header
+ * fields and compute checksums, in a protocol dependent
+ * way. */
+ gso_fix_segment(dst, gso_bytes, gso_idx,
+ segmented_bytes,
+ src_len == 0 && ft_p + 1 == ft_end,
+ tcp, iphlen);
+
+ ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes);
+ slot->len = gso_bytes;
+ slot->flags = 0;
+ segmented_bytes += gso_bytes - gso_hdr_len;
+
+ dst_slots++;
+
+ /* Next destination slot. */
+ *j = nm_next(*j, lim);
+ slot = &ring->slot[*j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ gso_bytes = 0;
+ gso_idx++;
+ }
+
+ /* Next input slot. */
+ if (src_len == 0) {
+ ft_p++;
+ if (ft_p == ft_end)
+ break;
+ src = ft_p->ft_buf;
+ src_len = ft_p->ft_len;
+ }
+ }
+ ND(3, "%d bytes segmented", segmented_bytes);
+
+ } else {
+ /* Address of a checksum field into a destination slot. */
+ uint16_t *check = NULL;
+ /* Accumulator for an unfolded checksum. */
+ rawsum_t csum = 0;
+
+ /* Process a non-GSO packet. */
+
+ /* Init 'check' if necessary. */
+ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ if (unlikely(vh->csum_offset + vh->csum_start > src_len))
+ D("invalid checksum request");
+ else
+ check = (uint16_t *)(dst + vh->csum_start +
+ vh->csum_offset);
+ }
+
+ while (ft_p != ft_end) {
+ /* Init/update the packet checksum if needed. */
+ if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ if (!dst_slots)
+ csum = nm_csum_raw(src + vh->csum_start,
+ src_len - vh->csum_start, 0);
+ else
+ csum = nm_csum_raw(src, src_len, csum);
+ }
+
+ /* Round to a multiple of 64 */
+ src_len = (src_len + 63) & ~63;
+
+ if (ft_p->ft_flags & NS_INDIRECT) {
+ if (copyin(src, dst, src_len)) {
+ /* Invalid user pointer, pretend len is 0. */
+ dst_len = 0;
+ }
+ } else {
+ memcpy(dst, src, (int)src_len);
+ }
+ slot->len = dst_len;
+
+ dst_slots++;
+
+ /* Next destination slot. */
+ *j = nm_next(*j, lim);
+ slot = &ring->slot[*j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ /* Next source slot. */
+ ft_p++;
+ src = ft_p->ft_buf;
+ dst_len = src_len = ft_p->ft_len;
+
+ }
+
+ /* Finalize (fold) the checksum if needed. */
+ if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
+ *check = nm_csum_fold(csum);
+ }
+ ND(3, "using %u dst_slots", dst_slots);
+
+		/* A second pass on the destination slots to set the slot flags,
+ * using the right number of destination slots.
+ */
+ while (j_start != *j) {
+ slot = &ring->slot[j_start];
+			slot->flags = (dst_slots << 8) | NS_MOREFRAG;
+ j_start = nm_next(j_start, lim);
+ }
+ /* Clear NS_MOREFRAG flag on last entry. */
+ slot->flags = (dst_slots << 8);
+ }
+
+ /* Update howmany. */
+ if (unlikely(dst_slots > *howmany)) {
+ dst_slots = *howmany;
+ D("Slot allocation error: Should never happen");
+ }
+ *howmany -= dst_slots;
+}
diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c
new file mode 100644
index 0000000..f8f29fa
--- /dev/null
+++ b/sys/dev/netmap/netmap_pipe.c
@@ -0,0 +1,711 @@
+/*
+ * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* $FreeBSD$ */
+
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h> /* prerequisite */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h> /* sockaddrs */
+#include <net/if.h>
+#include <net/if_var.h>
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <sys/refcount.h>
+
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+/*
+ * common headers
+ */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+#ifdef WITH_PIPES
+
+#define NM_PIPE_MAXSLOTS 4096
+
+int netmap_default_pipes = 0; /* default number of pipes for each nic */
+SYSCTL_DECL(_dev_netmap);
+SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , "");
+
+/* allocate the pipe array in the parent adapter */
+int
+netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr)
+{
+ size_t len;
+ int mode = nmr->nr_flags & NR_REG_MASK;
+ u_int npipes;
+
+ if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) {
+ /* this is for our parent, not for us */
+ return 0;
+ }
+
+ /* TODO: we can resize the array if the new
+	 * request can accommodate the already existing pipes
+ */
+ if (na->na_pipes) {
+ nmr->nr_arg1 = na->na_max_pipes;
+ return 0;
+ }
+
+ npipes = nmr->nr_arg1;
+ if (npipes == 0)
+ npipes = netmap_default_pipes;
+ nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL);
+
+ if (npipes == 0) {
+ /* really zero, nothing to alloc */
+ goto out;
+ }
+
+ len = sizeof(struct netmap_pipe_adapter *) * npipes;
+ na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (na->na_pipes == NULL)
+ return ENOMEM;
+
+ na->na_max_pipes = npipes;
+ na->na_next_pipe = 0;
+
+out:
+ nmr->nr_arg1 = npipes;
+
+ return 0;
+}
+
+/* deallocate the parent array in the parent adapter */
+void
+netmap_pipe_dealloc(struct netmap_adapter *na)
+{
+ if (na->na_pipes) {
+ ND("freeing pipes for %s", NM_IFPNAME(na->ifp));
+ free(na->na_pipes, M_DEVBUF);
+ na->na_pipes = NULL;
+ na->na_max_pipes = 0;
+ na->na_next_pipe = 0;
+ }
+}
+
+/* find a pipe endpoint with the given id among the parent's pipes */
+static struct netmap_pipe_adapter *
+netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id)
+{
+ int i;
+ struct netmap_pipe_adapter *na;
+
+ for (i = 0; i < parent->na_next_pipe; i++) {
+ na = parent->na_pipes[i];
+ if (na->id == pipe_id) {
+ return na;
+ }
+ }
+ return NULL;
+}
+
+/* add a new pipe endpoint to the parent array */
+static int
+netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
+{
+ if (parent->na_next_pipe >= parent->na_max_pipes) {
+ D("%s: no space left for pipes", NM_IFPNAME(parent->ifp));
+ return ENOMEM;
+ }
+
+ parent->na_pipes[parent->na_next_pipe] = na;
+ na->parent_slot = parent->na_next_pipe;
+ parent->na_next_pipe++;
+ return 0;
+}
+
+/* remove the given pipe endpoint from the parent array */
+static void
+netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na)
+{
+ u_int n;
+ n = --parent->na_next_pipe;
+ if (n != na->parent_slot) {
+ parent->na_pipes[na->parent_slot] =
+ parent->na_pipes[n];
+ }
+ parent->na_pipes[n] = NULL;
+}
+
+static int
+netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *txkring = na->tx_rings + ring_nr,
+ *rxkring = txkring->pipe;
+ u_int limit; /* slots to transfer */
+ u_int j, k, lim_tx = txkring->nkr_num_slots - 1,
+ lim_rx = rxkring->nkr_num_slots - 1;
+ int m, busy;
+
+ ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name);
+ ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail,
+ txkring->rcur, txkring->rhead, txkring->rtail);
+
+ j = rxkring->nr_hwtail; /* RX */
+ k = txkring->nr_hwcur; /* TX */
+ m = txkring->rhead - txkring->nr_hwcur; /* new slots */
+ if (m < 0)
+ m += txkring->nkr_num_slots;
+ limit = m;
+ m = rxkring->nkr_num_slots - 1; /* max avail space on destination */
+ busy = j - rxkring->nr_hwcur; /* busy slots */
+ if (busy < 0)
+ busy += txkring->nkr_num_slots;
+ m -= busy; /* subtract busy slots */
+ ND(2, "m %d limit %d", m, limit);
+ if (m < limit)
+ limit = m;
+
+ if (limit == 0) {
+ /* either the rxring is full, or nothing to send */
+ nm_txsync_finalize(txkring); /* actually useless */
+ return 0;
+ }
+
+ while (limit-- > 0) {
+ struct netmap_slot *rs = &rxkring->save_ring->slot[j];
+ struct netmap_slot *ts = &txkring->ring->slot[k];
+ struct netmap_slot tmp;
+
+ /* swap the slots */
+ tmp = *rs;
+ *rs = *ts;
+ *ts = tmp;
+
+ /* no need to report the buffer change */
+
+ j = nm_next(j, lim_rx);
+ k = nm_next(k, lim_tx);
+ }
+
+ wmb(); /* make sure the slots are updated before publishing them */
+ rxkring->nr_hwtail = j;
+ txkring->nr_hwcur = k;
+ txkring->nr_hwtail = nm_prev(k, lim_tx);
+
+ nm_txsync_finalize(txkring);
+ ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail,
+ txkring->rcur, txkring->rhead, txkring->rtail, j);
+
+ wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */
+ rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0);
+
+ return 0;
+}
+
+static int
+netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *rxkring = na->rx_rings + ring_nr,
+ *txkring = rxkring->pipe;
+ uint32_t oldhwcur = rxkring->nr_hwcur;
+
+ ND("%s %x <- %s", rxkring->name, flags, txkring->name);
+	rxkring->nr_hwcur = rxkring->rhead; /* recover user-released slots */
+ ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail,
+ rxkring->rcur, rxkring->rhead, rxkring->rtail);
+ rmb(); /* paired with the first wmb() in txsync */
+ nm_rxsync_finalize(rxkring);
+
+ if (oldhwcur != rxkring->nr_hwcur) {
+ /* we have released some slots, notify the other end */
+ wmb(); /* make sure nr_hwcur is updated before notifying */
+ txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0);
+ }
+ return 0;
+}
+
+/* Pipe endpoints are created and destroyed together, so that endpoints do not
+ * have to check for the existence of their peer at each ?xsync.
+ *
+ * To play well with the existing netmap infrastructure (refcounts etc.), we
+ * adopt the following strategy:
+ *
+ * 1) The first endpoint that is created also creates the other endpoint and
+ * grabs a reference to it.
+ *
+ * state A) user1 --> endpoint1 --> endpoint2
+ *
+ * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives
+ * its reference to the user:
+ *
+ * state B) user1 --> endpoint1 endpoint2 <--- user2
+ *
+ * 3) Assume that, starting from state B endpoint2 is closed. In the unregister
+ * callback endpoint2 notes that endpoint1 is still active and adds a reference
+ * from endpoint1 to itself. When user2 then releases her own reference,
+ * endpoint2 is not destroyed and we are back to state A. A symmetrical state
+ * would be reached if endpoint1 were released instead.
+ *
+ * 4) If, starting from state A, endpoint1 is closed, the destructor notes that
+ * it owns a reference to endpoint2 and releases it.
+ *
+ * Something similar goes on for the creation and destruction of the krings.
+ */
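+/*
+ * Example sequence (following the states above): user1 opens one
+ * endpoint, so both endpoints are created and e1 holds a reference to
+ * e2 (state A); user2 then opens e2 and e1 hands that reference over
+ * (state B); if user2 now closes, e2 takes a reference from e1 and
+ * survives, back to state A; when user1 finally closes, e1's
+ * destructor drops the reference it owns and both endpoints go away.
+ */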
+
+
+/* netmap_pipe_krings_create.
+ *
+ * There are two cases:
+ *
+ * 1) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1. We have to create both sets
+ * of krings.
+ *
+ * 2) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e2. e1 is certainly registered and our
+ * krings already exist, but they may be hidden.
+ */
+static int
+netmap_pipe_krings_create(struct netmap_adapter *na)
+{
+ struct netmap_pipe_adapter *pna =
+ (struct netmap_pipe_adapter *)na;
+ struct netmap_adapter *ona = &pna->peer->up;
+ int error = 0;
+ if (pna->peer_ref) {
+ int i;
+
+ /* case 1) above */
+ D("%p: case 1, create everything", na);
+ error = netmap_krings_create(na, 0);
+ if (error)
+ goto err;
+
+ /* we also create all the rings, since we need to
+ * update the save_ring pointers.
+ * netmap_mem_rings_create (called by our caller)
+ * will not create the rings again
+ */
+
+ error = netmap_mem_rings_create(na);
+ if (error)
+ goto del_krings1;
+
+ /* update our hidden ring pointers */
+ for (i = 0; i < na->num_tx_rings + 1; i++)
+ na->tx_rings[i].save_ring = na->tx_rings[i].ring;
+ for (i = 0; i < na->num_rx_rings + 1; i++)
+ na->rx_rings[i].save_ring = na->rx_rings[i].ring;
+
+ /* now, create krings and rings of the other end */
+ error = netmap_krings_create(ona, 0);
+ if (error)
+ goto del_rings1;
+
+ error = netmap_mem_rings_create(ona);
+ if (error)
+ goto del_krings2;
+
+ for (i = 0; i < ona->num_tx_rings + 1; i++)
+ ona->tx_rings[i].save_ring = ona->tx_rings[i].ring;
+ for (i = 0; i < ona->num_rx_rings + 1; i++)
+ ona->rx_rings[i].save_ring = ona->rx_rings[i].ring;
+
+ /* cross link the krings */
+ for (i = 0; i < na->num_tx_rings; i++) {
+ na->tx_rings[i].pipe = pna->peer->up.rx_rings + i;
+ na->rx_rings[i].pipe = pna->peer->up.tx_rings + i;
+ pna->peer->up.tx_rings[i].pipe = na->rx_rings + i;
+ pna->peer->up.rx_rings[i].pipe = na->tx_rings + i;
+ }
+ } else {
+ int i;
+ /* case 2) above */
+ /* recover the hidden rings */
+ ND("%p: case 2, hidden rings", na);
+ for (i = 0; i < na->num_tx_rings + 1; i++)
+ na->tx_rings[i].ring = na->tx_rings[i].save_ring;
+ for (i = 0; i < na->num_rx_rings + 1; i++)
+ na->rx_rings[i].ring = na->rx_rings[i].save_ring;
+ }
+ return 0;
+
+del_krings2:
+ netmap_krings_delete(ona);
+del_rings1:
+ netmap_mem_rings_delete(na);
+del_krings1:
+ netmap_krings_delete(na);
+err:
+ return error;
+}
+
+/* netmap_pipe_reg.
+ *
+ * There are two cases on registration (onoff==1)
+ *
+ * 1.a) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1. Nothing special to do.
+ *
+ * 1.b) state is
+ *
+ * usr1 --> e1 --> e2 <-- usr2
+ *
+ * and we are e2. Drop the ref e1 is holding.
+ *
+ * There are two additional cases on unregister (onoff==0)
+ *
+ * 2.a) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1. Nothing special to do, e2 will
+ * be cleaned up by the destructor of e1.
+ *
+ * 2.b) state is
+ *
+ * usr1 --> e1 e2 <-- usr2
+ *
+ * and we are either e1 or e2. Add a ref from the
+ * other end and hide our rings.
+ */
+static int
+netmap_pipe_reg(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_pipe_adapter *pna =
+ (struct netmap_pipe_adapter *)na;
+ struct ifnet *ifp = na->ifp;
+ ND("%p: onoff %d", na, onoff);
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+ } else {
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ }
+ if (pna->peer_ref) {
+ ND("%p: case 1.a or 2.a, nothing to do", na);
+ return 0;
+ }
+ if (onoff) {
+ ND("%p: case 1.b, drop peer", na);
+ pna->peer->peer_ref = 0;
+ netmap_adapter_put(na);
+ } else {
+ int i;
+ ND("%p: case 2.b, grab peer", na);
+ netmap_adapter_get(na);
+ pna->peer->peer_ref = 1;
+ /* hide our rings from netmap_mem_rings_delete */
+ for (i = 0; i < na->num_tx_rings + 1; i++) {
+ na->tx_rings[i].ring = NULL;
+ }
+ for (i = 0; i < na->num_rx_rings + 1; i++) {
+ na->rx_rings[i].ring = NULL;
+ }
+ }
+ return 0;
+}
+
+/* netmap_pipe_krings_delete.
+ *
+ * There are two cases:
+ *
+ * 1) state is
+ *
+ * usr1 --> e1 --> e2
+ *
+ * and we are e1 (e2 is not registered, so krings_delete cannot be
+ * called on it);
+ *
+ * 2) state is
+ *
+ * usr1 --> e1 e2 <-- usr2
+ *
+ * and we are either e1 or e2.
+ *
+ * In the former case we have to also delete the krings of e2;
+ * in the latter case we do nothing (note that our krings
+ * have already been hidden in the unregister callback).
+ */
+static void
+netmap_pipe_krings_delete(struct netmap_adapter *na)
+{
+ struct netmap_pipe_adapter *pna =
+ (struct netmap_pipe_adapter *)na;
+ struct netmap_adapter *ona; /* na of the other end */
+ int i;
+
+ if (!pna->peer_ref) {
+ ND("%p: case 2, kept alive by peer", na);
+ return;
+ }
+ /* case 1) above */
+	ND("%p: case 1, deleting everything", na);
+ netmap_krings_delete(na); /* also zeroes tx_rings etc. */
+ /* restore the ring to be deleted on the peer */
+ ona = &pna->peer->up;
+ if (ona->tx_rings == NULL) {
+		/* already deleted, we must be on a
+ * cleanup-after-error path */
+ return;
+ }
+ for (i = 0; i < ona->num_tx_rings + 1; i++)
+ ona->tx_rings[i].ring = ona->tx_rings[i].save_ring;
+ for (i = 0; i < ona->num_rx_rings + 1; i++)
+ ona->rx_rings[i].ring = ona->rx_rings[i].save_ring;
+ netmap_mem_rings_delete(ona);
+ netmap_krings_delete(ona);
+}
+
+
+static void
+netmap_pipe_dtor(struct netmap_adapter *na)
+{
+ struct netmap_pipe_adapter *pna =
+ (struct netmap_pipe_adapter *)na;
+ ND("%p", na);
+ if (pna->peer_ref) {
+ ND("%p: clean up peer", na);
+ pna->peer_ref = 0;
+ netmap_adapter_put(&pna->peer->up);
+ }
+ if (pna->role == NR_REG_PIPE_MASTER)
+ netmap_pipe_remove(pna->parent, pna);
+ netmap_adapter_put(pna->parent);
+ free(na->ifp, M_DEVBUF);
+ na->ifp = NULL;
+ pna->parent = NULL;
+}
+
+int
+netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+ struct nmreq pnmr;
+ struct netmap_adapter *pna; /* parent adapter */
+ struct netmap_pipe_adapter *mna, *sna, *req;
+ struct ifnet *ifp, *ifp2;
+ u_int pipe_id;
+ int role = nmr->nr_flags & NR_REG_MASK;
+ int error;
+
+ ND("flags %x", nmr->nr_flags);
+
+ if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) {
+ ND("not a pipe");
+ return 0;
+ }
+ role = nmr->nr_flags & NR_REG_MASK;
+
+ /* first, try to find the parent adapter */
+ bzero(&pnmr, sizeof(pnmr));
+ memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ);
+ /* pass to parent the requested number of pipes */
+ pnmr.nr_arg1 = nmr->nr_arg1;
+ error = netmap_get_na(&pnmr, &pna, create);
+ if (error) {
+ ND("parent lookup failed: %d", error);
+ return error;
+ }
+ ND("found parent: %s", NM_IFPNAME(pna->ifp));
+
+ if (NETMAP_OWNED_BY_KERN(pna)) {
+ ND("parent busy");
+ error = EBUSY;
+ goto put_out;
+ }
+
+ /* next, lookup the pipe id in the parent list */
+ req = NULL;
+ pipe_id = nmr->nr_ringid & NETMAP_RING_MASK;
+ mna = netmap_pipe_find(pna, pipe_id);
+ if (mna) {
+ if (mna->role == role) {
+ ND("found %d directly at %d", pipe_id, mna->parent_slot);
+ req = mna;
+ } else {
+ ND("found %d indirectly at %d", pipe_id, mna->parent_slot);
+ req = mna->peer;
+ }
+ /* the pipe we have found already holds a ref to the parent,
+ * so we need to drop the one we got from netmap_get_na()
+ */
+ netmap_adapter_put(pna);
+ goto found;
+ }
+ ND("pipe %d not found, create %d", pipe_id, create);
+ if (!create) {
+ error = ENODEV;
+ goto put_out;
+ }
+ /* we create both master and slave.
+ * The endpoint we were asked for holds a reference to
+ * the other one.
+ */
+ ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!ifp) {
+ error = ENOMEM;
+ goto put_out;
+ }
+ strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp));
+
+ mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (mna == NULL) {
+ error = ENOMEM;
+ goto free_ifp;
+ }
+ mna->up.ifp = ifp;
+
+ mna->id = pipe_id;
+ mna->role = NR_REG_PIPE_MASTER;
+ mna->parent = pna;
+
+ mna->up.nm_txsync = netmap_pipe_txsync;
+ mna->up.nm_rxsync = netmap_pipe_rxsync;
+ mna->up.nm_register = netmap_pipe_reg;
+ mna->up.nm_dtor = netmap_pipe_dtor;
+ mna->up.nm_krings_create = netmap_pipe_krings_create;
+ mna->up.nm_krings_delete = netmap_pipe_krings_delete;
+ mna->up.nm_mem = pna->nm_mem;
+ mna->up.na_lut = pna->na_lut;
+ mna->up.na_lut_objtotal = pna->na_lut_objtotal;
+
+ mna->up.num_tx_rings = 1;
+ mna->up.num_rx_rings = 1;
+ mna->up.num_tx_desc = nmr->nr_tx_slots;
+ nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc,
+ 1, NM_PIPE_MAXSLOTS, NULL);
+ mna->up.num_rx_desc = nmr->nr_rx_slots;
+ nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc,
+ 1, NM_PIPE_MAXSLOTS, NULL);
+ error = netmap_attach_common(&mna->up);
+ if (error)
+ goto free_ifp;
+ /* register the master with the parent */
+ error = netmap_pipe_add(pna, mna);
+ if (error)
+ goto free_mna;
+
+ /* create the slave */
+	ifp2 = malloc(sizeof(*ifp2), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp2) {
+ error = ENOMEM;
+ goto free_mna;
+ }
+ strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp));
+
+ sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sna == NULL) {
+ error = ENOMEM;
+ goto free_ifp2;
+ }
+ /* most fields are the same, copy from master and then fix */
+ *sna = *mna;
+ sna->up.ifp = ifp2;
+ sna->role = NR_REG_PIPE_SLAVE;
+ error = netmap_attach_common(&sna->up);
+ if (error)
+ goto free_sna;
+
+ /* join the two endpoints */
+ mna->peer = sna;
+ sna->peer = mna;
+
+ /* we already have a reference to the parent, but we
+ * need another one for the other endpoint we created
+ */
+ netmap_adapter_get(pna);
+
+ if (role == NR_REG_PIPE_MASTER) {
+ req = mna;
+ mna->peer_ref = 1;
+ netmap_adapter_get(&sna->up);
+ } else {
+ req = sna;
+ sna->peer_ref = 1;
+ netmap_adapter_get(&mna->up);
+ }
+ ND("created master %p and slave %p", mna, sna);
+found:
+
+ ND("pipe %d %s at %p", pipe_id,
+ (req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req);
+ *na = &req->up;
+ netmap_adapter_get(*na);
+
+ /* write the configuration back */
+ nmr->nr_tx_rings = req->up.num_tx_rings;
+ nmr->nr_rx_rings = req->up.num_rx_rings;
+ nmr->nr_tx_slots = req->up.num_tx_desc;
+ nmr->nr_rx_slots = req->up.num_rx_desc;
+
+ /* keep the reference to the parent.
+ * It will be released by the req destructor
+ */
+
+ return 0;
+
+free_sna:
+ free(sna, M_DEVBUF);
+free_ifp2:
+ free(ifp2, M_DEVBUF);
+free_mna:
+ free(mna, M_DEVBUF);
+free_ifp:
+ free(ifp, M_DEVBUF);
+put_out:
+ netmap_adapter_put(pna);
+ return error;
+}
+
+
+#endif /* WITH_PIPES */
diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c
new file mode 100644
index 0000000..34e3912
--- /dev/null
+++ b/sys/dev/netmap/netmap_vale.c
@@ -0,0 +1,2103 @@
+/*
+ * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+/*
+ * This module implements the VALE switch for netmap
+
+--- VALE SWITCH ---
+
+NMG_LOCK() serializes all modifications to switches and ports.
+A switch cannot be deleted until all ports are gone.
+
+For each switch, an SX lock (RWlock on linux) protects
+deletion of ports. When configuring or deleting a new port, the
+lock is acquired in exclusive mode (after holding NMG_LOCK).
+When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
+The lock is held throughout the entire forwarding cycle,
+during which the thread may incur a page fault.
+Hence it is important that sleepable shared locks are used.
+
+On the rx ring, the per-port lock is grabbed initially to reserve
+a number of slots in the ring, then the lock is released,
+packets are copied from source to destination, and then
+the lock is acquired again and the receive ring is updated.
+(A similar thing is done on the tx ring for NIC and host stack
+ports attached to the switch)
+
+ */
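+/*
+ * Locking pattern sketch (illustrative only, mirroring the code below):
+ *
+ *	// configuration path (attach/detach a port)
+ *	NMG_LOCK();
+ *	BDG_WLOCK(b);
+ *	... update b->bdg_ports[] and b->bdg_port_index[] ...
+ *	BDG_WUNLOCK(b);
+ *	NMG_UNLOCK();
+ *
+ *	// forwarding path (see nm_bdg_preflush() below)
+ *	if (may_sleep)
+ *		BDG_RLOCK(b);
+ *	else if (!BDG_RTRYLOCK(b))
+ *		return;
+ *	... copy packets; the thread may take a page fault here ...
+ *	BDG_RUNLOCK(b);
+ */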
+
+/*
+ * OS-specific code that is used only within this file.
+ * Other OS-specific code that must be accessed by drivers
+ * is present in netmap_kern.h
+ */
+
+#if defined(__FreeBSD__)
+#include <sys/cdefs.h> /* prerequisite */
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/conf.h> /* cdevsw struct, UID, GID */
+#include <sys/sockio.h>
+#include <sys/socketvar.h> /* struct socket */
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h> /* sockaddrs */
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/bpf.h> /* BIOCIMMEDIATE */
+#include <machine/bus.h> /* bus_dmamap_* */
+#include <sys/endian.h>
+#include <sys/refcount.h>
+
+
+#define BDG_RWLOCK_T struct rwlock // struct rwlock
+
+#define BDG_RWINIT(b) \
+ rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
+#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
+#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
+#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
+#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock)
+#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
+#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock)
+
+
+#elif defined(linux)
+
+#include "bsd_glue.h"
+
+#elif defined(__APPLE__)
+
+#warning OSX support is only partial
+#include "osx_glue.h"
+
+#else
+
+#error Unsupported platform
+
+#endif /* unsupported */
+
+/*
+ * common headers
+ */
+
+#include <net/netmap.h>
+#include <dev/netmap/netmap_kern.h>
+#include <dev/netmap/netmap_mem2.h>
+
+#ifdef WITH_VALE
+
+/*
+ * system parameters (most of them in netmap_kern.h)
+ * NM_NAME prefix for switch port names, default "vale"
+ * NM_BDG_MAXPORTS number of ports
+ * NM_BRIDGES max number of switches in the system.
+ * XXX should become a sysctl or tunable
+ *
+ * Switch ports are named valeX:Y where X is the switch name and Y
+ * is the port. If Y matches a physical interface name, the port is
+ * connected to a physical device.
+ *
+ * Unlike physical interfaces, switch ports use their own memory region
+ * for rings and buffers.
+ * The virtual interfaces use per-queue lock instead of core lock.
+ * In the tx loop, we aggregate traffic in batches to make all operations
+ * faster. The batch size is bridge_batch.
+ */
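+/*
+ * For example (per the naming rule above), "vale0:1" is a virtual port
+ * named "1" on switch "vale0", while "vale0:em0" would connect the
+ * physical interface em0 to the same switch ("em0" is only an
+ * illustrative NIC name).
+ */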
+#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
+#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
+#define NM_BRIDGE_RINGSIZE 1024 /* in the device */
+#define NM_BDG_HASH 1024 /* forwarding table entries */
+#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
+#define NM_MULTISEG 64 /* max size of a chain of bufs */
+/* actual size of the tables */
+#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
+/* NM_FT_NULL terminates a list of slots in the ft */
+#define NM_FT_NULL NM_BDG_BATCH_MAX
+#define NM_BRIDGES 8 /* number of bridges */
+
+
+/*
+ * bridge_batch is set via sysctl to the max batch size to be
+ * used in the bridge. The actual value may be larger as the
+ * last packet in the block may overflow the size.
+ */
+int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
+SYSCTL_DECL(_dev_netmap);
+SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
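+/* For example, "sysctl dev.netmap.bridge_batch=512" would lower the batch
+ * size at runtime (tunable name derived from the declaration above; shown
+ * only as an illustration).
+ */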
+
+
+static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
+static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
+static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
+static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
+int kern_netmap_regif(struct nmreq *nmr);
+
+/*
+ * For each output interface, nm_bdg_q is used to construct a list.
+ * bq_len is the number of output buffers (we can have coalescing
+ * during the copy).
+ */
+struct nm_bdg_q {
+ uint16_t bq_head;
+ uint16_t bq_tail;
+ uint32_t bq_len; /* number of buffers */
+};
+
+/* XXX revise this */
+struct nm_hash_ent {
+ uint64_t mac; /* the top 2 bytes are the epoch */
+ uint64_t ports;
+};
+
+/*
+ * nm_bridge is a descriptor for a VALE switch.
+ * Interfaces for a bridge are all in bdg_ports[].
+ * The array has a fixed size; an empty entry does not terminate
+ * the search, but lookups only occur on attach/detach so we
+ * don't mind if they are slow.
+ *
+ * The bridge is non blocking on the transmit ports: excess
+ * packets are dropped if there is no room on the output port.
+ *
+ * bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
+ */
+struct nm_bridge {
+ /* XXX what is the proper alignment/layout ? */
+ BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
+ int bdg_namelen;
+ uint32_t bdg_active_ports; /* 0 means free */
+ char bdg_basename[IFNAMSIZ];
+
+ /* Indexes of active ports (up to active_ports)
+ * and all other remaining ports.
+ */
+ uint8_t bdg_port_index[NM_BDG_MAXPORTS];
+
+ struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
+
+
+ /*
+ * The function to decide the destination port.
+	 * It returns either an index of the destination port,
+ * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
+ * forward this packet. ring_nr is the source ring index, and the
+ * function may overwrite this value to forward this packet to a
+ * different ring index.
+ * This function must be set by netmap_bdgctl().
+ */
+ bdg_lookup_fn_t nm_bdg_lookup;
+
+ /* the forwarding table, MAC+ports.
+ * XXX should be changed to an argument to be passed to
+ * the lookup function, and allocated on attach
+ */
+ struct nm_hash_ent ht[NM_BDG_HASH];
+};
+
+
+/*
+ * XXX in principle nm_bridges could be created dynamically
+ * Right now we have a static array and deletions are protected
+ * by an exclusive lock.
+ */
+struct nm_bridge nm_bridges[NM_BRIDGES];
+
+
+/*
+ * this is a slightly optimized copy routine which rounds
+ * to multiple of 64 bytes and is often faster than dealing
+ * with other odd sizes. We assume there is enough room
+ * in the source and destination buffers.
+ *
+ * XXX only for multiples of 64 bytes, non overlapped.
+ */
+static inline void
+pkt_copy(void *_src, void *_dst, int l)
+{
+ uint64_t *src = _src;
+ uint64_t *dst = _dst;
+ if (unlikely(l >= 1024)) {
+ memcpy(dst, src, l);
+ return;
+ }
+ for (; likely(l > 0); l-=64) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
+}
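+/*
+ * Illustrative use (a sketch, not taken verbatim from a caller in this
+ * file): given an arbitrary packet length l, a caller would round up
+ * to a multiple of 64, e.g.
+ *
+ *	pkt_copy(src, dst, (l + 63) & ~63);
+ *
+ * provided both buffers are large enough for the rounded length.
+ */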
+
+
+/*
+ * locate a bridge among the existing ones.
+ * MUST BE CALLED WITH NMG_LOCK()
+ *
+ * A ':' in the name terminates the bridge name; otherwise the bridge
+ * name is just NM_NAME.
+ * We assume that this is called with a name of at least NM_NAME chars.
+ */
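+/*
+ * Example: with NM_NAME == "vale", a port name like "vale1:p0" selects
+ * (or creates) the bridge named "vale1"; if the name contains no ':',
+ * only the NM_NAME prefix is used as the bridge name.
+ */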
+static struct nm_bridge *
+nm_find_bridge(const char *name, int create)
+{
+ int i, l, namelen;
+ struct nm_bridge *b = NULL;
+
+ NMG_LOCK_ASSERT();
+
+ namelen = strlen(NM_NAME); /* base length */
+ l = name ? strlen(name) : 0; /* actual length */
+ if (l < namelen) {
+ D("invalid bridge name %s", name ? name : NULL);
+ return NULL;
+ }
+ for (i = namelen + 1; i < l; i++) {
+ if (name[i] == ':') {
+ namelen = i;
+ break;
+ }
+ }
+ if (namelen >= IFNAMSIZ)
+ namelen = IFNAMSIZ;
+ ND("--- prefix is '%.*s' ---", namelen, name);
+
+ /* lookup the name, remember empty slot if there is one */
+ for (i = 0; i < NM_BRIDGES; i++) {
+ struct nm_bridge *x = nm_bridges + i;
+
+ if (x->bdg_active_ports == 0) {
+ if (create && b == NULL)
+ b = x; /* record empty slot */
+ } else if (x->bdg_namelen != namelen) {
+ continue;
+ } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
+ ND("found '%.*s' at %d", namelen, name, i);
+ b = x;
+ break;
+ }
+ }
+ if (i == NM_BRIDGES && b) { /* name not found, can create entry */
+ /* initialize the bridge */
+ strncpy(b->bdg_basename, name, namelen);
+ ND("create new bridge %s with ports %d", b->bdg_basename,
+ b->bdg_active_ports);
+ b->bdg_namelen = namelen;
+ b->bdg_active_ports = 0;
+ for (i = 0; i < NM_BDG_MAXPORTS; i++)
+ b->bdg_port_index[i] = i;
+ /* set the default function */
+ b->nm_bdg_lookup = netmap_bdg_learning;
+ /* reset the MAC address table */
+ bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
+ }
+ return b;
+}
+
+
+/*
+ * Free the forwarding tables for rings attached to switch ports.
+ */
+static void
+nm_free_bdgfwd(struct netmap_adapter *na)
+{
+ int nrings, i;
+ struct netmap_kring *kring;
+
+ NMG_LOCK_ASSERT();
+ nrings = na->num_tx_rings;
+ kring = na->tx_rings;
+ for (i = 0; i < nrings; i++) {
+ if (kring[i].nkr_ft) {
+ free(kring[i].nkr_ft, M_DEVBUF);
+ kring[i].nkr_ft = NULL; /* protect from freeing twice */
+ }
+ }
+}
+
+
+/*
+ * Allocate the forwarding tables for the rings attached to the bridge ports.
+ */
+static int
+nm_alloc_bdgfwd(struct netmap_adapter *na)
+{
+ int nrings, l, i, num_dstq;
+ struct netmap_kring *kring;
+
+ NMG_LOCK_ASSERT();
+ /* all port:rings + broadcast */
+ num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
+ l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
+ l += sizeof(struct nm_bdg_q) * num_dstq;
+ l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
+
+ nrings = netmap_real_tx_rings(na);
+ kring = na->tx_rings;
+ for (i = 0; i < nrings; i++) {
+ struct nm_bdg_fwd *ft;
+ struct nm_bdg_q *dstq;
+ int j;
+
+ ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!ft) {
+ nm_free_bdgfwd(na);
+ return ENOMEM;
+ }
+ dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
+ for (j = 0; j < num_dstq; j++) {
+ dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
+ dstq[j].bq_len = 0;
+ }
+ kring[i].nkr_ft = ft;
+ }
+ return 0;
+}
+
+
+static void
+netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
+{
+ int s_hw = hw, s_sw = sw;
+	int i, lim = b->bdg_active_ports;
+ uint8_t tmp[NM_BDG_MAXPORTS];
+
+ /*
+ New algorithm:
+ make a copy of bdg_port_index;
+ lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
+ in the array of bdg_port_index, replacing them with
+ entries from the bottom of the array;
+ decrement bdg_active_ports;
+ acquire BDG_WLOCK() and copy back the array.
+ */
+
+ if (netmap_verbose)
+ D("detach %d and %d (lim %d)", hw, sw, lim);
+ /* make a copy of the list of active ports, update it,
+ * and then copy back within BDG_WLOCK().
+ */
+ memcpy(tmp, b->bdg_port_index, sizeof(tmp));
+ for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
+ if (hw >= 0 && tmp[i] == hw) {
+ ND("detach hw %d at %d", hw, i);
+ lim--; /* point to last active port */
+ tmp[i] = tmp[lim]; /* swap with i */
+ tmp[lim] = hw; /* now this is inactive */
+ hw = -1;
+ } else if (sw >= 0 && tmp[i] == sw) {
+ ND("detach sw %d at %d", sw, i);
+ lim--;
+ tmp[i] = tmp[lim];
+ tmp[lim] = sw;
+ sw = -1;
+ } else {
+ i++;
+ }
+ }
+ if (hw >= 0 || sw >= 0) {
+ D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
+ }
+
+ BDG_WLOCK(b);
+ b->bdg_ports[s_hw] = NULL;
+ if (s_sw >= 0) {
+ b->bdg_ports[s_sw] = NULL;
+ }
+ memcpy(b->bdg_port_index, tmp, sizeof(tmp));
+ b->bdg_active_ports = lim;
+ BDG_WUNLOCK(b);
+
+ ND("now %d active ports", lim);
+ if (lim == 0) {
+ ND("marking bridge %s as free", b->bdg_basename);
+ b->nm_bdg_lookup = NULL;
+ }
+}
+
+
+static void
+netmap_adapter_vp_dtor(struct netmap_adapter *na)
+{
+ struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
+ struct nm_bridge *b = vpna->na_bdg;
+ struct ifnet *ifp = na->ifp;
+
+ ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);
+
+ if (b) {
+ netmap_bdg_detach_common(b, vpna->bdg_port, -1);
+ }
+
+ bzero(ifp, sizeof(*ifp));
+ free(ifp, M_DEVBUF);
+ na->ifp = NULL;
+}
+
+
+/* Try to get a reference to a netmap adapter attached to a VALE switch.
+ * If the adapter is found (or is created), this function returns 0, a
+ * non NULL pointer is returned into *na, and the caller holds a
+ * reference to the adapter.
+ * If an adapter is not found, then no reference is grabbed and the
+ * function returns an error code, or 0 if there is just a VALE prefix
+ * mismatch. Therefore the caller holds a reference when
+ * (*na != NULL && return == 0).
+ */
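+/*
+ * Sketch of the caller pattern implied above (see nm_bdg_attach() and
+ * nm_bdg_detach() below for real users):
+ *
+ *	NMG_LOCK();
+ *	error = netmap_get_bdg_na(nmr, &na, create);
+ *	if (error == 0 && na != NULL) {
+ *		... use na ...
+ *		netmap_adapter_put(na);	// drop the reference when done
+ *	}
+ *	NMG_UNLOCK();
+ */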
+int
+netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+ const char *name = nmr->nr_name;
+ struct ifnet *ifp;
+ int error = 0;
+ struct netmap_adapter *ret;
+ struct netmap_vp_adapter *vpna;
+ struct nm_bridge *b;
+ int i, j, cand = -1, cand2 = -1;
+ int needed;
+
+ *na = NULL; /* default return value */
+
+ /* first try to see if this is a bridge port. */
+ NMG_LOCK_ASSERT();
+ if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
+ return 0; /* no error, but no VALE prefix */
+ }
+
+ b = nm_find_bridge(name, create);
+ if (b == NULL) {
+ D("no bridges available for '%s'", name);
+ return (create ? ENOMEM : ENXIO);
+ }
+
+	/* Now we are sure that the name starts with the bridge's name;
+	 * look up the port in the bridge. We need to scan the entire
+ * list. It is not important to hold a WLOCK on the bridge
+ * during the search because NMG_LOCK already guarantees
+ * that there are no other possible writers.
+ */
+
+ /* lookup in the local list of ports */
+ for (j = 0; j < b->bdg_active_ports; j++) {
+ i = b->bdg_port_index[j];
+ vpna = b->bdg_ports[i];
+ // KASSERT(na != NULL);
+ ifp = vpna->up.ifp;
+ /* XXX make sure the name only contains one : */
+ if (!strcmp(NM_IFPNAME(ifp), name)) {
+ netmap_adapter_get(&vpna->up);
+ ND("found existing if %s refs %d", name,
+ vpna->na_bdg_refcount);
+ *na = (struct netmap_adapter *)vpna;
+ return 0;
+ }
+ }
+ /* not found, should we create it? */
+ if (!create)
+ return ENXIO;
+ /* yes we should, see if we have space to attach entries */
+ needed = 2; /* in some cases we only need 1 */
+ if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
+ D("bridge full %d, cannot create new port", b->bdg_active_ports);
+ return ENOMEM;
+ }
+ /* record the next two ports available, but do not allocate yet */
+ cand = b->bdg_port_index[b->bdg_active_ports];
+ cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
+ ND("+++ bridge %s port %s used %d avail %d %d",
+ b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
+
+ /*
+ * try see if there is a matching NIC with this name
+ * (after the bridge's name)
+ */
+ ifp = ifunit_ref(name + b->bdg_namelen + 1);
+ if (!ifp) { /* this is a virtual port */
+ if (nmr->nr_cmd) {
+ /* nr_cmd must be 0 for a virtual port */
+ return EINVAL;
+ }
+
+ /* create a struct ifnet for the new port.
+ * need M_NOWAIT as we are under nma_lock
+ */
+ ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!ifp)
+ return ENOMEM;
+
+ strcpy(ifp->if_xname, name);
+ /* bdg_netmap_attach creates a struct netmap_adapter */
+ error = bdg_netmap_attach(nmr, ifp);
+ if (error) {
+ D("error %d", error);
+ free(ifp, M_DEVBUF);
+ return error;
+ }
+ ret = NA(ifp);
+ cand2 = -1; /* only need one port */
+ } else { /* this is a NIC */
+ struct ifnet *fake_ifp;
+
+ error = netmap_get_hw_na(ifp, &ret);
+ if (error || ret == NULL)
+ goto out;
+
+ /* make sure the NIC is not already in use */
+ if (NETMAP_OWNED_BY_ANY(ret)) {
+ D("NIC %s busy, cannot attach to bridge",
+ NM_IFPNAME(ifp));
+ error = EBUSY;
+ goto out;
+ }
+ /* create a fake interface */
+ fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!fake_ifp) {
+ error = ENOMEM;
+ goto out;
+ }
+ strcpy(fake_ifp->if_xname, name);
+ error = netmap_bwrap_attach(fake_ifp, ifp);
+ if (error) {
+ free(fake_ifp, M_DEVBUF);
+ goto out;
+ }
+ ret = NA(fake_ifp);
+ if (nmr->nr_arg1 != NETMAP_BDG_HOST)
+ cand2 = -1; /* only need one port */
+ if_rele(ifp);
+ }
+ vpna = (struct netmap_vp_adapter *)ret;
+
+ BDG_WLOCK(b);
+ vpna->bdg_port = cand;
+ ND("NIC %p to bridge port %d", vpna, cand);
+ /* bind the port to the bridge (virtual ports are not active) */
+ b->bdg_ports[cand] = vpna;
+ vpna->na_bdg = b;
+ b->bdg_active_ports++;
+ if (cand2 >= 0) {
+ struct netmap_vp_adapter *hostna = vpna + 1;
+ /* also bind the host stack to the bridge */
+ b->bdg_ports[cand2] = hostna;
+ hostna->bdg_port = cand2;
+ hostna->na_bdg = b;
+ b->bdg_active_ports++;
+ ND("host %p to bridge port %d", hostna, cand2);
+ }
+ ND("if %s refs %d", name, vpna->up.na_refcount);
+ BDG_WUNLOCK(b);
+ *na = ret;
+ netmap_adapter_get(ret);
+ return 0;
+
+out:
+ if_rele(ifp);
+
+ return error;
+}
+
+
+/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
+static int
+nm_bdg_attach(struct nmreq *nmr)
+{
+ struct netmap_adapter *na;
+ struct netmap_if *nifp;
+ struct netmap_priv_d *npriv;
+ struct netmap_bwrap_adapter *bna;
+ int error;
+
+ npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (npriv == NULL)
+ return ENOMEM;
+
+ NMG_LOCK();
+
+ error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
+ if (error) /* no device, or another bridge or user owns the device */
+ goto unlock_exit;
+
+ if (na == NULL) { /* VALE prefix missing */
+ error = EINVAL;
+ goto unlock_exit;
+ }
+
+ if (na->active_fds > 0) { /* already registered */
+ error = EBUSY;
+ goto unref_exit;
+ }
+
+ nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
+ if (!nifp) {
+ goto unref_exit;
+ }
+
+ bna = (struct netmap_bwrap_adapter*)na;
+ bna->na_kpriv = npriv;
+ NMG_UNLOCK();
+ ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
+ return 0;
+
+unref_exit:
+ netmap_adapter_put(na);
+unlock_exit:
+ NMG_UNLOCK();
+ bzero(npriv, sizeof(*npriv));
+ free(npriv, M_DEVBUF);
+ return error;
+}
+
+
+static int
+nm_bdg_detach(struct nmreq *nmr)
+{
+ struct netmap_adapter *na;
+ int error;
+ struct netmap_bwrap_adapter *bna;
+ int last_instance;
+
+ NMG_LOCK();
+ error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
+ if (error) { /* no device, or another bridge or user owns the device */
+ goto unlock_exit;
+ }
+
+ if (na == NULL) { /* VALE prefix missing */
+ error = EINVAL;
+ goto unlock_exit;
+ }
+
+ bna = (struct netmap_bwrap_adapter *)na;
+
+ if (na->active_fds == 0) { /* not registered */
+ error = EINVAL;
+ goto unref_exit;
+ }
+
+ last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
+ if (!last_instance) {
+ D("--- error, trying to detach an entry with active mmaps");
+ error = EINVAL;
+ } else {
+ struct netmap_priv_d *npriv = bna->na_kpriv;
+
+ bna->na_kpriv = NULL;
+ D("deleting priv");
+
+ bzero(npriv, sizeof(*npriv));
+ free(npriv, M_DEVBUF);
+ }
+
+unref_exit:
+ netmap_adapter_put(na);
+unlock_exit:
+ NMG_UNLOCK();
+ return error;
+
+}
+
+
+/* exported to kernel callers, e.g. OVS ?
+ * Entry point.
+ * Called without NMG_LOCK.
+ */
+int
+netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
+{
+ struct nm_bridge *b;
+ struct netmap_adapter *na;
+ struct netmap_vp_adapter *vpna;
+ struct ifnet *iter;
+ char *name = nmr->nr_name;
+ int cmd = nmr->nr_cmd, namelen = strlen(name);
+ int error = 0, i, j;
+
+ switch (cmd) {
+ case NETMAP_BDG_ATTACH:
+ error = nm_bdg_attach(nmr);
+ break;
+
+ case NETMAP_BDG_DETACH:
+ error = nm_bdg_detach(nmr);
+ break;
+
+ case NETMAP_BDG_LIST:
+ /* this is used to enumerate bridges and ports */
+ if (namelen) { /* look up indexes of bridge and port */
+ if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
+ error = EINVAL;
+ break;
+ }
+ NMG_LOCK();
+ b = nm_find_bridge(name, 0 /* don't create */);
+ if (!b) {
+ error = ENOENT;
+ NMG_UNLOCK();
+ break;
+ }
+
+ error = ENOENT;
+ for (j = 0; j < b->bdg_active_ports; j++) {
+ i = b->bdg_port_index[j];
+ vpna = b->bdg_ports[i];
+ if (vpna == NULL) {
+ D("---AAAAAAAAARGH-------");
+ continue;
+ }
+ iter = vpna->up.ifp;
+ /* the former and the latter identify a
+ * virtual port and a NIC, respectively
+ */
+ if (!strcmp(iter->if_xname, name)) {
+ /* bridge index */
+ nmr->nr_arg1 = b - nm_bridges;
+ nmr->nr_arg2 = i; /* port index */
+ error = 0;
+ break;
+ }
+ }
+ NMG_UNLOCK();
+ } else {
+ /* return the first non-empty entry starting from
+ * bridge nr_arg1 and port nr_arg2.
+ *
+ * Users can detect the end of the same bridge by
+ * seeing the new and old value of nr_arg1, and can
+			 * detect the end of all the bridges by error != 0
+ */
+ i = nmr->nr_arg1;
+ j = nmr->nr_arg2;
+
+ NMG_LOCK();
+ for (error = ENOENT; i < NM_BRIDGES; i++) {
+ b = nm_bridges + i;
+ if (j >= b->bdg_active_ports) {
+ j = 0; /* following bridges scan from 0 */
+ continue;
+ }
+ nmr->nr_arg1 = i;
+ nmr->nr_arg2 = j;
+ j = b->bdg_port_index[j];
+ vpna = b->bdg_ports[j];
+ iter = vpna->up.ifp;
+ strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
+ error = 0;
+ break;
+ }
+ NMG_UNLOCK();
+ }
+ break;
+
+ case NETMAP_BDG_LOOKUP_REG:
+ /* register a lookup function to the given bridge.
+ * nmr->nr_name may be just bridge's name (including ':'
+ * if it is not just NM_NAME).
+ */
+ if (!func) {
+ error = EINVAL;
+ break;
+ }
+ NMG_LOCK();
+ b = nm_find_bridge(name, 0 /* don't create */);
+ if (!b) {
+ error = EINVAL;
+ } else {
+ b->nm_bdg_lookup = func;
+ }
+ NMG_UNLOCK();
+ break;
+
+ case NETMAP_BDG_VNET_HDR:
+ /* Valid lengths for the virtio-net header are 0 (no header),
+ 10 and 12. */
+ if (nmr->nr_arg1 != 0 &&
+ nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
+ nmr->nr_arg1 != 12) {
+ error = EINVAL;
+ break;
+ }
+ NMG_LOCK();
+ error = netmap_get_bdg_na(nmr, &na, 0);
+ if (na && !error) {
+ vpna = (struct netmap_vp_adapter *)na;
+ vpna->virt_hdr_len = nmr->nr_arg1;
+ if (vpna->virt_hdr_len)
+ vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
+ D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
+ netmap_adapter_put(na);
+ }
+ NMG_UNLOCK();
+ break;
+
+ default:
+ D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
+ error = EINVAL;
+ break;
+ }
+ return error;
+}
+
+static int
+netmap_vp_krings_create(struct netmap_adapter *na)
+{
+ u_int tailroom;
+ int error, i;
+ uint32_t *leases;
+ u_int nrx = netmap_real_rx_rings(na);
+
+ /*
+ * Leases are attached to RX rings on vale ports
+ */
+ tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
+
+ error = netmap_krings_create(na, tailroom);
+ if (error)
+ return error;
+
+ leases = na->tailroom;
+
+ for (i = 0; i < nrx; i++) { /* Receive rings */
+ na->rx_rings[i].nkr_leases = leases;
+ leases += na->num_rx_desc;
+ }
+
+ error = nm_alloc_bdgfwd(na);
+ if (error) {
+ netmap_krings_delete(na);
+ return error;
+ }
+
+ return 0;
+}
+
+
+static void
+netmap_vp_krings_delete(struct netmap_adapter *na)
+{
+ nm_free_bdgfwd(na);
+ netmap_krings_delete(na);
+}
+
+
+static int
+nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
+ struct netmap_vp_adapter *na, u_int ring_nr);
+
+
+/*
+ * Grab packets from a kring, move them into the ft structure
+ * associated to the tx (input) port. Max one instance per port,
+ * filtered on input (ioctl, poll or XXX).
+ * Returns the next position in the ring.
+ */
+static int
+nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
+ struct netmap_kring *kring, u_int end)
+{
+ struct netmap_ring *ring = kring->ring;
+ struct nm_bdg_fwd *ft;
+ u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
+ u_int ft_i = 0; /* start from 0 */
+ u_int frags = 1; /* how many frags ? */
+ struct nm_bridge *b = na->na_bdg;
+
+ /* To protect against modifications to the bridge we acquire a
+ * shared lock, waiting if we can sleep (if the source port is
+ * attached to a user process) or with a trylock otherwise (NICs).
+ */
+ ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
+ if (na->up.na_flags & NAF_BDG_MAYSLEEP)
+ BDG_RLOCK(b);
+ else if (!BDG_RTRYLOCK(b))
+ return 0;
+ ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
+ ft = kring->nkr_ft;
+
+ for (; likely(j != end); j = nm_next(j, lim)) {
+ struct netmap_slot *slot = &ring->slot[j];
+ char *buf;
+
+ ft[ft_i].ft_len = slot->len;
+ ft[ft_i].ft_flags = slot->flags;
+
+ ND("flags is 0x%x", slot->flags);
+ /* this slot goes into a list so initialize the link field */
+ ft[ft_i].ft_next = NM_FT_NULL;
+ buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
+ (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
+ __builtin_prefetch(buf);
+ ++ft_i;
+ if (slot->flags & NS_MOREFRAG) {
+ frags++;
+ continue;
+ }
+ if (unlikely(netmap_verbose && frags > 1))
+ RD(5, "%d frags at %d", frags, ft_i - frags);
+ ft[ft_i - frags].ft_frags = frags;
+ frags = 1;
+ if (unlikely((int)ft_i >= bridge_batch))
+ ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+ }
+ if (frags > 1) {
+ D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
+ // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
+ ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
+ ft[ft_i - frags].ft_frags = frags - 1;
+ }
+ if (ft_i)
+ ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+ BDG_RUNLOCK(b);
+ return j;
+}
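+
+/*
+ * Editor's note: an illustrative view (made-up lengths) of how fragments
+ * are recorded in the ft[] array by the loop above. A 3-fragment packet
+ * followed by a single-slot packet ends up as:
+ *
+ *   ft[0] len=2048 ft_frags=3   (first fragment, NS_MOREFRAG was set)
+ *   ft[1] len=2048              (middle fragment)
+ *   ft[2] len= 500              (last fragment, NS_MOREFRAG clear)
+ *   ft[3] len=1514 ft_frags=1   (standalone packet)
+ *
+ * Only the first slot of each packet carries a meaningful fragment count,
+ * which is what nm_bdg_flush() uses to advance through the batch.
+ */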
+
+
+/* ----- FreeBSD if_bridge hash function ------- */
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ *
+ * http://www.burtleburtle.net/bob/hash/spooky.html
+ */
+#define mix(a, b, c) \
+do { \
+ a -= b; a -= c; a ^= (c >> 13); \
+ b -= c; b -= a; b ^= (a << 8); \
+ c -= a; c -= b; c ^= (b >> 13); \
+ a -= b; a -= c; a ^= (c >> 12); \
+ b -= c; b -= a; b ^= (a << 16); \
+ c -= a; c -= b; c ^= (b >> 5); \
+ a -= b; a -= c; a ^= (c >> 3); \
+ b -= c; b -= a; b ^= (a << 10); \
+ c -= a; c -= b; c ^= (b >> 15); \
+} while (/*CONSTCOND*/0)
+
+
+static __inline uint32_t
+nm_bridge_rthash(const uint8_t *addr)
+{
+ uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
+
+ b += addr[5] << 8;
+ b += addr[4];
+ a += addr[3] << 24;
+ a += addr[2] << 16;
+ a += addr[1] << 8;
+ a += addr[0];
+
+ mix(a, b, c);
+#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
+ return (c & BRIDGE_RTHASH_MASK);
+}
+
+#undef mix
+
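+/*
+ * Editor's note: an illustrative sketch (not part of the driver) of how
+ * the hash above selects a forwarding-table bucket. The MAC address used
+ * here is hypothetical; NM_BDG_HASH is assumed to be a power of two so
+ * that the mask in nm_bridge_rthash() works.
+ *
+ *	uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0x3c, 0x4d, 0x5e };
+ *	uint32_t h = nm_bridge_rthash(mac);	// 0 <= h < NM_BDG_HASH
+ *	struct nm_hash_ent *e = &na->na_bdg->ht[h];
+ *	// e->mac / e->ports is the single entry that may be updated or
+ *	// matched for this address; collisions simply overwrite it.
+ */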
+
+static int
+bdg_netmap_reg(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_vp_adapter *vpna =
+ (struct netmap_vp_adapter*)na;
+ struct ifnet *ifp = na->ifp;
+
+ /* the interface is already attached to the bridge,
+ * so we only need to toggle IFCAP_NETMAP.
+ */
+ BDG_WLOCK(vpna->na_bdg);
+ if (onoff) {
+ ifp->if_capenable |= IFCAP_NETMAP;
+ } else {
+ ifp->if_capenable &= ~IFCAP_NETMAP;
+ }
+ BDG_WUNLOCK(vpna->na_bdg);
+ return 0;
+}
+
+
+/*
+ * Lookup function for a learning bridge.
+ * Updates the hash table with the source address, then returns
+ * the destination port index and the ring in *dst_ring
+ * (at the moment, always ring 0).
+ */
+u_int
+netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
+ struct netmap_vp_adapter *na)
+{
+ struct nm_hash_ent *ht = na->na_bdg->ht;
+ uint32_t sh, dh;
+ u_int dst, mysrc = na->bdg_port;
+ uint64_t smac, dmac;
+
+ if (buf_len < 14) {
+ D("invalid buf length %d", buf_len);
+ return NM_BDG_NOPORT;
+ }
+ dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
+ smac = le64toh(*(uint64_t *)(buf + 4));
+ smac >>= 16;
+
+ /*
+ * The hash is somewhat expensive, there might be some
+ * worthwhile optimizations here.
+ */
+ if ((buf[6] & 1) == 0) { /* valid src */
+ uint8_t *s = buf+6;
+ sh = nm_bridge_rthash(s); // XXX hash of source
+ /* update source port forwarding entry */
+ ht[sh].mac = smac; /* XXX expire ? */
+ ht[sh].ports = mysrc;
+ if (netmap_verbose)
+ D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
+ s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
+ }
+ dst = NM_BDG_BROADCAST;
+ if ((buf[0] & 1) == 0) { /* unicast */
+ dh = nm_bridge_rthash(buf); // XXX hash of dst
+ if (ht[dh].mac == dmac) { /* found dst */
+ dst = ht[dh].ports;
+ }
+ /* XXX otherwise return NM_BDG_UNKNOWN ? */
+ }
+ *dst_ring = 0;
+ return dst;
+}
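+
+/*
+ * Editor's note: a minimal usage sketch for the learning logic above,
+ * with made-up addresses. Suppose port 2 sends a frame with source MAC S
+ * and destination MAC D:
+ *   - ht[hash(S)] is overwritten with { mac = S, ports = 2 }, so later
+ *     frames addressed to S are forwarded to port 2 only;
+ *   - if ht[hash(D)].mac == D the frame goes to that port, otherwise
+ *     (or if D is multicast/broadcast) NM_BDG_BROADCAST is returned and
+ *     the frame is flooded to all active ports except the source.
+ */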
+
+
+/*
+ * Available space in the ring. Only used in VALE code
+ * and only with is_rx = 1
+ */
+static inline uint32_t
+nm_kr_space(struct netmap_kring *k, int is_rx)
+{
+ int space;
+
+ if (is_rx) {
+ int busy = k->nkr_hwlease - k->nr_hwcur;
+ if (busy < 0)
+ busy += k->nkr_num_slots;
+ space = k->nkr_num_slots - 1 - busy;
+ } else {
+ /* XXX never used in this branch */
+ space = k->nr_hwtail - k->nkr_hwlease;
+ if (space < 0)
+ space += k->nkr_num_slots;
+ }
+#if 0
+ // sanity check
+ if (k->nkr_hwlease >= k->nkr_num_slots ||
+ k->nr_hwcur >= k->nkr_num_slots ||
+ k->nr_tail >= k->nkr_num_slots ||
+ busy < 0 ||
+ busy >= k->nkr_num_slots) {
+ D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
+ k->nkr_lease_idx, k->nkr_num_slots);
+ }
+#endif
+ return space;
+}
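+
+/*
+ * Editor's note: a worked example (hypothetical numbers) for the rx case
+ * above. With nkr_num_slots = 1024, nr_hwcur = 1000 and nkr_hwlease = 10
+ * (the lease pointer has wrapped), busy = 10 - 1000 = -990, corrected to
+ * -990 + 1024 = 34 slots already leased, so the space still available is
+ * 1024 - 1 - 34 = 989 slots (one slot is always kept empty).
+ */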
+
+
+
+
+/* make a lease on the kring for N positions. return the
+ * lease index
+ * XXX only used in VALE code and with is_rx = 1
+ */
+static inline uint32_t
+nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
+{
+ uint32_t lim = k->nkr_num_slots - 1;
+ uint32_t lease_idx = k->nkr_lease_idx;
+
+ k->nkr_leases[lease_idx] = NR_NOSLOT;
+ k->nkr_lease_idx = nm_next(lease_idx, lim);
+
+ if (n > nm_kr_space(k, is_rx)) {
+ D("invalid request for %d slots", n);
+ panic("x");
+ }
+ /* XXX verify that there are n slots */
+ k->nkr_hwlease += n;
+ if (k->nkr_hwlease > lim)
+ k->nkr_hwlease -= lim + 1;
+
+ if (k->nkr_hwlease >= k->nkr_num_slots ||
+ k->nr_hwcur >= k->nkr_num_slots ||
+ k->nr_hwtail >= k->nkr_num_slots ||
+ k->nkr_lease_idx >= k->nkr_num_slots) {
+ D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
+ k->na->ifp->if_xname,
+ k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
+ k->nkr_lease_idx, k->nkr_num_slots);
+ }
+ return lease_idx;
+}
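+
+/*
+ * Editor's note: an illustrative timeline (hypothetical values) of the
+ * lease mechanism used by nm_kr_lease() and nm_bdg_flush(). Two senders
+ * A and B target the same rx kring, whose nkr_hwlease and nr_hwtail are
+ * both at 100:
+ *   - A leases 10 slots; nkr_hwlease becomes 110;
+ *   - B leases 5 slots; nkr_hwlease becomes 115;
+ *   - B finishes first and stores its final position in
+ *     nkr_leases[lease_idx_B]; nr_hwtail cannot move yet;
+ *   - A finishes, publishes its position, and since its range starts at
+ *     nr_hwtail it scans the completed leases and advances nr_hwtail to
+ *     115, waking up the receiver.
+ */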
+
+/*
+ * This flush routine supports only unicast and broadcast but a large
+ * number of ports, and lets us replace the learn and dispatch functions.
+ */
+int
+nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
+ u_int ring_nr)
+{
+ struct nm_bdg_q *dst_ents, *brddst;
+ uint16_t num_dsts = 0, *dsts;
+ struct nm_bridge *b = na->na_bdg;
+ u_int i, j, me = na->bdg_port;
+
+ /*
+ * The work area (pointed to by ft) is followed by an array of
+ * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
+ * queues per port plus one for the broadcast traffic.
+ * Then we have an array of destination indexes.
+ */
+ dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
+ dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
+
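+ /* Editor's note: an illustrative view of the per-ring work area laid
+ * out above (sizes are symbolic):
+ *
+ *   ft[]       up to NM_BDG_BATCH_MAX packet descriptors
+ *   dst_ents[] one queue per (port, ring) pair, plus the broadcast queue
+ *   dsts[]     indexes of the queues that actually received packets
+ *
+ * dsts[] is filled in the first pass below and scanned in the second.
+ */
+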
+ /* first pass: find a destination for each packet in the batch */
+ for (i = 0; likely(i < n); i += ft[i].ft_frags) {
+ uint8_t dst_ring = ring_nr; /* default, same ring as origin */
+ uint16_t dst_port, d_i;
+ struct nm_bdg_q *d;
+ uint8_t *buf = ft[i].ft_buf;
+ u_int len = ft[i].ft_len;
+
+ ND("slot %d frags %d", i, ft[i].ft_frags);
+ /* Drop the packet if the virtio-net header is not entirely
+ contained in the first fragment (the payload may start either in
+ the first fragment or at the very beginning of the second). */
+ if (unlikely(na->virt_hdr_len > len))
+ continue;
+ if (len == na->virt_hdr_len) {
+ buf = ft[i+1].ft_buf;
+ len = ft[i+1].ft_len;
+ } else {
+ buf += na->virt_hdr_len;
+ len -= na->virt_hdr_len;
+ }
+ dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
+ if (netmap_verbose > 255)
+ RD(5, "slot %d port %d -> %d", i, me, dst_port);
+ if (dst_port == NM_BDG_NOPORT)
+ continue; /* the lookup function wants this packet dropped */
+ else if (unlikely(dst_port > NM_BDG_MAXPORTS))
+ continue;
+ else if (dst_port == NM_BDG_BROADCAST)
+ dst_ring = 0; /* broadcasts always go to ring 0 */
+ else if (unlikely(dst_port == me ||
+ !b->bdg_ports[dst_port]))
+ continue;
+
+ /* get a position in the scratch pad */
+ d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
+ d = dst_ents + d_i;
+
+ /* append the first fragment to the list */
+ if (d->bq_head == NM_FT_NULL) { /* new destination */
+ d->bq_head = d->bq_tail = i;
+ /* remember this position to be scanned later */
+ if (dst_port != NM_BDG_BROADCAST)
+ dsts[num_dsts++] = d_i;
+ } else {
+ ft[d->bq_tail].ft_next = i;
+ d->bq_tail = i;
+ }
+ d->bq_len += ft[i].ft_frags;
+ }
+
+ /*
+ * Broadcast traffic goes to ring 0 on all destinations.
+ * So we need to add these rings to the list of ports to scan.
+ * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
+ * expensive. We should keep a compact list of active destinations
+ * so we could shorten this loop.
+ */
+ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
+ if (brddst->bq_head != NM_FT_NULL) {
+ for (j = 0; likely(j < b->bdg_active_ports); j++) {
+ uint16_t d_i;
+ i = b->bdg_port_index[j];
+ if (unlikely(i == me))
+ continue;
+ d_i = i * NM_BDG_MAXRINGS;
+ if (dst_ents[d_i].bq_head == NM_FT_NULL)
+ dsts[num_dsts++] = d_i;
+ }
+ }
+
+ ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
+ /* second pass: scan destinations (XXX will be modular somehow) */
+ for (i = 0; i < num_dsts; i++) {
+ struct ifnet *dst_ifp;
+ struct netmap_vp_adapter *dst_na;
+ struct netmap_kring *kring;
+ struct netmap_ring *ring;
+ u_int dst_nr, lim, j, d_i, next, brd_next;
+ u_int needed, howmany;
+ int retry = netmap_txsync_retry;
+ struct nm_bdg_q *d;
+ uint32_t my_start = 0, lease_idx = 0;
+ int nrings;
+ int virt_hdr_mismatch = 0;
+
+ d_i = dsts[i];
+ ND("second pass %d port %d", i, d_i);
+ d = dst_ents + d_i;
+ // XXX fix the division
+ dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
+ /* protect from the lookup function returning an inactive
+ * destination port
+ */
+ if (unlikely(dst_na == NULL))
+ goto cleanup;
+ if (dst_na->up.na_flags & NAF_SW_ONLY)
+ goto cleanup;
+ dst_ifp = dst_na->up.ifp;
+ /*
+ * The interface may be in !netmap mode in two cases:
+ * - when na is attached but not activated yet;
+ * - when na is being deactivated but is still attached.
+ */
+ if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
+ ND("not in netmap mode!");
+ goto cleanup;
+ }
+
+ /* there is at least one either unicast or broadcast packet */
+ brd_next = brddst->bq_head;
+ next = d->bq_head;
+ /* we need to reserve this many slots. If fewer are
+ * available, some packets will be dropped.
+ * Packets may have multiple fragments, so there is a chance
+ * that we may not use all of the slots we have claimed, and
+ * we will need to handle the leftover ones when we regain
+ * the lock.
+ */
+ needed = d->bq_len + brddst->bq_len;
+
+ if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
+ /* There is a virtio-net header/offloadings mismatch between
+ * source and destination. The slower mismatch datapath will
+ * be used to cope with all the mismatches.
+ */
+ virt_hdr_mismatch = 1;
+ if (dst_na->mfs < na->mfs) {
+ /* We may need to do segmentation offloadings, and so
+ * we may need a number of destination slots greater
+ * than the number of input slots ('needed').
+ * We look for the smallest integer 'x' which satisfies:
+ * needed * na->mfs + x * H <= x * dst_na->mfs
+ * where 'H' is the length of the longest header that may
+ * be replicated in the segmentation process (e.g. for
+ * TCPv4 we must account for ethernet header, IP header
+ * and TCPv4 header).
+ */
+ needed = (needed * na->mfs) /
+ (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
+ ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
+ }
+ }
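+ /* Editor's note: a worked example with assumed numbers. If the
+ * source port uses 4096-byte slots (na->mfs = 4096), the
+ * destination uses 1514-byte slots and the worst-case header H
+ * is 100 bytes, then for needed = 2 input slots:
+ *   x >= 2 * 4096 / (1514 - 100) = 5.79...,
+ * and the integer division plus one gives needed = 6 destination
+ * slots reserved before copying.
+ */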
+
+ ND(5, "pass 2 dst %d is %x %s",
+ i, d_i, is_vp ? "virtual" : "nic/host");
+ dst_nr = d_i & (NM_BDG_MAXRINGS-1);
+ nrings = dst_na->up.num_rx_rings;
+ if (dst_nr >= nrings)
+ dst_nr = dst_nr % nrings;
+ kring = &dst_na->up.rx_rings[dst_nr];
+ ring = kring->ring;
+ lim = kring->nkr_num_slots - 1;
+
+retry:
+
+ if (dst_na->retry && retry) {
+ /* try to get some free slot from the previous run */
+ dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
+ }
+ /* reserve the buffers in the queue and an entry
+ * to report completion, and drop lock.
+ * XXX this might become a helper function.
+ */
+ mtx_lock(&kring->q_lock);
+ if (kring->nkr_stopped) {
+ mtx_unlock(&kring->q_lock);
+ goto cleanup;
+ }
+ my_start = j = kring->nkr_hwlease;
+ howmany = nm_kr_space(kring, 1);
+ if (needed < howmany)
+ howmany = needed;
+ lease_idx = nm_kr_lease(kring, howmany, 1);
+ mtx_unlock(&kring->q_lock);
+
+ /* only retry if we need more than available slots */
+ if (retry && needed <= howmany)
+ retry = 0;
+
+ /* copy to the destination queue */
+ while (howmany > 0) {
+ struct netmap_slot *slot;
+ struct nm_bdg_fwd *ft_p, *ft_end;
+ u_int cnt;
+
+ /* find the queue from which we pick next packet.
+ * NM_FT_NULL is always higher than valid indexes
+ * so we never dereference it if the other list
+ * has packets (and if both are empty we never
+ * get here).
+ */
+ if (next < brd_next) {
+ ft_p = ft + next;
+ next = ft_p->ft_next;
+ } else { /* insert broadcast */
+ ft_p = ft + brd_next;
+ brd_next = ft_p->ft_next;
+ }
+ cnt = ft_p->ft_frags; // cnt > 0
+ if (unlikely(cnt > howmany))
+ break; /* no more space */
+ if (netmap_verbose && cnt > 1)
+ RD(5, "rx %d frags to %d", cnt, j);
+ ft_end = ft_p + cnt;
+ if (unlikely(virt_hdr_mismatch)) {
+ bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
+ } else {
+ howmany -= cnt;
+ do {
+ char *dst, *src = ft_p->ft_buf;
+ size_t copy_len = ft_p->ft_len, dst_len = copy_len;
+
+ slot = &ring->slot[j];
+ dst = BDG_NMB(&dst_na->up, slot);
+
+ ND("send [%d] %d(%d) bytes at %s:%d",
+ i, (int)copy_len, (int)dst_len,
+ NM_IFPNAME(dst_ifp), j);
+ /* round to a multiple of 64 */
+ copy_len = (copy_len + 63) & ~63;
+
+ if (ft_p->ft_flags & NS_INDIRECT) {
+ if (copyin(src, dst, copy_len)) {
+ // invalid user pointer, pretend len is 0
+ dst_len = 0;
+ }
+ } else {
+ //memcpy(dst, src, copy_len);
+ pkt_copy(src, dst, (int)copy_len);
+ }
+ slot->len = dst_len;
+ slot->flags = (cnt << 8)| NS_MOREFRAG;
+ j = nm_next(j, lim);
+ needed--;
+ ft_p++;
+ } while (ft_p != ft_end);
+ slot->flags = (cnt << 8); /* clear flag on last entry */
+ }
+ /* are we done ? */
+ if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
+ break;
+ }
+ {
+ /* current position */
+ uint32_t *p = kring->nkr_leases; /* shorthand */
+ uint32_t update_pos;
+ int still_locked = 1;
+
+ mtx_lock(&kring->q_lock);
+ if (unlikely(howmany > 0)) {
+ /* we did not use all the buffers. If I am the last one
+ * I can recover the slots, otherwise I must
+ * fill them with 0 to mark empty packets.
+ */
+ ND("leftover %d bufs", howmany);
+ if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
+ /* yes i am the last one */
+ ND("roll back nkr_hwlease to %d", j);
+ kring->nkr_hwlease = j;
+ } else {
+ while (howmany-- > 0) {
+ ring->slot[j].len = 0;
+ ring->slot[j].flags = 0;
+ j = nm_next(j, lim);
+ }
+ }
+ }
+ p[lease_idx] = j; /* report I am done */
+
+ update_pos = kring->nr_hwtail;
+
+ if (my_start == update_pos) {
+ /* all slots before my_start have been reported,
+ * so scan subsequent leases to see if other ranges
+ * have been completed, and do a selwakeup or txsync.
+ */
+ while (lease_idx != kring->nkr_lease_idx &&
+ p[lease_idx] != NR_NOSLOT) {
+ j = p[lease_idx];
+ p[lease_idx] = NR_NOSLOT;
+ lease_idx = nm_next(lease_idx, lim);
+ }
+ /* j is the new 'write' position. j != my_start
+ * means there are new buffers to report
+ */
+ if (likely(j != my_start)) {
+ kring->nr_hwtail = j;
+ still_locked = 0;
+ mtx_unlock(&kring->q_lock);
+ dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
+ if (dst_na->retry && retry--)
+ goto retry;
+ }
+ }
+ if (still_locked)
+ mtx_unlock(&kring->q_lock);
+ }
+cleanup:
+ d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
+ d->bq_len = 0;
+ }
+ brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
+ brddst->bq_len = 0;
+ return 0;
+}
+
+
+static int
+netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
+ u_int done;
+ u_int const lim = kring->nkr_num_slots - 1;
+ u_int const cur = kring->rcur;
+
+ if (bridge_batch <= 0) { /* testing only */
+ done = cur; // used all
+ goto done;
+ }
+ if (bridge_batch > NM_BDG_BATCH)
+ bridge_batch = NM_BDG_BATCH;
+
+ done = nm_bdg_preflush(na, ring_nr, kring, cur);
+done:
+ if (done != cur)
+ D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
+ /*
+ * packets between 'done' and 'cur' are left unsent.
+ */
+ kring->nr_hwcur = done;
+ kring->nr_hwtail = nm_prev(done, lim);
+ nm_txsync_finalize(kring);
+ if (netmap_verbose)
+ D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
+ return 0;
+}
+
+
+/*
+ * main dispatch routine for the bridge.
+ * We already know that only one thread is running this.
+ * We must run nm_bdg_preflush without holding the lock.
+ */
+static int
+bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
+ return netmap_vp_txsync(vpna, ring_nr, flags);
+}
+
+static int
+netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ u_int nm_i, lim = kring->nkr_num_slots - 1;
+ u_int head = nm_rxsync_prologue(kring);
+ int n;
+
+ if (head > lim) {
+ D("ouch dangerous reset!!!");
+ n = netmap_ring_reinit(kring);
+ goto done;
+ }
+
+ /* First part, import newly received packets. */
+ /* actually nothing to do here, they are already in the kring */
+
+ /* Second part, skip past packets that userspace has released. */
+ nm_i = kring->nr_hwcur;
+ if (nm_i != head) {
+ /* consistency check, but nothing really important here */
+ for (n = 0; likely(nm_i != head); n++) {
+ struct netmap_slot *slot = &ring->slot[nm_i];
+ void *addr = BDG_NMB(na, slot);
+
+ if (addr == netmap_buffer_base) { /* bad buf */
+ D("bad buffer index %d, ignore ?",
+ slot->buf_idx);
+ }
+ slot->flags &= ~NS_BUF_CHANGED;
+ nm_i = nm_next(nm_i, lim);
+ }
+ kring->nr_hwcur = head;
+ }
+
+ /* tell userspace that there are new packets */
+ nm_rxsync_finalize(kring);
+ n = 0;
+done:
+ return n;
+}
+
+/*
+ * user process reading from a VALE switch.
+ * Already protected against concurrent calls from userspace,
+ * but we must acquire the queue's lock to protect against
+ * writers on the same queue.
+ */
+static int
+bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+{
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ int n;
+
+ mtx_lock(&kring->q_lock);
+ n = netmap_vp_rxsync(na, ring_nr, flags);
+ mtx_unlock(&kring->q_lock);
+ return n;
+}
+
+
+static int
+bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
+{
+ struct netmap_vp_adapter *vpna;
+ struct netmap_adapter *na;
+ int error;
+ u_int npipes = 0;
+
+ vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (vpna == NULL)
+ return ENOMEM;
+
+ na = &vpna->up;
+
+ na->ifp = ifp;
+
+ /* bound checking */
+ na->num_tx_rings = nmr->nr_tx_rings;
+ nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
+ nmr->nr_tx_rings = na->num_tx_rings; // write back
+ na->num_rx_rings = nmr->nr_rx_rings;
+ nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
+ nmr->nr_rx_rings = na->num_rx_rings; // write back
+ nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
+ 1, NM_BDG_MAXSLOTS, NULL);
+ na->num_tx_desc = nmr->nr_tx_slots;
+ nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
+ 1, NM_BDG_MAXSLOTS, NULL);
+ /* validate number of pipes. We want at least 1,
+ * but probably can do with some more.
+ * So let's use 2 as default (when 0 is supplied)
+ */
+ npipes = nmr->nr_arg1;
+ nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
+ nmr->nr_arg1 = npipes; /* write back */
+ /* validate extra bufs */
+ nm_bound_var(&nmr->nr_arg3, 0, 0,
+ 128*NM_BDG_MAXSLOTS, NULL);
+ na->num_rx_desc = nmr->nr_rx_slots;
+ vpna->virt_hdr_len = 0;
+ vpna->mfs = 1514;
+ /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
+ vpna->mfs = netmap_buf_size; */
+ if (netmap_verbose)
+ D("max frame size %u", vpna->mfs);
+
+ na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
+ na->nm_txsync = bdg_netmap_txsync;
+ na->nm_rxsync = bdg_netmap_rxsync;
+ na->nm_register = bdg_netmap_reg;
+ na->nm_dtor = netmap_adapter_vp_dtor;
+ na->nm_krings_create = netmap_vp_krings_create;
+ na->nm_krings_delete = netmap_vp_krings_delete;
+ na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
+ na->num_tx_rings, na->num_tx_desc,
+ na->num_rx_rings, na->num_rx_desc,
+ nmr->nr_arg3, npipes, &error);
+ if (na->nm_mem == NULL)
+ goto err;
+ /* other nmd fields are set in the common routine */
+ error = netmap_attach_common(na);
+ if (error)
+ goto err;
+ return 0;
+
+err:
+ if (na->nm_mem != NULL)
+ netmap_mem_private_delete(na->nm_mem);
+ free(vpna, M_DEVBUF);
+ return error;
+}
+
+
+static void
+netmap_bwrap_dtor(struct netmap_adapter *na)
+{
+ struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
+ struct netmap_adapter *hwna = bna->hwna;
+ struct nm_bridge *b = bna->up.na_bdg,
+ *bh = bna->host.na_bdg;
+ struct ifnet *ifp = na->ifp;
+
+ ND("na %p", na);
+
+ if (b) {
+ netmap_bdg_detach_common(b, bna->up.bdg_port,
+ (bh ? bna->host.bdg_port : -1));
+ }
+
+ hwna->na_private = NULL;
+ netmap_adapter_put(hwna);
+
+ bzero(ifp, sizeof(*ifp));
+ free(ifp, M_DEVBUF);
+ na->ifp = NULL;
+
+}
+
+
+/*
+ * Intr callback for NICs connected to a bridge.
+ * Simply ignore tx interrupts (maybe we could try to recover space ?)
+ * and pass received packets from nic to the bridge.
+ *
+ * XXX TODO check locking: this is called from the interrupt
+ * handler so we should make sure that the interface is not
+ * disconnected while passing down an interrupt.
+ *
+ * Note, no user process can access this NIC or the host stack.
+ * The only parts of the ring that are significant are the slots,
+ * and head/cur/tail are set from the kring as needed
+ * (part as a receive ring, part as a transmit ring).
+ *
+ * This is the callback that overwrites the hwna notify callback.
+ * Packets come from the outside or from the host stack and are put on an hwna rx ring.
+ * The bridge wrapper then sends the packets through the bridge.
+ */
+static int
+netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
+{
+ struct ifnet *ifp = na->ifp;
+ struct netmap_bwrap_adapter *bna = na->na_private;
+ struct netmap_vp_adapter *hostna = &bna->host;
+ struct netmap_kring *kring, *bkring;
+ struct netmap_ring *ring;
+ int is_host_ring = ring_nr == na->num_rx_rings;
+ struct netmap_vp_adapter *vpna = &bna->up;
+ int error = 0;
+
+ if (netmap_verbose)
+ D("%s %s%d 0x%x", NM_IFPNAME(ifp),
+ (tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
+
+ if (flags & NAF_DISABLE_NOTIFY) {
+ kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
+ bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
+ if (kring[ring_nr].nkr_stopped)
+ netmap_disable_ring(&bkring[ring_nr]);
+ else
+ bkring[ring_nr].nkr_stopped = 0;
+ return 0;
+ }
+
+ if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
+ return 0;
+
+ /* we only care about receive interrupts */
+ if (tx == NR_TX)
+ return 0;
+
+ kring = &na->rx_rings[ring_nr];
+ ring = kring->ring;
+
+ /* make sure the ring is not disabled */
+ if (nm_kr_tryget(kring))
+ return 0;
+
+ if (is_host_ring && hostna->na_bdg == NULL) {
+ error = bna->save_notify(na, ring_nr, tx, flags);
+ goto put_out;
+ }
+
+ /* Here we expect ring->head = ring->cur = ring->tail
+ * because everything has been released from the previous round.
+ * However the ring is shared and we might have info from
+ * the wrong side (the tx ring). Hence we overwrite with
+ * the info from the rx kring.
+ */
+ if (netmap_verbose)
+ D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
+ ring->head, ring->cur, ring->tail,
+ kring->rhead, kring->rcur, kring->rtail);
+
+ ring->head = kring->rhead;
+ ring->cur = kring->rcur;
+ ring->tail = kring->rtail;
+
+ if (is_host_ring) {
+ vpna = hostna;
+ ring_nr = 0;
+ }
+ /* simulate a user wakeup on the rx ring */
+ /* fetch packets that have arrived.
+ * XXX maybe do this in a loop ?
+ */
+ error = kring->nm_sync(kring, 0);
+ if (error)
+ goto put_out;
+ if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
+ D("how strange, interrupt with no packets on %s",
+ NM_IFPNAME(ifp));
+ goto put_out;
+ }
+
+ /* new packets are ring->cur to ring->tail, and the bkring
+ * had hwcur == ring->cur. So advance ring->cur to ring->tail
+ * to push all packets out.
+ */
+ ring->head = ring->cur = ring->tail;
+
+ /* also set tail to what the bwrap expects */
+ bkring = &vpna->up.tx_rings[ring_nr];
+ ring->tail = bkring->nr_hwtail; // rtail too ?
+
+ /* pass packets to the switch */
+ nm_txsync_prologue(bkring); // XXX error checking ?
+ netmap_vp_txsync(vpna, ring_nr, flags);
+
+ /* mark all buffers as released on this ring */
+ ring->head = ring->cur = kring->nr_hwtail;
+ ring->tail = kring->rtail;
+ /* another call to actually release the buffers */
+ if (!is_host_ring) {
+ error = kring->nm_sync(kring, 0);
+ } else {
+ /* mark all packets as released, as in the
+ * second part of netmap_rxsync_from_host()
+ */
+ kring->nr_hwcur = kring->nr_hwtail;
+ nm_rxsync_finalize(kring);
+ }
+
+put_out:
+ nm_kr_put(kring);
+ return error;
+}
+
+
+static int
+netmap_bwrap_register(struct netmap_adapter *na, int onoff)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+ struct netmap_vp_adapter *hostna = &bna->host;
+ int error;
+
+ ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
+
+ if (onoff) {
+ int i;
+
+ hwna->na_lut = na->na_lut;
+ hwna->na_lut_objtotal = na->na_lut_objtotal;
+
+ if (hostna->na_bdg) {
+ hostna->up.na_lut = na->na_lut;
+ hostna->up.na_lut_objtotal = na->na_lut_objtotal;
+ }
+
+ /* cross-link the netmap rings
+ * The original number of rings comes from hwna,
+ * rx rings on one side equals tx rings on the other.
+ */
+ for (i = 0; i < na->num_rx_rings + 1; i++) {
+ hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
+ hwna->tx_rings[i].ring = na->rx_rings[i].ring;
+ }
+ for (i = 0; i < na->num_tx_rings + 1; i++) {
+ hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
+ hwna->rx_rings[i].ring = na->tx_rings[i].ring;
+ }
+ }
+
+ if (hwna->ifp) {
+ error = hwna->nm_register(hwna, onoff);
+ if (error)
+ return error;
+ }
+
+ bdg_netmap_reg(na, onoff);
+
+ if (onoff) {
+ bna->save_notify = hwna->nm_notify;
+ hwna->nm_notify = netmap_bwrap_intr_notify;
+ } else {
+ hwna->nm_notify = bna->save_notify;
+ hwna->na_lut = NULL;
+ hwna->na_lut_objtotal = 0;
+ }
+
+ return 0;
+}
+
+
+static int
+netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
+ u_int *rxr, u_int *rxd)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+
+ /* forward the request */
+ netmap_update_config(hwna);
+ /* swap the results */
+ *txr = hwna->num_rx_rings;
+ *txd = hwna->num_rx_desc;
+ *rxr = hwna->num_tx_rings;
+ *rxd = hwna->num_tx_desc;
+
+ return 0;
+}
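+
+/*
+ * Editor's note: a hypothetical example of the swap above. If the
+ * underlying NIC exposes 4 tx and 2 rx rings, the bridge wrapper reports
+ * 2 tx and 4 rx rings, since frames received by the NIC are transmitted
+ * into the switch and vice versa.
+ */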
+
+
+static int
+netmap_bwrap_krings_create(struct netmap_adapter *na)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+ struct netmap_adapter *hostna = &bna->host.up;
+ int error;
+
+ ND("%s", NM_IFPNAME(na->ifp));
+
+ error = netmap_vp_krings_create(na);
+ if (error)
+ return error;
+
+ error = hwna->nm_krings_create(hwna);
+ if (error) {
+ netmap_vp_krings_delete(na);
+ return error;
+ }
+
+ if (na->na_flags & NAF_HOST_RINGS) {
+ hostna->tx_rings = na->tx_rings + na->num_tx_rings;
+ hostna->rx_rings = na->rx_rings + na->num_rx_rings;
+ }
+
+ return 0;
+}
+
+
+static void
+netmap_bwrap_krings_delete(struct netmap_adapter *na)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+
+ ND("%s", NM_IFPNAME(na->ifp));
+
+ hwna->nm_krings_delete(hwna);
+ netmap_vp_krings_delete(na);
+}
+
+
+/* notify method for the bridge-->hwna direction */
+static int
+netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
+{
+ struct netmap_bwrap_adapter *bna =
+ (struct netmap_bwrap_adapter *)na;
+ struct netmap_adapter *hwna = bna->hwna;
+ struct netmap_kring *kring, *hw_kring;
+ struct netmap_ring *ring;
+ u_int lim;
+ int error = 0;
+
+ if (tx == NR_TX)
+ return EINVAL;
+
+ kring = &na->rx_rings[ring_n];
+ hw_kring = &hwna->tx_rings[ring_n];
+ ring = kring->ring;
+ lim = kring->nkr_num_slots - 1;
+
+ if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
+ return 0;
+ mtx_lock(&kring->q_lock);
+ /* first step: simulate a user wakeup on the rx ring */
+ netmap_vp_rxsync(na, ring_n, flags);
+ ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
+ NM_IFPNAME(na->ifp), ring_n,
+ kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
+ ring->head, ring->cur, ring->tail,
+ hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
+ /* second step: the simulated user consumes all new packets */
+ ring->head = ring->cur = ring->tail;
+
+ /* third step: the new packets are sent on the tx ring
+ * (which is actually the same ring)
+ */
+ /* set tail to what the hw expects */
+ ring->tail = hw_kring->rtail;
+ nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
+ error = hw_kring->nm_sync(hw_kring, flags);
+
+ /* fourth step: now we are back on the rx ring */
+ /* claim ownership on all hw owned bufs */
+ ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
+ ring->tail = kring->rtail; /* restore saved value of tail, for safety */
+
+ /* fifth step: the user goes to sleep again, causing another rxsync */
+ netmap_vp_rxsync(na, ring_n, flags);
+ ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
+ NM_IFPNAME(na->ifp), ring_n,
+ kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
+ ring->head, ring->cur, ring->tail,
+ hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
+ mtx_unlock(&kring->q_lock);
+ return error;
+}
+
+
+static int
+netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
+{
+ struct netmap_bwrap_adapter *bna = na->na_private;
+ struct netmap_adapter *port_na = &bna->up.up;
+ if (tx == NR_TX || ring_n != 0)
+ return EINVAL;
+ return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
+}
+
+
+/* attach a bridge wrapper to the 'real' device */
+static int
+netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
+{
+ struct netmap_bwrap_adapter *bna;
+ struct netmap_adapter *na;
+ struct netmap_adapter *hwna = NA(real);
+ struct netmap_adapter *hostna;
+ int error;
+
+
+ bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (bna == NULL)
+ return ENOMEM;
+
+ na = &bna->up.up;
+ na->ifp = fake;
+ /* fill the ring data for the bwrap adapter with rx/tx meanings
+ * swapped. The real cross-linking will be done during register,
+ * when all the krings will have been created.
+ */
+ na->num_rx_rings = hwna->num_tx_rings;
+ na->num_tx_rings = hwna->num_rx_rings;
+ na->num_tx_desc = hwna->num_rx_desc;
+ na->num_rx_desc = hwna->num_tx_desc;
+ na->nm_dtor = netmap_bwrap_dtor;
+ na->nm_register = netmap_bwrap_register;
+ // na->nm_txsync = netmap_bwrap_txsync;
+ // na->nm_rxsync = netmap_bwrap_rxsync;
+ na->nm_config = netmap_bwrap_config;
+ na->nm_krings_create = netmap_bwrap_krings_create;
+ na->nm_krings_delete = netmap_bwrap_krings_delete;
+ na->nm_notify = netmap_bwrap_notify;
+ na->nm_mem = hwna->nm_mem;
+ na->na_private = na; /* prevent NIOCREGIF */
+ bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
+
+ bna->hwna = hwna;
+ netmap_adapter_get(hwna);
+ hwna->na_private = bna; /* weak reference */
+
+ if (hwna->na_flags & NAF_HOST_RINGS) {
+ na->na_flags |= NAF_HOST_RINGS;
+ hostna = &bna->host.up;
+ hostna->ifp = hwna->ifp;
+ hostna->num_tx_rings = 1;
+ hostna->num_tx_desc = hwna->num_rx_desc;
+ hostna->num_rx_rings = 1;
+ hostna->num_rx_desc = hwna->num_tx_desc;
+ // hostna->nm_txsync = netmap_bwrap_host_txsync;
+ // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
+ hostna->nm_notify = netmap_bwrap_host_notify;
+ hostna->nm_mem = na->nm_mem;
+ hostna->na_private = bna;
+ }
+
+ ND("%s<->%s txr %d txd %d rxr %d rxd %d",
+ fake->if_xname, real->if_xname,
+ na->num_tx_rings, na->num_tx_desc,
+ na->num_rx_rings, na->num_rx_desc);
+
+ error = netmap_attach_common(na);
+ if (error) {
+ netmap_adapter_put(hwna);
+ free(bna, M_DEVBUF);
+ return error;
+ }
+ return 0;
+}
+
+
+void
+netmap_init_bridges(void)
+{
+ int i;
+ bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
+ for (i = 0; i < NM_BRIDGES; i++)
+ BDG_RWINIT(&nm_bridges[i]);
+}
+#endif /* WITH_VALE */