Diffstat (limited to 'sys')
-rw-r--r--  sys/dev/netmap/netmap.c        1371
-rw-r--r--  sys/dev/netmap/netmap_kern.h     38
-rw-r--r--  sys/net/netmap.h                 38
3 files changed, 1128 insertions, 319 deletions
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index b55c338..e1beeae7 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -119,6 +119,9 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "")
int netmap_no_pendintr = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
+int netmap_txsync_retry = 2;
+SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
+ &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
int netmap_drop = 0; /* debugging */
int netmap_flags = 0; /* debug flags */
@@ -128,25 +131,30 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
-#ifdef NM_BRIDGE /* support for netmap bridge */
+#ifdef NM_BRIDGE /* support for netmap virtual switch, called VALE */
/*
- * system parameters.
+ * system parameters (most of them in netmap_kern.h)
+ * NM_NAME prefix for switch port names, default "vale"
+ * NM_BDG_MAXPORTS	number of ports per switch
+ * NM_BRIDGES max number of switches in the system.
+ * XXX should become a sysctl or tunable
*
- * All switched ports have prefix NM_NAME.
- * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap,
- * so a practical upper bound is 64).
- * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet).
+ * Switch ports are named valeX:Y, where valeX is the switch name and Y
+ * is the port name. If Y matches a physical interface name, the port is
+ * connected to a physical device.
+ *
+ * Unlike physical interfaces, switch ports use their own memory region
+ * for rings and buffers.
* The virtual interfaces use per-queue lock instead of core lock.
* In the tx loop, we aggregate traffic in batches to make all operations
* faster. The batch size is NM_BDG_BATCH
*/
-#define NM_NAME "vale" /* prefix for the interface */
-#define NM_BDG_MAXPORTS 16 /* up to 64 ? */
+#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
#define NM_BRIDGE_RINGSIZE 1024 /* in the device */
#define NM_BDG_HASH 1024 /* forwarding table entries */
#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
-#define NM_BRIDGES 4 /* number of bridges */
+#define NM_BRIDGES 8 /* number of bridges */
int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */
@@ -174,14 +182,27 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , "");
#define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount)
#define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount)
-static void bdg_netmap_attach(struct ifnet *ifp);
+static void bdg_netmap_attach(struct netmap_adapter *);
static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
+static int kern_netmap_regif(struct nmreq *nmr);
+
/* per-tx-queue entry */
struct nm_bdg_fwd { /* forwarding entry for a bridge */
void *buf;
- uint64_t dst; /* dst mask */
- uint32_t src; /* src index ? */
- uint16_t len; /* src len */
+ uint32_t ft_dst; /* dst port */
+ uint16_t ft_len; /* src len */
+ uint16_t ft_next; /* next packet to same destination */
+};
+
+/* We need to build a list of buffers going to each destination.
+ * Each buffer is in one entry of struct nm_bdg_fwd, we use ft_next
+ * to build the list, and struct nm_bdg_q below for the queue.
+ * The structure should be compact because we potentially have many
+ * destinations.
+ */
+struct nm_bdg_q {
+ uint16_t bq_head;
+ uint16_t bq_tail;
};
struct nm_hash_ent {
@@ -198,26 +219,78 @@ struct nm_hash_ent {
* The bridge is non blocking on the transmit ports.
*
* bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
*/
struct nm_bridge {
- struct ifnet *bdg_ports[NM_BDG_MAXPORTS];
- int n_ports;
- uint64_t act_ports;
- int freelist; /* first buffer index */
- NM_SELINFO_T si; /* poll/select wait queue */
- NM_LOCK_T bdg_lock; /* protect the selinfo ? */
+ int namelen; /* 0 means free */
- /* the forwarding table, MAC+ports */
- struct nm_hash_ent ht[NM_BDG_HASH];
+ /* XXX what is the proper alignment/layout ? */
+ NM_RWLOCK_T bdg_lock; /* protects bdg_ports */
+ struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
- int namelen; /* 0 means free */
char basename[IFNAMSIZ];
+ /*
+ * The function to decide the destination port.
+	 * It returns either the index of the destination port,
+	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT to
+	 * drop this packet. ring_nr is the source ring index, and the
+ * function may overwrite this value to forward this packet to a
+ * different ring index.
+ * This function must be set by netmap_bdgctl().
+ */
+ bdg_lookup_fn_t nm_bdg_lookup;
+
+ /* the forwarding table, MAC+ports */
+ struct nm_hash_ent ht[NM_BDG_HASH];
};
struct nm_bridge nm_bridges[NM_BRIDGES];
+NM_LOCK_T netmap_bridge_mutex;
-#define BDG_LOCK(b) mtx_lock(&(b)->bdg_lock)
-#define BDG_UNLOCK(b) mtx_unlock(&(b)->bdg_lock)
+/* other OS will have these macros defined in their own glue code. */
+
+#ifdef __FreeBSD__
+#define BDG_LOCK() mtx_lock(&netmap_bridge_mutex)
+#define BDG_UNLOCK() mtx_unlock(&netmap_bridge_mutex)
+#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
+#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
+#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
+#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
+
+/* set/get variables. OS-specific macros may wrap these
+ * assignments into read/write lock or similar
+ */
+#define BDG_SET_VAR(lval, p) (lval = p)
+#define BDG_GET_VAR(lval) (lval)
+#define BDG_FREE(p) free(p, M_DEVBUF)
+#endif /* __FreeBSD__ */
+
+static __inline int
+nma_is_vp(struct netmap_adapter *na)
+{
+ return na->nm_register == bdg_netmap_reg;
+}
+static __inline int
+nma_is_host(struct netmap_adapter *na)
+{
+ return na->nm_register == NULL;
+}
+static __inline int
+nma_is_hw(struct netmap_adapter *na)
+{
+ /* In case of sw adapter, nm_register is NULL */
+ return !nma_is_vp(na) && !nma_is_host(na);
+}
+
+/*
+ * Regarding holding a NIC, if the NIC is owned by the kernel
+ * (i.e., bridge), neither another bridge nor user can use it;
+ * if the NIC is owned by a user, only users can share it.
+ * Evaluation must be done under NMA_LOCK().
+ */
+#define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
+#define NETMAP_OWNED_BY_ANY(ifp) \
+ (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
/*
* NA(ifp)->bdg_port port index
@@ -245,15 +318,16 @@ pkt_copy(void *_src, void *_dst, int l)
}
}
+
/*
* locate a bridge among the existing ones.
* a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
* We assume that this is called with a name of at least NM_NAME chars.
*/
static struct nm_bridge *
-nm_find_bridge(const char *name)
+nm_find_bridge(const char *name, int create)
{
- int i, l, namelen, e;
+ int i, l, namelen;
struct nm_bridge *b = NULL;
namelen = strlen(NM_NAME); /* base length */
@@ -268,29 +342,94 @@ nm_find_bridge(const char *name)
namelen = IFNAMSIZ;
ND("--- prefix is '%.*s' ---", namelen, name);
- /* use the first entry for locking */
- BDG_LOCK(nm_bridges); // XXX do better
- for (e = -1, i = 1; i < NM_BRIDGES; i++) {
- b = nm_bridges + i;
- if (b->namelen == 0)
- e = i; /* record empty slot */
- else if (strncmp(name, b->basename, namelen) == 0) {
+ BDG_LOCK();
+ /* lookup the name, remember empty slot if there is one */
+ for (i = 0; i < NM_BRIDGES; i++) {
+ struct nm_bridge *x = nm_bridges + i;
+
+ if (x->namelen == 0) {
+ if (create && b == NULL)
+ b = x; /* record empty slot */
+ } else if (x->namelen != namelen) {
+ continue;
+ } else if (strncmp(name, x->basename, namelen) == 0) {
ND("found '%.*s' at %d", namelen, name, i);
+ b = x;
break;
}
}
- if (i == NM_BRIDGES) { /* all full */
- if (e == -1) { /* no empty slot */
- b = NULL;
- } else {
- b = nm_bridges + e;
- strncpy(b->basename, name, namelen);
- b->namelen = namelen;
- }
+ if (i == NM_BRIDGES && b) { /* name not found, can create entry */
+ strncpy(b->basename, name, namelen);
+ b->namelen = namelen;
+ /* set the default function */
+ b->nm_bdg_lookup = netmap_bdg_learning;
+ /* reset the MAC address table */
+ bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
}
- BDG_UNLOCK(nm_bridges);
+ BDG_UNLOCK();
return b;
}
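To make the naming rules concrete, a few hypothetical examples of how names resolve here and in get_ifp() below (the interface names are illustrative):

	/*
	 * "vale0:1"	bridge "vale0", virtual port "1" (created on demand)
	 * "vale0:em0"	bridge "vale0", physical NIC em0 attached to it
	 * "em0"	no NM_NAME prefix: a regular netmap interface
	 */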
+
+
+/*
+ * Free the forwarding tables for rings attached to switch ports.
+ */
+static void
+nm_free_bdgfwd(struct netmap_adapter *na)
+{
+ int nrings, i;
+ struct netmap_kring *kring;
+
+ nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+ kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+ for (i = 0; i < nrings; i++) {
+ if (kring[i].nkr_ft) {
+ free(kring[i].nkr_ft, M_DEVBUF);
+ kring[i].nkr_ft = NULL; /* protect from freeing twice */
+ }
+ }
+ if (nma_is_hw(na))
+ nm_free_bdgfwd(SWNA(na->ifp));
+}
+
+
+/*
+ * Allocate the forwarding tables for the rings attached to the bridge ports.
+ */
+static int
+nm_alloc_bdgfwd(struct netmap_adapter *na)
+{
+ int nrings, l, i, num_dstq;
+ struct netmap_kring *kring;
+
+ /* all port:rings + broadcast */
+ num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
+ l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH;
+ l += sizeof(struct nm_bdg_q) * num_dstq;
+ l += sizeof(uint16_t) * NM_BDG_BATCH;
+
+ nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+ kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+ for (i = 0; i < nrings; i++) {
+ struct nm_bdg_fwd *ft;
+ struct nm_bdg_q *dstq;
+ int j;
+
+ ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!ft) {
+ nm_free_bdgfwd(na);
+ return ENOMEM;
+ }
+ dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH);
+ for (j = 0; j < num_dstq; j++)
+ dstq[j].bq_head = dstq[j].bq_tail = NM_BDG_BATCH;
+ kring[i].nkr_ft = ft;
+ }
+ if (nma_is_hw(na))
+ nm_alloc_bdgfwd(SWNA(na->ifp));
+ return 0;
+}
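For reference, the single allocation made above is carved into three back-to-back regions per ring; a sketch of the layout implied by the size computation (element counts, not bytes):

	/*
	 * ft[NM_BDG_BATCH]		frames collected in one batch
	 * dstq[NM_BDG_MAXPORTS *
	 *      NM_BDG_MAXRINGS + 1]	one queue per port:ring, plus broadcast
	 * dsts[NM_BDG_BATCH]		indices of the queues active in this batch
	 */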
+
#endif /* NM_BRIDGE */
@@ -413,20 +552,11 @@ netmap_dtor_locked(void *data)
if (netmap_verbose)
D("deleting last instance for %s", ifp->if_xname);
/*
- * there is a race here with *_netmap_task() and
- * netmap_poll(), which don't run under NETMAP_REG_LOCK.
- * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
- * (aka NETMAP_DELETING(na)) are a unique marker that the
- * device is dying.
- * Before destroying stuff we sleep a bit, and then complete
- * the job. NIOCREG should realize the condition and
- * loop until they can continue; the other routines
- * should check the condition at entry and quit if
- * they cannot run.
+ * (TO CHECK) This function is only called
+ * when the last reference to this file descriptor goes
+ * away. This means we cannot have any pending poll()
+ * or interrupt routine operating on the structure.
*/
- na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
- tsleep(na, 0, "NIOCUNREG", 4);
- na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
/* Wake up any sleeping threads. netmap_poll will
* then return POLLERR
@@ -437,6 +567,9 @@ netmap_dtor_locked(void *data)
selwakeuppri(&na->rx_rings[i].si, PI_NET);
selwakeuppri(&na->tx_si, PI_NET);
selwakeuppri(&na->rx_si, PI_NET);
+#ifdef NM_BRIDGE
+ nm_free_bdgfwd(na);
+#endif /* NM_BRIDGE */
/* release all buffers */
for (i = 0; i < na->num_tx_rings + 1; i++) {
struct netmap_ring *ring = na->tx_rings[i].ring;
@@ -458,49 +591,81 @@ netmap_dtor_locked(void *data)
/* knlist_destroy(&na->tx_si.si_note); */
/* knlist_destroy(&na->rx_si.si_note); */
netmap_free_rings(na);
- wakeup(na);
+ if (nma_is_hw(na))
+ SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
}
netmap_if_free(nifp);
}
+
+/* we assume netmap adapter exists */
static void
nm_if_rele(struct ifnet *ifp)
{
#ifndef NM_BRIDGE
if_rele(ifp);
#else /* NM_BRIDGE */
- int i, full;
+ int i, full = 0, is_hw;
struct nm_bridge *b;
+ struct netmap_adapter *na;
- if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) {
+	/* This function can be called not only for get_ifp()-ed references,
+	 * where netmap capability is guaranteed, but also for non-netmap-capable NICs.
+ */
+ if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
if_rele(ifp);
return;
}
if (!DROP_BDG_REF(ifp))
return;
- b = ifp->if_bridge;
- BDG_LOCK(nm_bridges);
- BDG_LOCK(b);
+
+ na = NA(ifp);
+ b = na->na_bdg;
+ is_hw = nma_is_hw(na);
+
+ BDG_WLOCK(b);
ND("want to disconnect %s from the bridge", ifp->if_xname);
full = 0;
+ /* remove the entry from the bridge, also check
+ * if there are any leftover interfaces
+ * XXX we should optimize this code, e.g. going directly
+ * to na->bdg_port, and having a counter of ports that
+ * are connected. But it is not in a critical path.
+ * In NIC's case, index of sw na is always higher than hw na
+ */
for (i = 0; i < NM_BDG_MAXPORTS; i++) {
- if (b->bdg_ports[i] == ifp) {
- b->bdg_ports[i] = NULL;
- bzero(ifp, sizeof(*ifp));
- free(ifp, M_DEVBUF);
- break;
- }
- else if (b->bdg_ports[i] != NULL)
+ struct netmap_adapter *tmp = BDG_GET_VAR(b->bdg_ports[i]);
+
+ if (tmp == na) {
+ /* disconnect from bridge */
+ BDG_SET_VAR(b->bdg_ports[i], NULL);
+ na->na_bdg = NULL;
+ if (is_hw && SWNA(ifp)->na_bdg) {
+ /* disconnect sw adapter too */
+ int j = SWNA(ifp)->bdg_port;
+ BDG_SET_VAR(b->bdg_ports[j], NULL);
+ SWNA(ifp)->na_bdg = NULL;
+ }
+ } else if (tmp != NULL) {
full = 1;
+ }
}
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
if (full == 0) {
- ND("freeing bridge %d", b - nm_bridges);
+ ND("marking bridge %d as free", b - nm_bridges);
b->namelen = 0;
+ b->nm_bdg_lookup = NULL;
}
- BDG_UNLOCK(nm_bridges);
- if (i == NM_BDG_MAXPORTS)
+ if (na->na_bdg) { /* still attached to the bridge */
D("ouch, cannot find ifp to remove");
+ } else if (is_hw) {
+ if_rele(ifp);
+ } else {
+ bzero(na, sizeof(*na));
+ free(na, M_DEVBUF);
+ bzero(ifp, sizeof(*ifp));
+ free(ifp, M_DEVBUF);
+ }
#endif /* NM_BRIDGE */
}
@@ -514,9 +679,13 @@ netmap_dtor(void *data)
if (ifp) {
struct netmap_adapter *na = NA(ifp);
+ if (na->na_bdg)
+ BDG_WLOCK(na->na_bdg);
na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
netmap_dtor_locked(data);
na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+ if (na->na_bdg)
+ BDG_WUNLOCK(na->na_bdg);
nm_if_rele(ifp); /* might also destroy *na */
}
@@ -528,6 +697,7 @@ netmap_dtor(void *data)
free(priv, M_DEVBUF);
}
+
#ifdef __FreeBSD__
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -536,8 +706,16 @@ netmap_dtor(void *data)
#include <vm/vm_pager.h>
#include <vm/uma.h>
+/*
+ * In order to track whether pages are still mapped, we hook into
+ * the standard cdev_pager and intercept the constructor and
+ * destructor.
+ * XXX but then ? Do we really use the information ?
+ * Need to investigate.
+ */
static struct cdev_pager_ops saved_cdev_pager_ops;
+
static int
netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
@@ -548,6 +726,7 @@ netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
size, prot, foff, cred, color);
}
+
static void
netmap_dev_pager_dtor(void *handle)
{
@@ -562,6 +741,8 @@ static struct cdev_pager_ops netmap_cdev_pager_ops = {
.cdev_pg_fault = NULL,
};
+
+// XXX check whether we need netmap_mmap_single _and_ netmap_mmap
static int
netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
vm_size_t objsize, vm_object_t *objp, int prot)
@@ -630,6 +811,7 @@ netmap_mmap(__unused struct cdev *dev,
return (*paddr ? 0 : ENOMEM);
}
+
static int
netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
@@ -639,6 +821,7 @@ netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
return 0;
}
+
static int
netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
@@ -677,6 +860,7 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
* might take a while before releasing the buffer.
*/
+
/*
* pass a chain of buffers to the host stack as coming from 'dst'
*/
@@ -701,6 +885,7 @@ struct mbq {
int count;
};
+
/*
* put a copy of the buffers marked NS_FORWARD into an mbuf chain.
* Run from hwcur to cur - reserved
@@ -745,6 +930,7 @@ netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
q->tail = tail;
}
+
/*
* called under main lock to send packets from the host to the NIC
* The host ring has packets from nr_hwcur to (cur - reserved)
@@ -794,6 +980,7 @@ netmap_sw_to_nic(struct netmap_adapter *na)
}
}
+
/*
* netmap_sync_to_host() passes packets up. We are called from a
* system call in user process context, and the only contention
@@ -827,6 +1014,18 @@ netmap_sync_to_host(struct netmap_adapter *na)
netmap_send_up(na->ifp, q.head);
}
+
+/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */
+static int
+netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock)
+{
+ (void)ring_nr;
+ (void)do_lock;
+ netmap_sync_to_host(NA(ifp));
+ return 0;
+}
+
+
/*
* rxsync backend for packets coming from the host stack.
* They have been put in the queue by netmap_start() so we
@@ -881,38 +1080,60 @@ netmap_sync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
* Return ENXIO if the interface does not exist, EINVAL if netmap
* is not supported by the interface.
* If successful, hold a reference.
+ *
+ * While the NIC is attached to a bridge, its reference is managed
+ * at na->na_bdg_refcount using ADD/DROP_BDG_REF(), just as for
+ * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC
+ * is detached from the bridge and the ifp's refcount is dropped
+ * (for virtual ports, this is the point where the ifp is destroyed).
+ *
+ * This function uses if_rele() in error handling, where we want to
+ * avoid detaching the NIC from the bridge. But once the refcount
+ * has been acquired by this function, it must be released with nm_if_rele().
*/
static int
-get_ifp(const char *name, struct ifnet **ifp)
+get_ifp(struct nmreq *nmr, struct ifnet **ifp)
{
+ const char *name = nmr->nr_name;
+ int namelen = strlen(name);
#ifdef NM_BRIDGE
struct ifnet *iter = NULL;
+ int no_prefix = 0;
do {
struct nm_bridge *b;
- int i, l, cand = -1;
+ struct netmap_adapter *na;
+ int i, cand = -1, cand2 = -1;
- if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1))
+ if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
+ no_prefix = 1;
break;
- b = nm_find_bridge(name);
+ }
+		b = nm_find_bridge(name, 1 /* create a new one if none exists */);
if (b == NULL) {
D("no bridges available for '%s'", name);
return (ENXIO);
}
- /* XXX locking */
- BDG_LOCK(b);
+ /* Now we are sure that name starts with the bridge's name */
+ BDG_WLOCK(b);
/* lookup in the local list of ports */
for (i = 0; i < NM_BDG_MAXPORTS; i++) {
- iter = b->bdg_ports[i];
- if (iter == NULL) {
+ na = BDG_GET_VAR(b->bdg_ports[i]);
+ if (na == NULL) {
if (cand == -1)
cand = i; /* potential insert point */
+ else if (cand2 == -1)
+ cand2 = i; /* for host stack */
continue;
}
- if (!strcmp(iter->if_xname, name)) {
+ iter = na->ifp;
+ /* XXX make sure the name only contains one : */
+ if (!strcmp(iter->if_xname, name) /* virtual port */ ||
+ (namelen > b->namelen && !strcmp(iter->if_xname,
+ name + b->namelen + 1)) /* NIC */) {
ADD_BDG_REF(iter);
ND("found existing interface");
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
break;
}
}
@@ -921,23 +1142,73 @@ get_ifp(const char *name, struct ifnet **ifp)
if (cand == -1) {
D("bridge full, cannot create new port");
no_port:
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
*ifp = NULL;
return EINVAL;
}
ND("create new bridge port %s", name);
- /* space for forwarding list after the ifnet */
- l = sizeof(*iter) +
- sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ;
- iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!iter)
- goto no_port;
- strcpy(iter->if_xname, name);
- bdg_netmap_attach(iter);
- b->bdg_ports[cand] = iter;
- iter->if_bridge = b;
+ /*
+ * create a struct ifnet for the new port.
+ * The forwarding table is attached to the kring(s).
+ */
+ /*
+ * try see if there is a matching NIC with this name
+ * (after the bridge's name)
+ */
+ iter = ifunit_ref(name + b->namelen + 1);
+ if (!iter) { /* this is a virtual port */
+ /* Create a temporary NA with arguments, then
+ * bdg_netmap_attach() will allocate the real one
+ * and attach it to the ifp
+ */
+ struct netmap_adapter tmp_na;
+
+ if (nmr->nr_cmd) /* nr_cmd must be for a NIC */
+ goto no_port;
+ bzero(&tmp_na, sizeof(tmp_na));
+ /* bound checking */
+ if (nmr->nr_tx_rings < 1)
+ nmr->nr_tx_rings = 1;
+ if (nmr->nr_tx_rings > NM_BDG_MAXRINGS)
+ nmr->nr_tx_rings = NM_BDG_MAXRINGS;
+ tmp_na.num_tx_rings = nmr->nr_tx_rings;
+ if (nmr->nr_rx_rings < 1)
+ nmr->nr_rx_rings = 1;
+ if (nmr->nr_rx_rings > NM_BDG_MAXRINGS)
+ nmr->nr_rx_rings = NM_BDG_MAXRINGS;
+ tmp_na.num_rx_rings = nmr->nr_rx_rings;
+
+ iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!iter)
+ goto no_port;
+ strcpy(iter->if_xname, name);
+ tmp_na.ifp = iter;
+ /* bdg_netmap_attach creates a struct netmap_adapter */
+ bdg_netmap_attach(&tmp_na);
+ } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
+ /* cannot attach the NIC that any user or another
+ * bridge already holds.
+ */
+ if (NETMAP_OWNED_BY_ANY(iter) || cand2 == -1) {
+ifunit_rele:
+ if_rele(iter); /* don't detach from bridge */
+ goto no_port;
+ }
+ /* bind the host stack to the bridge */
+ if (nmr->nr_arg1 == NETMAP_BDG_HOST) {
+ BDG_SET_VAR(b->bdg_ports[cand2], SWNA(iter));
+ SWNA(iter)->bdg_port = cand2;
+ SWNA(iter)->na_bdg = b;
+ }
+ } else /* not a netmap-capable NIC */
+ goto ifunit_rele;
+ na = NA(iter);
+ na->bdg_port = cand;
+ /* bind the port to the bridge (virtual ports are not active) */
+ BDG_SET_VAR(b->bdg_ports[cand], na);
+ na->na_bdg = b;
ADD_BDG_REF(iter);
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
ND("attaching virtual bridge %p", b);
} while (0);
*ifp = iter;
@@ -949,8 +1220,16 @@ no_port:
/* can do this if the capability exists and if_pspare[0]
* points to the netmap descriptor.
*/
- if (NETMAP_CAPABLE(*ifp))
+ if (NETMAP_CAPABLE(*ifp)) {
+#ifdef NM_BRIDGE
+ /* Users cannot use the NIC attached to a bridge directly */
+ if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
+ if_rele(*ifp); /* don't detach from bridge */
+ return EINVAL;
+ } else
+#endif /* NM_BRIDGE */
return 0; /* valid pointer, we hold the refcount */
+ }
nm_if_rele(*ifp);
return EINVAL; // not NETMAP capable
}
@@ -1059,6 +1338,296 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
return 0;
}
+
+/*
+ * possibly move the interface to netmap-mode.
+ * If success it returns a pointer to netmap_if, otherwise NULL.
+ * This must be called with NMA_LOCK held.
+ */
+static struct netmap_if *
+netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
+ uint16_t ringid, int *err)
+{
+ struct netmap_adapter *na = NA(ifp);
+ struct netmap_if *nifp = NULL;
+ int i, error;
+
+ if (na->na_bdg)
+ BDG_WLOCK(na->na_bdg);
+ na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
+
+ /* ring configuration may have changed, fetch from the card */
+ netmap_update_config(na);
+ priv->np_ifp = ifp; /* store the reference */
+ error = netmap_set_ringid(priv, ringid);
+ if (error)
+ goto out;
+ nifp = netmap_if_new(ifp->if_xname, na);
+ if (nifp == NULL) { /* allocation failed */
+ error = ENOMEM;
+ } else if (ifp->if_capenable & IFCAP_NETMAP) {
+ /* was already set */
+ } else {
+ /* Otherwise set the card in netmap mode
+ * and make it use the shared buffers.
+ */
+ for (i = 0 ; i < na->num_tx_rings + 1; i++)
+ mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
+ MTX_NETWORK_LOCK, MTX_DEF);
+ for (i = 0 ; i < na->num_rx_rings + 1; i++) {
+ mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
+ MTX_NETWORK_LOCK, MTX_DEF);
+ }
+ if (nma_is_hw(na)) {
+ SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
+ SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
+ }
+ error = na->nm_register(ifp, 1); /* mode on */
+#ifdef NM_BRIDGE
+ if (!error)
+ error = nm_alloc_bdgfwd(na);
+#endif /* NM_BRIDGE */
+ if (error) {
+ netmap_dtor_locked(priv);
+ /* nifp is not yet in priv, so free it separately */
+ netmap_if_free(nifp);
+ nifp = NULL;
+ }
+
+ }
+out:
+ *err = error;
+ na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+ if (na->na_bdg)
+ BDG_WUNLOCK(na->na_bdg);
+ return nifp;
+}
+
+
+/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
+static int
+kern_netmap_regif(struct nmreq *nmr)
+{
+ struct ifnet *ifp;
+ struct netmap_if *nifp;
+ struct netmap_priv_d *npriv;
+ int error;
+
+ npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (npriv == NULL)
+ return ENOMEM;
+ error = netmap_get_memory(npriv);
+ if (error) {
+free_exit:
+ bzero(npriv, sizeof(*npriv));
+ free(npriv, M_DEVBUF);
+ return error;
+ }
+
+ NMA_LOCK();
+ error = get_ifp(nmr, &ifp);
+ if (error) { /* no device, or another bridge or user owns the device */
+ NMA_UNLOCK();
+ goto free_exit;
+ } else if (!NETMAP_OWNED_BY_KERN(ifp)) {
+ /* got reference to a virtual port or direct access to a NIC.
+ * perhaps specified no bridge's prefix or wrong NIC's name
+ */
+ error = EINVAL;
+unref_exit:
+ nm_if_rele(ifp);
+ NMA_UNLOCK();
+ goto free_exit;
+ }
+
+ if (nmr->nr_cmd == NETMAP_BDG_DETACH) {
+ if (NA(ifp)->refcount == 0) { /* not registered */
+ error = EINVAL;
+ goto unref_exit;
+ }
+ NMA_UNLOCK();
+
+ netmap_dtor(NA(ifp)->na_kpriv); /* unregister */
+ NA(ifp)->na_kpriv = NULL;
+ nm_if_rele(ifp); /* detach from the bridge */
+ goto free_exit;
+ } else if (NA(ifp)->refcount > 0) { /* already registered */
+ error = EINVAL;
+ goto unref_exit;
+ }
+
+ nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
+ if (!nifp)
+ goto unref_exit;
+ wmb(); // XXX do we need it ?
+ npriv->np_nifp = nifp;
+ NA(ifp)->na_kpriv = npriv;
+ NMA_UNLOCK();
+ D("registered %s to netmap-mode", ifp->if_xname);
+ return 0;
+}
+
+
+/* CORE_LOCK is not necessary */
+static void
+netmap_swlock_wrapper(struct ifnet *dev, int what, u_int queueid)
+{
+ struct netmap_adapter *na = SWNA(dev);
+
+ switch (what) {
+ case NETMAP_TX_LOCK:
+ mtx_lock(&na->tx_rings[queueid].q_lock);
+ break;
+
+ case NETMAP_TX_UNLOCK:
+ mtx_unlock(&na->tx_rings[queueid].q_lock);
+ break;
+
+ case NETMAP_RX_LOCK:
+ mtx_lock(&na->rx_rings[queueid].q_lock);
+ break;
+
+ case NETMAP_RX_UNLOCK:
+ mtx_unlock(&na->rx_rings[queueid].q_lock);
+ break;
+ }
+}
+
+
+/* Initialize the necessary fields of the sw adapter, which is located
+ * right after the hw one. The sw adapter attaches the pair of sw rings
+ * of a netmap-mode NIC. It is always activated and deactivated at the
+ * same time as the hw one, so we need no refcounting on the sw adapter.
+ * Regardless of the NIC's features we use a separate lock, so that anybody
+ * can lock it independently of the hw adapter.
+ * nm_register is left NULL so that nma_is_hw() treats this adapter as FALSE.
+ */
+static void
+netmap_attach_sw(struct ifnet *ifp)
+{
+ struct netmap_adapter *hw_na = NA(ifp);
+ struct netmap_adapter *na = SWNA(ifp);
+
+ na->ifp = ifp;
+ na->separate_locks = 1;
+ na->nm_lock = netmap_swlock_wrapper;
+ na->num_rx_rings = na->num_tx_rings = 1;
+ na->num_tx_desc = hw_na->num_tx_desc;
+ na->num_rx_desc = hw_na->num_rx_desc;
+ na->nm_txsync = netmap_bdg_to_host;
+}
+
+
+/* exported to kernel callers */
+int
+netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
+{
+ struct nm_bridge *b;
+ struct netmap_adapter *na;
+ struct ifnet *iter;
+ char *name = nmr->nr_name;
+ int cmd = nmr->nr_cmd, namelen = strlen(name);
+ int error = 0, i, j;
+
+ switch (cmd) {
+ case NETMAP_BDG_ATTACH:
+ case NETMAP_BDG_DETACH:
+ error = kern_netmap_regif(nmr);
+ break;
+
+ case NETMAP_BDG_LIST:
+ /* this is used to enumerate bridges and ports */
+ if (namelen) { /* look up indexes of bridge and port */
+ if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
+ error = EINVAL;
+ break;
+ }
+ b = nm_find_bridge(name, 0 /* don't create */);
+ if (!b) {
+ error = ENOENT;
+ break;
+ }
+
+ BDG_RLOCK(b);
+ error = ENOENT;
+ for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+ na = BDG_GET_VAR(b->bdg_ports[i]);
+ if (na == NULL)
+ continue;
+ iter = na->ifp;
+ /* the former and the latter identify a
+ * virtual port and a NIC, respectively
+ */
+ if (!strcmp(iter->if_xname, name) ||
+ (namelen > b->namelen &&
+ !strcmp(iter->if_xname,
+ name + b->namelen + 1))) {
+ /* bridge index */
+ nmr->nr_arg1 = b - nm_bridges;
+ nmr->nr_arg2 = i; /* port index */
+ error = 0;
+ break;
+ }
+ }
+ BDG_RUNLOCK(b);
+ } else {
+ /* return the first non-empty entry starting from
+ * bridge nr_arg1 and port nr_arg2.
+ *
+			 * Users can detect the end of the current bridge by
+			 * comparing the new and old values of nr_arg1, and the
+			 * end of all the bridges when error != 0
+ */
+ i = nmr->nr_arg1;
+ j = nmr->nr_arg2;
+
+ for (error = ENOENT; error && i < NM_BRIDGES; i++) {
+ b = nm_bridges + i;
+ BDG_RLOCK(b);
+ for (; j < NM_BDG_MAXPORTS; j++) {
+ na = BDG_GET_VAR(b->bdg_ports[j]);
+ if (na == NULL)
+ continue;
+ iter = na->ifp;
+ nmr->nr_arg1 = i;
+ nmr->nr_arg2 = j;
+ strncpy(name, iter->if_xname, IFNAMSIZ);
+ error = 0;
+ break;
+ }
+ BDG_RUNLOCK(b);
+ j = 0; /* following bridges scan from 0 */
+ }
+ }
+ break;
+
+ case NETMAP_BDG_LOOKUP_REG:
+ /* register a lookup function to the given bridge.
+ * nmr->nr_name may be just bridge's name (including ':'
+ * if it is not just NM_NAME).
+ */
+ if (!func) {
+ error = EINVAL;
+ break;
+ }
+ b = nm_find_bridge(name, 0 /* don't create */);
+ if (!b) {
+ error = EINVAL;
+ break;
+ }
+ BDG_WLOCK(b);
+ b->nm_bdg_lookup = func;
+ BDG_WUNLOCK(b);
+ break;
+ default:
+ D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
+ error = EINVAL;
+ break;
+ }
+ return error;
+}
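As a usage illustration, a hedged userspace sketch of the enumeration protocol described above, assuming the request travels over NIOCGINFO as in the ioctl handler below:

	#include <sys/ioctl.h>
	#include <net/netmap.h>
	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct nmreq nmr;
		int fd = open("/dev/netmap", O_RDWR);

		if (fd < 0)
			err(1, "/dev/netmap");
		memset(&nmr, 0, sizeof(nmr));
		nmr.nr_version = NETMAP_API;
		nmr.nr_cmd = NETMAP_BDG_LIST;
		for (;;) {
			nmr.nr_name[0] = '\0';	/* empty name: scan by indexes */
			if (ioctl(fd, NIOCGINFO, &nmr) != 0)
				break;		/* error != 0: no more ports */
			printf("bridge %u port %u: %s\n",
			    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
			nmr.nr_arg2++;		/* resume from the next port */
		}
		close(fd);
		return 0;
	}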
+
+
/*
* ioctl(2) support for the "netmap" device.
*
@@ -1121,6 +1690,10 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
error = EINVAL;
break;
}
+ if (nmr->nr_cmd == NETMAP_BDG_LIST) {
+ error = netmap_bdg_ctl(nmr, NULL);
+ break;
+ }
/* update configuration */
error = netmap_get_memory(priv);
ND("get_memory returned %d", error);
@@ -1129,15 +1702,19 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
/* memsize is always valid */
nmr->nr_memsize = nm_mem.nm_totalsize;
nmr->nr_offset = 0;
- nmr->nr_rx_rings = nmr->nr_tx_rings = 0;
nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
if (nmr->nr_name[0] == '\0') /* just get memory info */
break;
- error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */
- if (error)
+ /* lock because get_ifp and update_config see na->refcount */
+ NMA_LOCK();
+ error = get_ifp(nmr, &ifp); /* get a refcount */
+ if (error) {
+ NMA_UNLOCK();
break;
+ }
na = NA(ifp); /* retrieve netmap_adapter */
netmap_update_config(na);
+ NMA_UNLOCK();
nmr->nr_rx_rings = na->num_rx_rings;
nmr->nr_tx_rings = na->num_tx_rings;
nmr->nr_rx_slots = na->num_rx_desc;
@@ -1151,6 +1728,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
error = EINVAL;
break;
}
+ /* possibly attach/detach NIC and VALE switch */
+ i = nmr->nr_cmd;
+ if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
+ error = netmap_bdg_ctl(nmr, NULL);
+ break;
+ } else if (i != 0) {
+ D("nr_cmd must be 0 not %d", i);
+ error = EINVAL;
+ break;
+ }
+
/* ensure allocators are ready */
error = netmap_get_memory(priv);
ND("get_memory returned %d", error);
@@ -1161,71 +1749,26 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
NMA_LOCK();
if (priv->np_ifp != NULL) { /* thread already registered */
error = netmap_set_ringid(priv, nmr->nr_ringid);
+unlock_out:
NMA_UNLOCK();
break;
}
/* find the interface and a reference */
- error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
- if (error) {
- NMA_UNLOCK();
- break;
- }
- na = NA(ifp); /* retrieve netmap adapter */
-
- for (i = 10; i > 0; i--) {
- na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
- if (!NETMAP_DELETING(na))
- break;
- na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
- tsleep(na, 0, "NIOCREGIF", hz/10);
- }
- if (i == 0) {
- D("too many NIOCREGIF attempts, give up");
- error = EINVAL;
- nm_if_rele(ifp); /* return the refcount */
- NMA_UNLOCK();
- break;
- }
-
- /* ring configuration may have changed, fetch from the card */
- netmap_update_config(na);
- priv->np_ifp = ifp; /* store the reference */
- error = netmap_set_ringid(priv, nmr->nr_ringid);
+ error = get_ifp(nmr, &ifp); /* keep reference */
if (error)
- goto error;
- nifp = netmap_if_new(nmr->nr_name, na);
- if (nifp == NULL) { /* allocation failed */
- error = ENOMEM;
- } else if (ifp->if_capenable & IFCAP_NETMAP) {
- /* was already set */
- } else {
- /* Otherwise set the card in netmap mode
- * and make it use the shared buffers.
- */
- for (i = 0 ; i < na->num_tx_rings + 1; i++)
- mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", MTX_NETWORK_LOCK, MTX_DEF);
- for (i = 0 ; i < na->num_rx_rings + 1; i++) {
- mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", MTX_NETWORK_LOCK, MTX_DEF);
- }
- error = na->nm_register(ifp, 1); /* mode on */
- if (error) {
- netmap_dtor_locked(priv);
- netmap_if_free(nifp);
- }
+ goto unlock_out;
+ else if (NETMAP_OWNED_BY_KERN(ifp)) {
+ nm_if_rele(ifp);
+ goto unlock_out;
}
-
- if (error) { /* reg. failed, release priv and ref */
-error:
- na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
- nm_if_rele(ifp); /* return the refcount */
+ nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
+ if (!nifp) { /* reg. failed, release priv and ref */
+ nm_if_rele(ifp); /* return the refcount */
priv->np_ifp = NULL;
priv->np_nifp = NULL;
- NMA_UNLOCK();
- break;
+ goto unlock_out;
}
- na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
-
/* the following assignment is a commitment.
* Readers (i.e., poll and *SYNC) check for
* np_nifp != NULL without locking
@@ -1235,6 +1778,7 @@ error:
NMA_UNLOCK();
/* return the offset of the netmap_if object */
+ na = NA(ifp); /* retrieve netmap adapter */
nmr->nr_rx_rings = na->num_rx_rings;
nmr->nr_tx_rings = na->num_tx_rings;
nmr->nr_rx_slots = na->num_rx_desc;
@@ -1314,7 +1858,7 @@ error:
{
struct socket so;
bzero(&so, sizeof(so));
- error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
+ error = get_ifp(nmr, &ifp); /* keep reference */
if (error)
break;
so.so_vnet = ifp->if_vnet;
@@ -1391,7 +1935,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td)
if (priv->np_qfirst == NETMAP_SW_RING) {
if (priv->np_txpoll || want_tx) {
/* push any packets up, then we are always ready */
- kring = &na->tx_rings[lim_tx];
netmap_sync_to_host(na);
revents |= want_tx;
}
@@ -1600,6 +2143,7 @@ flush_tx:
/*------- driver support routines ------*/
+
/*
* default lock wrapper.
*/
@@ -1661,10 +2205,12 @@ netmap_attach(struct netmap_adapter *arg, int num_queues)
{
struct netmap_adapter *na = NULL;
struct ifnet *ifp = arg ? arg->ifp : NULL;
+ int len;
if (arg == NULL || ifp == NULL)
goto fail;
- na = malloc(sizeof(*na), M_DEVBUF, M_NOWAIT | M_ZERO);
+ len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2;
+ na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
if (na == NULL)
goto fail;
WNA(ifp) = na;
@@ -1680,19 +2226,20 @@ netmap_attach(struct netmap_adapter *arg, int num_queues)
ND("using default locks for %s", ifp->if_xname);
na->nm_lock = netmap_lock_wrapper;
}
-
#ifdef linux
- if (!ifp->netdev_ops) {
- D("ouch, we cannot override netdev_ops");
- goto fail;
+ if (ifp->netdev_ops) {
+ ND("netdev_ops %p", ifp->netdev_ops);
+ /* prepare a clone of the netdev ops */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
+ na->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
+#else
+ na->nm_ndo = *ifp->netdev_ops;
+#endif
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
- /* if needed, prepare a clone of the entire netdev ops */
- na->nm_ndo = *ifp->netdev_ops;
-#endif /* 2.6.28 and above */
na->nm_ndo.ndo_start_xmit = linux_netmap_start;
-#endif /* linux */
-
+#endif
+ if (!nma_is_vp(arg))
+ netmap_attach_sw(ifp);
D("success for %s", ifp->if_xname);
return 0;
@@ -1727,6 +2274,35 @@ netmap_detach(struct ifnet *ifp)
}
+int
+nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na, u_int ring_nr);
+
+/* we don't need to lock ourselves */
+static int
+bdg_netmap_start(struct ifnet *ifp, struct mbuf *m)
+{
+ struct netmap_adapter *na = SWNA(ifp);
+ struct nm_bdg_fwd *ft = na->rx_rings[0].nkr_ft;
+ char *buf = NMB(&na->rx_rings[0].ring->slot[0]);
+ u_int len = MBUF_LEN(m);
+
+ if (!na->na_bdg) /* SWNA is not configured to be attached */
+ return EBUSY;
+ m_copydata(m, 0, len, buf);
+ ft->ft_len = len;
+ ft->buf = buf;
+ nm_bdg_flush(ft, 1, na, 0);
+
+	/* release the mbuf in both the success and failure cases. As an
+ * alternative, put the mbuf in a free list and free the list
+ * only when really necessary.
+ */
+ m_freem(m);
+
+ return (0);
+}
+
+
/*
* Intercept packets from the network stack and pass them
* to netmap as incoming packets on the 'software' ring.
@@ -1750,6 +2326,9 @@ netmap_start(struct ifnet *ifp, struct mbuf *m)
m_freem(m);
return EINVAL;
}
+ if (na->na_bdg)
+ return bdg_netmap_start(ifp, m);
+
na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
if (kring->nr_hwavail >= lim) {
if (netmap_verbose)
@@ -1844,6 +2423,73 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n,
}
+/* returns the next position in the ring */
+static int
+nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr,
+ struct netmap_kring *kring, u_int end)
+{
+ struct netmap_ring *ring = kring->ring;
+ struct nm_bdg_fwd *ft = kring->nkr_ft;
+ u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
+ u_int ft_i = 0; /* start from 0 */
+
+ for (; likely(j != end); j = unlikely(j == lim) ? 0 : j+1) {
+ struct netmap_slot *slot = &ring->slot[j];
+ int len = ft[ft_i].ft_len = slot->len;
+ char *buf = ft[ft_i].buf = NMB(slot);
+
+ prefetch(buf);
+ if (unlikely(len < 14))
+ continue;
+ if (unlikely(++ft_i == netmap_bridge))
+ ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+ }
+ if (ft_i)
+ ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
+ return j;
+}
+
+
+/*
+ * Pass packets from nic to the bridge. Must be called with
+ * proper locks on the source interface.
+ * Note, no user process can access this NIC so we can ignore
+ * the info in the 'ring'.
+ */
+static void
+netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr)
+{
+ struct netmap_adapter *na = NA(ifp);
+ struct netmap_kring *kring = &na->rx_rings[ring_nr];
+ struct netmap_ring *ring = kring->ring;
+ int j, k, lim = kring->nkr_num_slots - 1;
+
+ /* fetch packets that have arrived */
+ na->nm_rxsync(ifp, ring_nr, 0);
+ /* XXX we don't count reserved, but it should be 0 */
+ j = kring->nr_hwcur;
+ k = j + kring->nr_hwavail;
+ if (k > lim)
+ k -= lim + 1;
+ if (k == j && netmap_verbose) {
+ D("how strange, interrupt with no packets on %s",
+ ifp->if_xname);
+ return;
+ }
+
+ j = nm_bdg_preflush(na, ring_nr, kring, k);
+
+ /* we consume everything, but we cannot update kring directly
+ * because the nic may have destroyed the info in the NIC ring.
+ * So we need to call rxsync again to restore it.
+ */
+ ring->cur = j;
+ ring->avail = 0;
+ na->nm_rxsync(ifp, ring_nr, 0);
+ return;
+}
+
+
/*
* Default functions to handle rx/tx interrupts
* we have 4 cases:
@@ -1867,7 +2513,7 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done)
struct netmap_adapter *na;
struct netmap_kring *r;
NM_SELINFO_T *main_wq;
- int locktype, unlocktype, lock;
+ int locktype, unlocktype, nic_to_bridge, lock;
if (!(ifp->if_capenable & IFCAP_NETMAP))
return 0;
@@ -1888,6 +2534,8 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done)
r = na->rx_rings + q;
r->nr_kflags |= NKR_PENDINTR;
main_wq = (na->num_rx_rings > 1) ? &na->rx_si : NULL;
+ /* set a flag if the NIC is attached to a VALE switch */
+ nic_to_bridge = (na->na_bdg != NULL);
locktype = NETMAP_RX_LOCK;
unlocktype = NETMAP_RX_UNLOCK;
} else { /* TX path */
@@ -1896,15 +2544,23 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done)
r = na->tx_rings + q;
main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL;
work_done = &q; /* dummy */
+ nic_to_bridge = 0;
locktype = NETMAP_TX_LOCK;
unlocktype = NETMAP_TX_UNLOCK;
}
if (na->separate_locks) {
if (!(lock & NETMAP_LOCKED_ENTER))
na->nm_lock(ifp, locktype, q);
- selwakeuppri(&r->si, PI_NET);
+ /* If a NIC is attached to a bridge, flush packets
+ * (and no need to wakeup anyone). Otherwise, wakeup
+ * possible processes waiting for packets.
+ */
+ if (nic_to_bridge)
+ netmap_nic_to_bdg(ifp, q);
+ else
+ selwakeuppri(&r->si, PI_NET);
na->nm_lock(ifp, unlocktype, q);
- if (main_wq) {
+ if (main_wq && !nic_to_bridge) {
na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
selwakeuppri(main_wq, PI_NET);
na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
@@ -1915,9 +2571,13 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done)
} else {
if (!(lock & NETMAP_LOCKED_ENTER))
na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
- selwakeuppri(&r->si, PI_NET);
- if (main_wq)
- selwakeuppri(main_wq, PI_NET);
+ if (nic_to_bridge)
+ netmap_nic_to_bdg(ifp, q);
+ else {
+ selwakeuppri(&r->si, PI_NET);
+ if (main_wq)
+ selwakeuppri(main_wq, PI_NET);
+ }
if (!(lock & NETMAP_LOCKED_EXIT))
na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
}
@@ -1928,6 +2588,7 @@ netmap_rx_irq(struct ifnet *ifp, int q, int *work_done)
#ifdef linux /* linux-specific routines */
+
/*
* Remap linux arguments into the FreeBSD call.
* - pwait is the poll table, passed as 'dev';
@@ -1950,6 +2611,7 @@ linux_netmap_poll(struct file * file, struct poll_table_struct *pwait)
return netmap_poll((void *)pwait, events, (void *)file);
}
+
static int
linux_netmap_mmap(struct file *f, struct vm_area_struct *vma)
{
@@ -2019,6 +2681,7 @@ done:
return 0;
}
+
static netdev_tx_t
linux_netmap_start(struct sk_buff *skb, struct net_device *dev)
{
@@ -2059,6 +2722,7 @@ netmap_release(struct inode *inode, struct file *file)
return (0);
}
+
static int
linux_netmap_open(struct inode *inode, struct file *file)
{
@@ -2075,7 +2739,9 @@ linux_netmap_open(struct inode *inode, struct file *file)
return (0);
}
+
static struct file_operations netmap_fops = {
+ .owner = THIS_MODULE,
.open = linux_netmap_open,
.mmap = linux_netmap_mmap,
LIN_IOCTL_NAME = linux_netmap_ioctl,
@@ -2083,6 +2749,7 @@ static struct file_operations netmap_fops = {
.release = netmap_release,
};
+
static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */
MISC_DYNAMIC_MINOR,
"netmap",
@@ -2092,6 +2759,7 @@ static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */
static int netmap_init(void);
static void netmap_fini(void);
+
/* Errors have negative values on linux */
static int linux_netmap_init(void)
{
@@ -2111,6 +2779,8 @@ EXPORT_SYMBOL(netmap_reset); // ring init routines
EXPORT_SYMBOL(netmap_buf_size);
EXPORT_SYMBOL(netmap_rx_irq); // default irq handler
EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away
+EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine
+EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function
MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/");
@@ -2119,6 +2789,7 @@ MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
#else /* __FreeBSD__ */
+
static struct cdevsw netmap_cdevsw = {
.d_version = D_VERSION,
.d_name = "netmap",
@@ -2180,159 +2851,241 @@ nm_bridge_rthash(const uint8_t *addr)
static int
bdg_netmap_reg(struct ifnet *ifp, int onoff)
{
- int i, err = 0;
- struct nm_bridge *b = ifp->if_bridge;
+ // struct nm_bridge *b = NA(ifp)->na_bdg;
- BDG_LOCK(b);
+ /* the interface is already attached to the bridge,
+ * so we only need to toggle IFCAP_NETMAP.
+ * Locking is not necessary (we are already under
+ * NMA_LOCK, and the port is not in use during this call).
+ */
+ /* BDG_WLOCK(b); */
if (onoff) {
- /* the interface must be already in the list.
- * only need to mark the port as active
- */
- ND("should attach %s to the bridge", ifp->if_xname);
- for (i=0; i < NM_BDG_MAXPORTS; i++)
- if (b->bdg_ports[i] == ifp)
- break;
- if (i == NM_BDG_MAXPORTS) {
- D("no more ports available");
- err = EINVAL;
- goto done;
- }
- ND("setting %s in netmap mode", ifp->if_xname);
ifp->if_capenable |= IFCAP_NETMAP;
- NA(ifp)->bdg_port = i;
- b->act_ports |= (1<<i);
- b->bdg_ports[i] = ifp;
} else {
- /* should be in the list, too -- remove from the mask */
- ND("removing %s from netmap mode", ifp->if_xname);
ifp->if_capenable &= ~IFCAP_NETMAP;
- i = NA(ifp)->bdg_port;
- b->act_ports &= ~(1<<i);
}
-done:
- BDG_UNLOCK(b);
- return err;
+ /* BDG_WUNLOCK(b); */
+ return 0;
}
-static int
-nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct ifnet *ifp)
+/*
+ * Lookup function for a learning bridge.
+ * Update the hash table with the source address,
+ * and then returns the destination port index, and the
+ * ring in *dst_ring (at the moment, always use ring 0)
+ */
+u_int
+netmap_bdg_learning(char *buf, u_int len, uint8_t *dst_ring,
+ struct netmap_adapter *na)
{
- int i, ifn;
- uint64_t all_dst, dst;
+ struct nm_hash_ent *ht = na->na_bdg->ht;
uint32_t sh, dh;
- uint64_t mysrc = 1 << NA(ifp)->bdg_port;
+ u_int dst, mysrc = na->bdg_port;
uint64_t smac, dmac;
- struct netmap_slot *slot;
- struct nm_bridge *b = ifp->if_bridge;
- ND("prepare to send %d packets, act_ports 0x%x", n, b->act_ports);
- /* only consider valid destinations */
- all_dst = (b->act_ports & ~mysrc);
- /* first pass: hash and find destinations */
+ dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
+ smac = le64toh(*(uint64_t *)(buf + 4));
+ smac >>= 16;
+
+ /*
+	 * The hash is somewhat expensive; there might be some
+ * worthwhile optimizations here.
+ */
+ if ((buf[6] & 1) == 0) { /* valid src */
+ uint8_t *s = buf+6;
+ sh = nm_bridge_rthash(buf+6); // XXX hash of source
+ /* update source port forwarding entry */
+ ht[sh].mac = smac; /* XXX expire ? */
+ ht[sh].ports = mysrc;
+ if (netmap_verbose)
+ D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
+ s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
+ }
+ dst = NM_BDG_BROADCAST;
+ if ((buf[0] & 1) == 0) { /* unicast */
+ dh = nm_bridge_rthash(buf); // XXX hash of dst
+ if (ht[dh].mac == dmac) { /* found dst */
+ dst = ht[dh].ports;
+ }
+ /* XXX otherwise return NM_BDG_UNKNOWN ? */
+ }
+ *dst_ring = 0;
+ return dst;
+}
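The 48-bit address extraction at the top of this function is compact enough to deserve a gloss (a reading of the code above, not new behavior):

	/*
	 * dmac: le64toh(*(uint64_t *)buf) reads buf[0..7] little-endian;
	 *       masking with 0xffffffffffff keeps buf[0..5], the dst MAC.
	 * smac: reading 8 bytes at buf+4 picks up buf[4..11]; shifting
	 *       right by 16 drops buf[4..5] and leaves buf[6..11], the
	 *       48-bit source MAC.
	 */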
+
+
+/*
+ * This flush routine supports only unicast and broadcast but a large
+ * number of ports, and lets us replace the learn and dispatch functions.
+ */
+int
+nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct netmap_adapter *na,
+ u_int ring_nr)
+{
+ struct nm_bdg_q *dst_ents, *brddst;
+ uint16_t num_dsts = 0, *dsts;
+ struct nm_bridge *b = na->na_bdg;
+ u_int i, me = na->bdg_port;
+
+ dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH);
+ dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
+
+ BDG_RLOCK(b);
+
+ /* first pass: find a destination */
for (i = 0; likely(i < n); i++) {
uint8_t *buf = ft[i].buf;
- dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
- smac = le64toh(*(uint64_t *)(buf + 4));
- smac >>= 16;
- if (unlikely(netmap_verbose)) {
- uint8_t *s = buf+6, *d = buf;
- D("%d len %4d %02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x",
- i,
- ft[i].len,
- s[0], s[1], s[2], s[3], s[4], s[5],
- d[0], d[1], d[2], d[3], d[4], d[5]);
+ uint8_t dst_ring = ring_nr;
+ uint16_t dst_port, d_i;
+ struct nm_bdg_q *d;
+
+ dst_port = b->nm_bdg_lookup(buf, ft[i].ft_len, &dst_ring, na);
+ if (dst_port == NM_BDG_NOPORT) {
+			continue; /* this packet is to be dropped */
+ } else if (unlikely(dst_port > NM_BDG_MAXPORTS)) {
+ continue;
+ } else if (dst_port == NM_BDG_BROADCAST) {
+ dst_ring = 0; /* broadcasts always go to ring 0 */
+ } else if (unlikely(dst_port == me ||
+ !BDG_GET_VAR(b->bdg_ports[dst_port]))) {
+ continue;
}
- /*
- * The hash is somewhat expensive, there might be some
- * worthwhile optimizations here.
- */
- if ((buf[6] & 1) == 0) { /* valid src */
- uint8_t *s = buf+6;
- sh = nm_bridge_rthash(buf+6); // XXX hash of source
- /* update source port forwarding entry */
- b->ht[sh].mac = smac; /* XXX expire ? */
- b->ht[sh].ports = mysrc;
- if (netmap_verbose)
- D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
- s[0], s[1], s[2], s[3], s[4], s[5], NA(ifp)->bdg_port);
+
+ /* get a position in the scratch pad */
+ d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
+ d = dst_ents + d_i;
+ if (d->bq_head == NM_BDG_BATCH) { /* new destination */
+ d->bq_head = d->bq_tail = i;
+ /* remember this position to be scanned later */
+ if (dst_port != NM_BDG_BROADCAST)
+ dsts[num_dsts++] = d_i;
}
- dst = 0;
- if ( (buf[0] & 1) == 0) { /* unicast */
- uint8_t *d = buf;
- dh = nm_bridge_rthash(buf); // XXX hash of dst
- if (b->ht[dh].mac == dmac) { /* found dst */
- dst = b->ht[dh].ports;
- if (netmap_verbose)
- D("dst %02x:%02x:%02x:%02x:%02x:%02x to port %x",
- d[0], d[1], d[2], d[3], d[4], d[5], (uint32_t)(dst >> 16));
- }
+ ft[d->bq_tail].ft_next = i;
+ d->bq_tail = i;
+ }
+
+ /* if there is a broadcast, set ring 0 of all ports to be scanned
+	 * XXX This could be optimized by recording the highest index of active
+	 * ports.
+ */
+ brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
+ if (brddst->bq_head != NM_BDG_BATCH) {
+ for (i = 0; likely(i < NM_BDG_MAXPORTS); i++) {
+ uint16_t d_i = i * NM_BDG_MAXRINGS;
+ if (unlikely(i == me) || !BDG_GET_VAR(b->bdg_ports[i]))
+ continue;
+ else if (dst_ents[d_i].bq_head == NM_BDG_BATCH)
+ dsts[num_dsts++] = d_i;
}
- if (dst == 0)
- dst = all_dst;
- dst &= all_dst; /* only consider valid ports */
- if (unlikely(netmap_verbose))
- D("pkt goes to ports 0x%x", (uint32_t)dst);
- ft[i].dst = dst;
- }
-
- /* second pass, scan interfaces and forward */
- all_dst = (b->act_ports & ~mysrc);
- for (ifn = 0; all_dst; ifn++) {
- struct ifnet *dst_ifp = b->bdg_ports[ifn];
- struct netmap_adapter *na;
+ }
+
+ /* second pass: scan destinations (XXX will be modular somehow) */
+ for (i = 0; i < num_dsts; i++) {
+ struct ifnet *dst_ifp;
+ struct netmap_adapter *dst_na;
struct netmap_kring *kring;
struct netmap_ring *ring;
- int j, lim, sent, locked;
-
- if (!dst_ifp)
+ u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next;
+ int howmany, retry = netmap_txsync_retry;
+ struct nm_bdg_q *d;
+
+ d_i = dsts[i];
+ d = dst_ents + d_i;
+ dst_na = BDG_GET_VAR(b->bdg_ports[d_i/NM_BDG_MAXRINGS]);
+ /* protect from the lookup function returning an inactive
+ * destination port
+ */
+ if (unlikely(dst_na == NULL))
continue;
- ND("scan port %d %s", ifn, dst_ifp->if_xname);
- dst = 1 << ifn;
- if ((dst & all_dst) == 0) /* skip if not set */
+ else if (dst_na->na_flags & NAF_SW_ONLY)
continue;
- all_dst &= ~dst; /* clear current node */
- na = NA(dst_ifp);
-
- ring = NULL;
- kring = NULL;
- lim = sent = locked = 0;
- /* inside, scan slots */
- for (i = 0; likely(i < n); i++) {
- if ((ft[i].dst & dst) == 0)
- continue; /* not here */
- if (!locked) {
- kring = &na->rx_rings[0];
- ring = kring->ring;
- lim = kring->nkr_num_slots - 1;
- na->nm_lock(dst_ifp, NETMAP_RX_LOCK, 0);
- locked = 1;
- }
- if (unlikely(kring->nr_hwavail >= lim)) {
- if (netmap_verbose)
- D("rx ring full on %s", ifp->if_xname);
- break;
- }
+ dst_ifp = dst_na->ifp;
+ /*
+ * The interface may be in !netmap mode in two cases:
+ * - when na is attached but not activated yet;
+ * - when na is being deactivated but is still attached.
+ */
+ if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP)))
+ continue;
+
+ /* there is at least one either unicast or broadcast packet */
+ brd_next = brddst->bq_head;
+ next = d->bq_head;
+
+ is_vp = nma_is_vp(dst_na);
+ dst_nr = d_i & (NM_BDG_MAXRINGS-1);
+ if (is_vp) { /* virtual port */
+ if (dst_nr >= dst_na->num_rx_rings)
+ dst_nr = dst_nr % dst_na->num_rx_rings;
+ kring = &dst_na->rx_rings[dst_nr];
+ ring = kring->ring;
+ lim = kring->nkr_num_slots - 1;
+ dst_na->nm_lock(dst_ifp, NETMAP_RX_LOCK, dst_nr);
j = kring->nr_hwcur + kring->nr_hwavail;
if (j > lim)
j -= kring->nkr_num_slots;
+ howmany = lim - kring->nr_hwavail;
+ } else { /* hw or sw adapter */
+ if (dst_nr >= dst_na->num_tx_rings)
+ dst_nr = dst_nr % dst_na->num_tx_rings;
+ kring = &dst_na->tx_rings[dst_nr];
+ ring = kring->ring;
+ lim = kring->nkr_num_slots - 1;
+ dst_na->nm_lock(dst_ifp, NETMAP_TX_LOCK, dst_nr);
+retry:
+ dst_na->nm_txsync(dst_ifp, dst_nr, 0);
+ /* see nm_bdg_flush() */
+ j = kring->nr_hwcur;
+ howmany = kring->nr_hwavail;
+ }
+ while (howmany-- > 0) {
+ struct netmap_slot *slot;
+ struct nm_bdg_fwd *ft_p;
+
+ if (next < brd_next) {
+ ft_p = ft + next;
+ next = ft_p->ft_next;
+ } else { /* insert broadcast */
+ ft_p = ft + brd_next;
+ brd_next = ft_p->ft_next;
+ }
slot = &ring->slot[j];
- ND("send %d %d bytes at %s:%d", i, ft[i].len, dst_ifp->if_xname, j);
- pkt_copy(ft[i].buf, NMB(slot), ft[i].len);
- slot->len = ft[i].len;
- kring->nr_hwavail++;
+ ND("send %d %d bytes at %s:%d", i, ft_p->ft_len, dst_ifp->if_xname, j);
+ pkt_copy(ft_p->buf, NMB(slot), ft_p->ft_len);
+ slot->len = ft_p->ft_len;
+ j = (j == lim) ? 0: j + 1; /* XXX to be macro-ed */
sent++;
+ if (next == d->bq_tail && brd_next == brddst->bq_tail)
+ break;
}
- if (locked) {
- ND("sent %d on %s", sent, dst_ifp->if_xname);
- if (sent)
+ if (netmap_verbose && (howmany < 0))
+ D("rx ring full on %s", dst_ifp->if_xname);
+ if (is_vp) {
+ if (sent) {
+ kring->nr_hwavail += sent;
selwakeuppri(&kring->si, PI_NET);
- na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, 0);
+ }
+ dst_na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, dst_nr);
+ } else {
+ if (sent) {
+ ring->avail -= sent;
+ ring->cur = j;
+ dst_na->nm_txsync(dst_ifp, dst_nr, 0);
+ }
+ /* retry to send more packets */
+ if (nma_is_hw(dst_na) && howmany < 0 && retry--)
+ goto retry;
+ dst_na->nm_lock(dst_ifp, NETMAP_TX_UNLOCK, dst_nr);
}
+ d->bq_head = d->bq_tail = NM_BDG_BATCH; /* cleanup */
}
+ brddst->bq_head = brddst->bq_tail = NM_BDG_BATCH; /* cleanup */
+ BDG_RUNLOCK(b);
return 0;
}
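In outline, the two passes above work as follows (a hedged summary, not from the sources):

	/*
	 * pass 1: for each frame, ask b->nm_bdg_lookup() for a destination
	 *         and append the frame index to that port:ring queue;
	 *         broadcast frames go to a dedicated queue, later expanded
	 *         to ring 0 of every other active port.
	 * pass 2: for each destination queue, lock the ring once, merge
	 *         the unicast and broadcast lists (indices are in arrival
	 *         order, so the smaller index is sent first), copy the
	 *         payloads, then wake readers or txsync, and unlock.
	 */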
+
/*
* main dispatch routine
*/
@@ -2343,8 +3096,6 @@ bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
int i, j, k, lim = kring->nkr_num_slots - 1;
- struct nm_bdg_fwd *ft = (struct nm_bdg_fwd *)(ifp + 1);
- int ft_i; /* position in the forwarding table */
k = ring->cur;
if (k > lim)
@@ -2359,21 +3110,7 @@ bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
if (netmap_bridge > NM_BDG_BATCH)
netmap_bridge = NM_BDG_BATCH;
- ft_i = 0; /* start from 0 */
- for (j = kring->nr_hwcur; likely(j != k); j = unlikely(j == lim) ? 0 : j+1) {
- struct netmap_slot *slot = &ring->slot[j];
- int len = ft[ft_i].len = slot->len;
- char *buf = ft[ft_i].buf = NMB(slot);
-
- prefetch(buf);
- if (unlikely(len < 14))
- continue;
- if (unlikely(++ft_i == netmap_bridge))
- ft_i = nm_bdg_flush(ft, ft_i, ifp);
- }
- if (ft_i)
- ft_i = nm_bdg_flush(ft, ft_i, ifp);
- /* count how many packets we sent */
+ j = nm_bdg_preflush(na, ring_nr, kring, k);
i = k - j;
if (i < 0)
i += kring->nkr_num_slots;
@@ -2392,14 +3129,16 @@ done:
return 0;
}
+
static int
bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
{
struct netmap_adapter *na = NA(ifp);
struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
- u_int j, n, lim = kring->nkr_num_slots - 1;
+ u_int j, lim = kring->nkr_num_slots - 1;
u_int k = ring->cur, resvd = ring->reserved;
+ int n;
ND("%s ring %d lock %d avail %d",
ifp->if_xname, ring_nr, do_lock, kring->nr_hwavail);
@@ -2449,22 +3188,25 @@ bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock)
return 0;
}
+
static void
-bdg_netmap_attach(struct ifnet *ifp)
+bdg_netmap_attach(struct netmap_adapter *arg)
{
struct netmap_adapter na;
ND("attaching virtual bridge");
bzero(&na, sizeof(na));
- na.ifp = ifp;
+ na.ifp = arg->ifp;
na.separate_locks = 1;
+ na.num_tx_rings = arg->num_tx_rings;
+ na.num_rx_rings = arg->num_rx_rings;
na.num_tx_desc = NM_BRIDGE_RINGSIZE;
na.num_rx_desc = NM_BRIDGE_RINGSIZE;
na.nm_txsync = bdg_netmap_txsync;
na.nm_rxsync = bdg_netmap_rxsync;
na.nm_register = bdg_netmap_reg;
- netmap_attach(&na, 1);
+ netmap_attach(&na, na.num_tx_rings);
}
#endif /* NM_BRIDGE */
@@ -2497,8 +3239,11 @@ netmap_init(void)
#ifdef NM_BRIDGE
{
int i;
+ mtx_init(&netmap_bridge_mutex, "netmap_bridge_mutex",
+ MTX_NETWORK_LOCK, MTX_DEF);
+ bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
for (i = 0; i < NM_BRIDGES; i++)
- mtx_init(&nm_bridges[i].bdg_lock, "bdg lock", "bdg_lock", MTX_DEF);
+ rw_init(&nm_bridges[i].bdg_lock, "bdg lock");
}
#endif
return (error);
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 7ab617e..e246e14 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -39,6 +39,7 @@
#define unlikely(x) __builtin_expect(!!(x), 0)
#define NM_LOCK_T struct mtx
+#define NM_RWLOCK_T struct rwlock
#define NM_SELINFO_T struct selinfo
#define MBUF_LEN(m) ((m)->m_pkthdr.len)
#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m)
@@ -46,6 +47,7 @@
#elif defined (linux)
#define NM_LOCK_T safe_spinlock_t // see bsd_glue.h
+#define NM_RWLOCK_T safe_spinlock_t // see bsd_glue.h
#define NM_SELINFO_T wait_queue_head_t
#define MBUF_LEN(m) ((m)->len)
#define NM_SEND_UP(ifp, m) netif_rx(m)
@@ -63,7 +65,7 @@
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
#define IFCAP_NETMAP 0x8000
#else
-#define IFCAP_NETMAP 0x100000
+#define IFCAP_NETMAP 0x200000
#endif
#elif defined (__APPLE__)
@@ -105,6 +107,9 @@
} while (0)
struct netmap_adapter;
+struct nm_bdg_fwd;
+struct nm_bridge;
+struct netmap_priv_d;
/*
* private, kernel view of a ring. Keeps track of the status of
@@ -138,6 +143,7 @@ struct netmap_kring {
uint16_t nkr_slot_flags; /* initial value for flags */
int nkr_hwofs; /* offset between NIC and netmap ring */
struct netmap_adapter *na;
+ struct nm_bdg_fwd *nkr_ft;
NM_SELINFO_T si; /* poll/select wait queue */
NM_LOCK_T q_lock; /* used if no device lock available */
} __attribute__((__aligned__(64)));
@@ -160,6 +166,7 @@ struct netmap_adapter {
#define NAF_SKIP_INTR 1 /* use the regular interrupt handler.
* useful during initialization
*/
+#define NAF_SW_ONLY 2 /* forward packets only to sw adapter */
int refcount; /* number of user-space descriptors using this
interface, which is equal to the number of
struct netmap_if objs in the mapped region. */
@@ -218,10 +225,17 @@ struct netmap_adapter {
* when it goes to 0 we can detach+free this port
* (a bridge port is always attached if it exists;
* it is not always registered)
+ * na_bdg points to the bridge this NA is attached to.
*/
int bdg_port;
int na_bdg_refcount;
-
+ struct nm_bridge *na_bdg;
+ /* When we attach a physical interface to the bridge, we
+ * allow the controlling process to terminate, so we need
+ * a place to store the netmap_priv_d data structure.
+ * This is only done when physical interfaces are attached to a bridge.
+ */
+ struct netmap_priv_d *na_kpriv;
#ifdef linux
struct net_device_ops nm_ndo;
#endif /* linux */
@@ -288,6 +302,22 @@ struct netmap_slot *netmap_reset(struct netmap_adapter *na,
enum txrx tx, int n, u_int new_cur);
int netmap_ring_reinit(struct netmap_kring *);
+/*
+ * The following bridge-related interfaces are used by other kernel modules
+ * In the version that only supports unicast or broadcast, the lookup
+ * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports,
+ * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown.
+ * XXX in practice "unknown" might be handled same as broadcast.
+ */
+typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, uint8_t *ring_nr,
+ struct netmap_adapter *);
+int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);
+u_int netmap_bdg_learning(char *, u_int, uint8_t *, struct netmap_adapter *);
+#define NM_NAME "vale" /* prefix for the bridge port name */
+#define NM_BDG_MAXPORTS 254 /* up to 32 for bitmap, 254 ok otherwise */
+#define NM_BDG_BROADCAST NM_BDG_MAXPORTS
+#define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1)
+
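A hedged sketch of how a kernel module might use this hook: an all-broadcast lookup function and its registration. The nmreq setup follows the NETMAP_BDG_LOOKUP_REG handler in netmap.c; the function name and bridge name are illustrative:

	static u_int
	my_bcast_lookup(char *buf, u_int len, uint8_t *dst_ring,
	    struct netmap_adapter *na)
	{
		(void)buf; (void)len; (void)na;
		*dst_ring = 0;			/* always use ring 0 */
		return NM_BDG_BROADCAST;	/* flood all other ports */
	}

	/* in the module init path, for an existing bridge "vale0" */
	struct nmreq nmr;
	int error;

	bzero(&nmr, sizeof(nmr));
	strncpy(nmr.nr_name, "vale0:", sizeof(nmr.nr_name) - 1);
	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
	error = netmap_bdg_ctl(&nmr, my_bcast_lookup);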
extern u_int netmap_buf_size;
#define NETMAP_BUF_SIZE netmap_buf_size // XXX remove
extern int netmap_mitigate;
@@ -309,11 +339,15 @@ enum { /* verbose flags */
/*
* NA returns a pointer to the struct netmap adapter from the ifp,
* WNA is used to write it.
+ * SWNA() is used for the "host stack" endpoint associated
+ * to an interface. It is allocated together with the main NA(),
+ * as an array of two objects.
*/
#ifndef WNA
#define WNA(_ifp) (_ifp)->if_pspare[0]
#endif
#define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp))
+#define SWNA(_ifp) (NA(_ifp) + 1)
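A brief illustration of the pairing (a sketch of what netmap_attach() in netmap.c does for physical interfaces):

	/*
	 * For a hw interface, netmap_attach() allocates two adapters
	 * back to back, so the two macros address the same block:
	 *
	 *	na = malloc(2 * sizeof(struct netmap_adapter), ...);
	 *	NA(ifp)   == &na[0]	hardware rings
	 *	SWNA(ifp) == &na[1]	host stack ("software") rings
	 */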
/*
* Macros to determine if an interface is netmap capable or netmap enabled.
diff --git a/sys/net/netmap.h b/sys/net/netmap.h
index cdeb10e..b5ab6d5 100644
--- a/sys/net/netmap.h
+++ b/sys/net/netmap.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
@@ -127,8 +127,15 @@
* transparent mode, buffers released with the flag set
* will be forwarded to the 'other' side (host stack
* or NIC, respectively) on the next select() or ioctl()
+ *
+ * The following will be supported from NETMAP_API = 5
* NS_NO_LEARN on a VALE switch, do not 'learn' the source port for
* this packet.
+ * NS_INDIRECT the netmap buffer contains a 64-bit pointer to
+ * the actual userspace buffer. This may be useful
+ * to reduce copies in a VM environment.
+ * NS_MOREFRAG Part of a multi-segment frame. The last (or only)
+ * segment must not have this flag.
* NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the
* destination port for the VALE switch, overriding
* the lookup table.
@@ -146,6 +153,8 @@ struct netmap_slot {
* (host stack or device)
*/
#define NS_NO_LEARN 0x0008
+#define NS_INDIRECT 0x0010
+#define NS_MOREFRAG 0x0020
#define NS_PORT_SHIFT 8
#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
};
@@ -277,10 +286,24 @@ struct netmap_if {
* NIOCREGIF takes an interface name within a struct ifreq,
* and activates netmap mode on the interface (if possible).
*
+ * For vale ports, starting with NETMAP_API = 5,
+ * nr_tx_rings and nr_rx_rings specify how many software rings
+ * are created (0 means 1).
+ *
+ * NIOCREGIF is also used to attach a NIC to a VALE switch.
+ * In this case the name is vale*:ifname, and "nr_cmd"
+ * is set to 'NETMAP_BDG_ATTACH' or 'NETMAP_BDG_DETACH'.
+ * nr_ringid specifies which rings should be attached, 0 means all,
+ * NETMAP_HW_RING + n means only the n-th ring.
+ * The process can terminate after the interface has been attached.
+ *
* NIOCUNREGIF unregisters the interface associated to the fd.
+ *	This is deprecated and will go away.
*
* NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
* whose identity is set in NIOCREGIF through nr_ringid
+ *
+ * NETMAP_API is the API version.
*/
/*
@@ -289,7 +312,7 @@ struct netmap_if {
struct nmreq {
char nr_name[IFNAMSIZ];
uint32_t nr_version; /* API version */
-#define NETMAP_API 3 /* current version */
+#define NETMAP_API 4 /* current version */
uint32_t nr_offset; /* nifp offset in the shared region */
uint32_t nr_memsize; /* size of the shared region */
uint32_t nr_tx_slots; /* slots in tx rings */
@@ -301,8 +324,15 @@ struct nmreq {
#define NETMAP_SW_RING 0x2000 /* process the sw ring */
#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
#define NETMAP_RING_MASK 0xfff /* the ring number */
- uint16_t spare1;
- uint32_t spare2[4];
+ uint16_t nr_cmd;
+#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
+#define NETMAP_BDG_DETACH 2 /* detach the NIC */
+#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
+#define NETMAP_BDG_LIST 4 /* get bridge's info */
+ uint16_t nr_arg1;
+#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
+ uint16_t nr_arg2;
+ uint32_t spare2[3];
};
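To tie the new fields together, a hedged userspace sketch that attaches a hypothetical NIC em0, together with its host stack, to switch vale0; as noted above, the process may exit once the ioctl returns:

	#include <sys/ioctl.h>
	#include <net/netmap.h>
	#include <err.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct nmreq nmr;
		int fd = open("/dev/netmap", O_RDWR);

		if (fd < 0)
			err(1, "/dev/netmap");
		memset(&nmr, 0, sizeof(nmr));
		nmr.nr_version = NETMAP_API;
		strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name) - 1);
		nmr.nr_cmd = NETMAP_BDG_ATTACH;
		nmr.nr_arg1 = NETMAP_BDG_HOST;	/* also bind the host stack */
		if (ioctl(fd, NIOCREGIF, &nmr) == -1)
			err(1, "NIOCREGIF attach");
		close(fd);	/* the attachment persists in the kernel */
		return 0;
	}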
/*