summaryrefslogtreecommitdiffstats
path: root/sys/net
diff options
context:
space:
mode:
authorluigi <luigi@FreeBSD.org>2014-02-18 05:01:04 +0000
committerluigi <luigi@FreeBSD.org>2014-02-18 05:01:04 +0000
commit5bacc3bb87b954978543b0d82a4d5705e33f5c06 (patch)
treea79f129924ca9cf087c1e108d2d184a16ac1e42b /sys/net
parentdd5bb071cd203986ef23e5ceecdcef3cea848542 (diff)
downloadFreeBSD-src-5bacc3bb87b954978543b0d82a4d5705e33f5c06.zip
FreeBSD-src-5bacc3bb87b954978543b0d82a4d5705e33f5c06.tar.gz
MFH: sync the netmap code with the one in HEAD
(enhanced VALE switch, netmap pipes, emulated netmap mode). See details in the log for svn 261909.
Diffstat (limited to 'sys/net')
-rw-r--r--sys/net/netmap.h651
-rw-r--r--sys/net/netmap_user.h658
2 files changed, 1045 insertions, 264 deletions
diff --git a/sys/net/netmap.h b/sys/net/netmap.h
index b5ab6d5..f0b4c56 100644
--- a/sys/net/netmap.h
+++ b/sys/net/netmap.h
@@ -1,33 +1,27 @@
/*
- * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
- *
+ * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
- *
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the
- * distribution.
- *
- * 3. Neither the name of the authors nor the names of their contributors
- * may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
/*
@@ -36,215 +30,249 @@
* Definitions of constants and the structures used by the netmap
* framework, for the part visible to both kernel and userspace.
* Detailed info on netmap is available with "man netmap" or at
- *
+ *
* http://info.iet.unipi.it/~luigi/netmap/
+ *
+ * This API is also used to communicate with the VALE software switch
*/
#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_
+#define NETMAP_API 11 /* current API version */
+
+#define NETMAP_MIN_API 11 /* min and max versions accepted */
+#define NETMAP_MAX_API 15
+/*
+ * Some fields should be cache-aligned to reduce contention.
+ * The alignment is architecture and OS dependent, but rather than
+ * digging into OS headers to find the exact value we use an estimate
+ * that should cover most architectures.
+ */
+#define NM_CACHE_ALIGN 128
+
/*
* --- Netmap data structures ---
*
- * The data structures used by netmap are shown below. Those in
- * capital letters are in an mmapp()ed area shared with userspace,
- * while others are private to the kernel.
- * Shared structures do not contain pointers but only memory
- * offsets, so that addressing is portable between kernel and userspace.
-
-
- softc
-+----------------+
-| standard fields|
-| if_pspare[0] ----------+
-+----------------+ |
- |
-+----------------+<------+
-|(netmap_adapter)|
-| | netmap_kring
-| tx_rings *--------------------------------->+---------------+
-| | netmap_kring | ring *---------.
-| rx_rings *--------->+---------------+ | nr_hwcur | |
-+----------------+ | ring *--------. | nr_hwavail | V
- | nr_hwcur | | | selinfo | |
- | nr_hwavail | | +---------------+ .
- | selinfo | | | ... | .
- +---------------+ | |(ntx+1 entries)|
- | .... | | | |
- |(nrx+1 entries)| | +---------------+
- | | |
- KERNEL +---------------+ |
- |
+ * The userspace data structures used by netmap are shown below.
+ * They are allocated by the kernel and mmap()ed by userspace threads.
+ * Pointers are implemented as memory offsets or indexes,
+ * so that they can be easily dereferenced in kernel and userspace.
+
+ KERNEL (opaque, obviously)
+
====================================================================
|
- USERSPACE | NETMAP_RING
- +---->+-------------+
- / | cur |
- NETMAP_IF (nifp, one per file desc.) / | avail |
- +---------------+ / | buf_ofs |
- | ni_tx_rings | / +=============+
- | ni_rx_rings | / | buf_idx | slot[0]
- | | / | len, flags |
- | | / +-------------+
- +===============+ / | buf_idx | slot[1]
- | txring_ofs[0] | (rel.to nifp)--' | len, flags |
- | txring_ofs[1] | +-------------+
- (num_rings+1 entries) (nr_num_slots entries)
- | txring_ofs[n] | | buf_idx | slot[n-1]
- +---------------+ | len, flags |
- | rxring_ofs[0] | +-------------+
+ USERSPACE | struct netmap_ring
+ +---->+---------------+
+ / | head,cur,tail |
+ struct netmap_if (nifp, 1 per fd) / | buf_ofs |
+ +---------------+ / | other fields |
+ | ni_tx_rings | / +===============+
+ | ni_rx_rings | / | buf_idx, len | slot[0]
+ | | / | flags, ptr |
+ | | / +---------------+
+ +===============+ / | buf_idx, len | slot[1]
+ | txring_ofs[0] | (rel.to nifp)--' | flags, ptr |
+ | txring_ofs[1] | +---------------+
+ (tx+1 entries) (num_slots entries)
+ | txring_ofs[t] | | buf_idx, len | slot[n-1]
+ +---------------+ | flags, ptr |
+ | rxring_ofs[0] | +---------------+
| rxring_ofs[1] |
- (num_rings+1 entries)
- | txring_ofs[n] |
+ (rx+1 entries)
+ | rxring_ofs[r] |
+---------------+
- * The private descriptor ('softc' or 'adapter') of each interface
- * is extended with a "struct netmap_adapter" containing netmap-related
- * info (see description in dev/netmap/netmap_kernel.h.
- * Among other things, tx_rings and rx_rings point to the arrays of
- * "struct netmap_kring" which in turn reache the various
- * "struct netmap_ring", shared with userspace.
-
- * The NETMAP_RING is the userspace-visible replica of the NIC ring.
- * Each slot has the index of a buffer, its length and some flags.
+ * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to
+ * a file descriptor, the mmap()ed region contains a (logically readonly)
+ * struct netmap_if pointing to struct netmap_ring's.
+ *
+ * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
+ * pair attached to the host stack (this pair is unused for non-NIC ports).
+ *
+ * All physical/host stack ports share the same memory region,
+ * so that zero-copy can be implemented between them.
+ * VALE switch ports instead have separate memory regions.
+ *
+ * The netmap_ring is the userspace-visible replica of the NIC ring.
+ * Each slot has the index of a buffer (MTU-sized and residing in the
+ * mmapped region), its length and some flags. An extra 64-bit pointer
+ * is provided for user-supplied buffers in the tx path.
+ *
* In user space, the buffer address is computed as
- * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
- * In the kernel, buffers do not necessarily need to be contiguous,
- * and the virtual and physical addresses are derived through
- * a lookup table.
- *
- * struct netmap_slot:
- *
- * buf_idx is the index of the buffer associated to the slot.
- * len is the length of the payload
- * NS_BUF_CHANGED must be set whenever userspace wants
- * to change buf_idx (it might be necessary to
- * reprogram the NIC slot)
- * NS_REPORT must be set if we want the NIC to generate an interrupt
- * when this slot is used. Leaving it to 0 improves
- * performance.
- * NS_FORWARD if set on a receive ring, and the device is in
- * transparent mode, buffers released with the flag set
- * will be forwarded to the 'other' side (host stack
- * or NIC, respectively) on the next select() or ioctl()
- *
- * The following will be supported from NETMAP_API = 5
- * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for
- * this packet.
- * NS_INDIRECT the netmap buffer contains a 64-bit pointer to
- * the actual userspace buffer. This may be useful
- * to reduce copies in a VM environment.
- * NS_MOREFRAG Part of a multi-segment frame. The last (or only)
- * segment must not have this flag.
- * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the
- * destination port for the VALE switch, overriding
- * the lookup table.
+ * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
+ *
+ * Added in NETMAP_API 11:
+ *
+ * + NIOCREGIF can request the allocation of extra spare buffers from
+ * the same memory pool. The desired number of buffers must be in
+ * nr_arg3. The ioctl may return fewer buffers, depending on memory
+ * availability. nr_arg3 will return the actual value, and, once
+ * mapped, nifp->ni_bufs_head will be the index of the first buffer.
+ *
+ * The buffers are linked to each other using the first uint32_t
+ * as the index. On close, ni_bufs_head must point to the list of
+ * buffers to be released.
+ *
+ * + NIOCREGIF can request space for extra rings (and buffers)
+ * allocated in the same memory space. The number of extra rings
+ * is in nr_arg1, and is advisory. This is a no-op on NICs where
+ * the size of the memory space is fixed.
+ *
+ * + NIOCREGIF can attach to PIPE rings sharing the same memory
+ * space with a parent device. The ifname indicates the parent device,
+ * which must already exist. Flags in nr_flags indicate if we want to
+ * bind the master or slave side, the index (from nr_ringid)
+ * is just a cookie and does not need to be sequential.
+ *
+ * + NIOCREGIF can also attach to 'monitor' rings that replicate
+ * the content of specific rings, also from the same memory space.
+ *
+ * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme:
+ * netmap:foo all NIC ring pairs
+ * netmap:foo^ only host ring pair
+ * netmap:foo+ all NIC ring + host ring pairs
+ * netmap:foo-k the k-th NIC ring pair
+ * netmap:foo{k PIPE ring pair k, master side
+ * netmap:foo}k PIPE ring pair k, slave side
*/
+/*
+ * struct netmap_slot is a buffer descriptor
+ */
struct netmap_slot {
- uint32_t buf_idx; /* buffer index */
- uint16_t len; /* packet length, to be copied to/from the hw ring */
- uint16_t flags; /* buf changed, etc. */
-#define NS_BUF_CHANGED 0x0001 /* must resync the map, buffer changed */
-#define NS_REPORT 0x0002 /* ask the hardware to report results
- * e.g. by generating an interrupt
- */
-#define NS_FORWARD 0x0004 /* pass packet to the other endpoint
- * (host stack or device)
- */
-#define NS_NO_LEARN 0x0008
-#define NS_INDIRECT 0x0010
-#define NS_MOREFRAG 0x0020
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* length for this slot */
+ uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* pointer for indirect buffers */
+};
+
+/*
+ * The following flags control how the slot is used
+ */
+
+#define NS_BUF_CHANGED 0x0001 /* buf_idx changed */
+ /*
+ * must be set whenever buf_idx is changed (as it might be
+ * necessary to recompute the physical address and mapping)
+ */
+
+#define NS_REPORT 0x0002 /* ask the hardware to report results */
+ /*
+ * Request notification when slot is used by the hardware.
+ * Normally transmit completions are handled lazily and
+ * may be unreported. This flag lets us know when a slot
+ * has been sent (e.g. to terminate the sender).
+ */
+
+#define NS_FORWARD 0x0004 /* pass packet 'forward' */
+ /*
+ * (Only for physical ports, rx rings with NR_FORWARD set).
+ * Slots released to the kernel (i.e. before ring->head) with
+ * this flag set are passed to the peer ring (host/NIC),
+ * thus restoring the host-NIC connection for these slots.
+ * This supports efficient traffic monitoring or firewalling.
+ */
+
+#define NS_NO_LEARN 0x0008 /* disable bridge learning */
+ /*
+ * On a VALE switch, do not 'learn' the source port for
+ * this buffer.
+ */
+
+#define NS_INDIRECT 0x0010 /* userspace buffer */
+ /*
+ * (VALE tx rings only) data is in a userspace buffer,
+ * whose address is in the 'ptr' field in the slot.
+ */
+
+#define NS_MOREFRAG 0x0020 /* packet has more fragments */
+ /*
+ * (VALE ports only)
+ * Set on all but the last slot of a multi-segment packet.
+ * The 'len' field refers to the individual fragment.
+ */
+
#define NS_PORT_SHIFT 8
#define NS_PORT_MASK (0xff << NS_PORT_SHIFT)
-};
+ /*
+ * The high 8 bits of the flag, if not zero, indicate the
+ * destination port for the VALE switch, overriding
+ * the lookup table.
+ */
+
+#define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff)
+ /*
+ * (VALE rx rings only) the high 8 bits
+ * are the number of fragments.
+ */
+
/*
+ * struct netmap_ring
+ *
* Netmap representation of a TX or RX ring (also known as "queue").
* This is a queue implemented as a fixed-size circular array.
- * At the software level, two fields are important: avail and cur.
+ * At the software level the important fields are: head, cur, tail.
*
* In TX rings:
- * avail indicates the number of slots available for transmission.
- * It is updated by the kernel after every netmap system call.
- * It MUST BE decremented by the application when it appends a
- * packet.
- * cur indicates the slot to use for the next packet
- * to send (i.e. the "tail" of the queue).
- * It MUST BE incremented by the application before
- * netmap system calls to reflect the number of newly
- * sent packets.
- * It is checked by the kernel on netmap system calls
- * (normally unmodified by the kernel unless invalid).
- *
- * The kernel side of netmap uses two additional fields in its own
- * private ring structure, netmap_kring:
- * nr_hwcur is a copy of nr_cur on an NIOCTXSYNC.
- * nr_hwavail is the number of slots known as available by the
- * hardware. It is updated on an INTR (inc by the
- * number of packets sent) and on a NIOCTXSYNC
- * (decrease by nr_cur - nr_hwcur)
- * A special case, nr_hwavail is -1 if the transmit
- * side is idle (no pending transmits).
+ *
+ * head first slot available for transmission.
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
+ *
+ * [head .. tail-1] can be used for new packets to send;
+ * 'head' and 'cur' must be incremented as slots are filled
+ * with new packets to be sent;
+ * 'cur' can be moved further ahead if we need more space
+ * for new transmissions.
*
* In RX rings:
- * avail is the number of packets available (possibly 0).
- * It MUST BE decremented by the application when it consumes
- * a packet, and it is updated to nr_hwavail on a NIOCRXSYNC
- * cur indicates the first slot that contains a packet not
- * processed yet (the "head" of the queue).
- * It MUST BE incremented by the software when it consumes
- * a packet.
- * reserved indicates the number of buffers before 'cur'
- * that the application has still in use. Normally 0,
- * it MUST BE incremented by the application when it
- * does not return the buffer immediately, and decremented
- * when the buffer is finally freed.
- *
- * The kernel side of netmap uses two additional fields in the kring:
- * nr_hwcur is a copy of nr_cur on an NIOCRXSYNC
- * nr_hwavail is the number of packets available. It is updated
- * on INTR (inc by the number of new packets arrived)
- * and on NIOCRXSYNC (decreased by nr_cur - nr_hwcur).
+ *
+ * head first valid received packet
+ * cur wakeup point. select() and poll() will unblock
+ * when 'tail' moves past 'cur'
+ * tail (readonly) first slot reserved to the kernel
+ *
+ * [head .. tail-1] contain received packets;
+ * 'head' and 'cur' must be incremented as slots are consumed
+ * and can be returned to the kernel;
+ * 'cur' can be moved further ahead if we want to wait for
+ * new packets without returning the previous ones.
*
* DATA OWNERSHIP/LOCKING:
- * The netmap_ring is owned by the user program and it is only
- * accessed or modified in the upper half of the kernel during
- * a system call.
- *
- * The netmap_kring is only modified by the upper half of the kernel.
- *
- * FLAGS
- * NR_TIMESTAMP updates the 'ts' field on each syscall. This is
- * a global timestamp for all packets.
- * NR_RX_TSTMP if set, the last 64 byte in each buffer will
- * contain a timestamp for the frame supplied by
- * the hardware (if supported)
- * NR_FORWARD if set, the NS_FORWARD flag in each slot of the
- * RX ring is checked, and if set the packet is
- * passed to the other side (host stack or device,
- * respectively). This permits bpf-like behaviour
- * or transparency for selected packets.
+ * The netmap_ring, and all slots and buffers in the range
+ * [head .. tail-1] are owned by the user program;
+ * the kernel only accesses them during a netmap system call
+ * and in the user thread context.
+ *
+ * Other slots and buffers are reserved for use by the kernel
*/
struct netmap_ring {
/*
- * nr_buf_base_ofs is meant to be used through macros.
+ * buf_ofs is meant to be used through macros.
* It contains the offset of the buffer region from this
* descriptor.
*/
- const ssize_t buf_ofs;
+ const int64_t buf_ofs;
const uint32_t num_slots; /* number of slots in the ring. */
- uint32_t avail; /* number of usable slots */
- uint32_t cur; /* 'current' r/w position */
- uint32_t reserved; /* not refilled before current */
+ const uint32_t nr_buf_size;
+ const uint16_t ringid;
+ const uint16_t dir; /* 0: tx, 1: rx */
- const uint16_t nr_buf_size;
- uint16_t flags;
-#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
-#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
-#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */
+ uint32_t head; /* (u) first user slot */
+ uint32_t cur; /* (u) wakeup point */
+ uint32_t tail; /* (k) first kernel slot */
+
+ uint32_t flags;
- struct timeval ts; /* time of last *sync() */
+ struct timeval ts; /* (k) time of last *sync() */
+
+ /* opaque room for a mutex or similar object */
+ uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));
/* the slots follow. This struct has variable size */
struct netmap_slot slot[0]; /* array of slots. */
@@ -252,88 +280,246 @@ struct netmap_ring {
/*
+ * RING FLAGS
+ */
+#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */
+ /*
+ * updates the 'ts' field on each netmap syscall. This saves
+ * a separate gettimeofday(), and is not much worse than
+ * software timestamps generated in the interrupt handler.
+ */
+
+#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */
+ /*
+ * Enables the NS_FORWARD slot flag for the ring.
+ */
+
+
+/*
* Netmap representation of an interface and its queue(s).
+ * This is initialized by the kernel when binding a file
+ * descriptor to a port, and should be considered as readonly
+ * by user programs. The kernel never uses it.
+ *
* There is one netmap_if for each file descriptor on which we want
- * to select/poll. We assume that on each interface has the same number
- * of receive and transmit queues.
+ * to select/poll.
* select/poll operates on one or all pairs depending on the value of
* nmr_queueid passed on the ioctl.
*/
struct netmap_if {
char ni_name[IFNAMSIZ]; /* name of the interface. */
- const u_int ni_version; /* API version, currently unused */
- const u_int ni_rx_rings; /* number of rx rings */
- const u_int ni_tx_rings; /* if zero, same as ni_rx_rings */
+ const uint32_t ni_version; /* API version, currently unused */
+ const uint32_t ni_flags; /* properties */
+#define NI_PRIV_MEM 0x1 /* private memory region */
+
+ /*
+ * The number of packet rings available in netmap mode.
+ * Physical NICs can have different numbers of tx and rx rings.
+ * Physical NICs also have a 'host' ring pair.
+ * Additionally, clients can request additional ring pairs to
+ * be used for internal communication.
+ */
+ const uint32_t ni_tx_rings; /* number of HW tx rings */
+ const uint32_t ni_rx_rings; /* number of HW rx rings */
+
+ uint32_t ni_bufs_head; /* head index for extra bufs */
+ uint32_t ni_spare1[5];
/*
* The following array contains the offset of each netmap ring
- * from this structure. The first ni_tx_queues+1 entries refer
- * to the tx rings, the next ni_rx_queues+1 refer to the rx rings
- * (the last entry in each block refers to the host stack rings).
- * The area is filled up by the kernel on NIOCREG,
+ * from this structure, in the following order:
+ * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
+ * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
+ *
+ * The area is filled up by the kernel on NIOCREGIF,
* and then only read by userspace code.
*/
const ssize_t ring_ofs[0];
};
-#ifndef NIOCREGIF
+
+#ifndef NIOCREGIF
/*
* ioctl names and related fields
*
+ * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
+ * whose identity is set in NIOCREGIF through nr_ringid.
+ * These are non blocking and take no argument.
+ *
* NIOCGINFO takes a struct ifreq, the interface name is the input,
* the outputs are number of queues and number of descriptor
* for each queue (useful to set number of threads etc.).
+ * The info returned is only advisory and may change before
+ * the interface is bound to a file descriptor.
*
- * NIOCREGIF takes an interface name within a struct ifreq,
+ * NIOCREGIF takes an interface name within a struct nmreq,
* and activates netmap mode on the interface (if possible).
*
- * For vale ports, starting with NETMAP_API = 5,
- * nr_tx_rings and nr_rx_rings specify how many software rings
- * are created (0 means 1).
+ * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
+ * can pass it down to other NIC-related ioctls.
*
- * NIOCREGIF is also used to attach a NIC to a VALE switch.
- * In this case the name is vale*:ifname, and "nr_cmd"
- * is set to 'NETMAP_BDG_ATTACH' or 'NETMAP_BDG_DETACH'.
- * nr_ringid specifies which rings should be attached, 0 means all,
- * NETMAP_HW_RING + n means only the n-th ring.
- * The process can terminate after the interface has been attached.
+ * The actual argument (struct nmreq) has a number of options to request
+ * different functions.
+ * The following are used in NIOCREGIF when nr_cmd == 0:
*
- * NIOCUNREGIF unregisters the interface associated to the fd.
- * this is deprecated and will go away.
+ * nr_name (in)
+ * The name of the port (em0, valeXXX:YYY, etc.)
+ * limited to IFNAMSIZ for backward compatibility.
*
- * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
- * whose identity is set in NIOCREGIF through nr_ringid
+ * nr_version (in/out)
+ * Must match NETMAP_API as used in the kernel, error otherwise.
+ * Always returns the desired value on output.
+ *
+ * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
+ * On input, non-zero values may be used to reconfigure the port
+ * according to the requested values, but this is not guaranteed.
+ * On output the actual values in use are reported.
+ *
+ * nr_ringid (in)
+ * Indicates how rings should be bound to the file descriptors.
+ * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK)
+ * are used to indicate the ring number, and nr_flags specifies
+ * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected.
+ *
+ * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED:
+ * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control
+ * the binding as follows:
+ * 0 (default) binds all physical rings
+ * NETMAP_HW_RING | ring number binds a single ring pair
+ * NETMAP_SW_RING binds only the host tx/rx rings
+ *
+ * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
+ * packets on tx rings only if POLLOUT is set.
+ * The default is to push any pending packet.
+ *
+ * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release
+ * packets on rx rings also when POLLIN is NOT set.
+ * The default is to touch the rx ring only with POLLIN.
+ * Note that this is the opposite of TX because it
+ * reflects the common usage.
+ *
+ * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead.
+ * NETMAP_PRIV_MEM is set on return for ports that do not use
+ * the global memory allocator.
+ * This information is not significant and applications
+ * should look at the region id in nr_arg2
+ *
+ * nr_flags is the recommended mode to indicate which rings should
+ * be bound to a file descriptor. Values are NR_REG_*
+ *
+ * nr_arg1 (in) The number of extra rings to be reserved.
+ * Especially when allocating a VALE port the system only
+ * allocates the amount of memory needed for the port.
+ * If more shared memory rings are desired (e.g. for pipes),
+ * the first invocation for the same basename/allocator
+ * should specify a suitable number. Memory cannot be
+ * extended after the first allocation without closing
+ * all ports on the same region.
+ *
+ * nr_arg2 (in/out) The identity of the memory region used.
+ * On input, 0 means the system decides autonomously,
+ * other values may try to select a specific region.
+ * On return the actual value is reported.
+ * Region '1' is the global allocator, normally shared
+ * by all interfaces. Other values are private regions.
+ * If two ports use the same region, zero-copy is possible.
+ *
+ * nr_arg3 (in/out) number of extra buffers to be allocated.
+ *
+ *
+ *
+ * nr_cmd (in) if non-zero indicates a special command:
+ * NETMAP_BDG_ATTACH and nr_name = vale*:ifname
+ * attaches the NIC to the switch; nr_ringid specifies
+ * which rings to use. Used by vale-ctl -a ...
+ * nr_arg1 = NETMAP_BDG_HOST also attaches the host port
+ * as in vale-ctl -h ...
+ *
+ * NETMAP_BDG_DETACH and nr_name = vale*:ifname
+ * disconnects a previously attached NIC.
+ * Used by vale-ctl -d ...
+ *
+ * NETMAP_BDG_LIST
+ * list the configuration of VALE switches.
+ *
+ * NETMAP_BDG_VNET_HDR
+ * Set the virtio-net header length used by the client
+ * of a VALE switch port.
+ *
+ * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific
+ *
+ *
*
- * NETMAP_API is the API version.
*/
+
/*
- * struct nmreq overlays a struct ifreq
+ * struct nmreq overlays a struct ifreq (just the name)
+ *
+ * On input, nr_ringid indicates which rings we are requesting,
+ * with the low flags for the specific ring number.
+ * selection FLAGS RING INDEX
+ *
+ * all the NIC rings 0x0000 -
+ * only HOST ring 0x2000 -
+ * single NIC ring 0x4000 ring index
+ * all the NIC+HOST rings 0x6000 -
+ * one pipe ring, master 0x8000 ring index
+ * *** INVALID 0xA000
+ * one pipe ring, slave 0xC000 ring index
+ * *** INVALID 0xE000
+ *
*/
struct nmreq {
char nr_name[IFNAMSIZ];
uint32_t nr_version; /* API version */
-#define NETMAP_API 4 /* current version */
uint32_t nr_offset; /* nifp offset in the shared region */
uint32_t nr_memsize; /* size of the shared region */
uint32_t nr_tx_slots; /* slots in tx rings */
uint32_t nr_rx_slots; /* slots in rx rings */
uint16_t nr_tx_rings; /* number of tx rings */
uint16_t nr_rx_rings; /* number of rx rings */
+
uint16_t nr_ringid; /* ring(s) we care about */
-#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */
-#define NETMAP_SW_RING 0x2000 /* process the sw ring */
+#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */
+#define NETMAP_SW_RING 0x2000 /* only host ring pair */
+
+#define NETMAP_RING_MASK 0x0fff /* the ring number */
+
#define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */
-#define NETMAP_RING_MASK 0xfff /* the ring number */
+
+#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */
+
uint16_t nr_cmd;
#define NETMAP_BDG_ATTACH 1 /* attach the NIC */
#define NETMAP_BDG_DETACH 2 /* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */
#define NETMAP_BDG_LIST 4 /* get bridge's info */
- uint16_t nr_arg1;
+#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */
+#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */
+
+ uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */
#define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */
+
uint16_t nr_arg2;
- uint32_t spare2[3];
+ uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */
+ uint32_t nr_flags;
+ /* various modes, extends nr_ringid */
+ uint32_t spare2[1];
+};
+
+#define NR_REG_MASK 0xf /* values for nr_flags */
+enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */
+ NR_REG_ALL_NIC = 1,
+ NR_REG_SW = 2,
+ NR_REG_NIC_SW = 3,
+ NR_REG_ONE_NIC = 4,
+ NR_REG_PIPE_MASTER = 5,
+ NR_REG_PIPE_SLAVE = 6,
};
+/* monitor uses the NR_REG to select the rings to monitor */
+#define NR_MONITOR_TX 0x100
+#define NR_MONITOR_RX 0x200
+
/*
* FreeBSD uses the size value embedded in the _IOWR to determine
@@ -343,9 +529,22 @@ struct nmreq {
*/
#define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */
#define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */
-#define NIOCUNREGIF _IO('i', 147) /* interface unregister */
#define NIOCTXSYNC _IO('i', 148) /* sync tx queues */
#define NIOCRXSYNC _IO('i', 149) /* sync rx queues */
#endif /* !NIOCREGIF */
+
+/*
+ * Helper functions for kernel and userspace
+ */
+
+/*
+ * Return non-zero if the ring is empty, i.e. 'cur' has reached 'tail'
+ * (no free slot on a tx ring, no pending packet on an rx ring).
+ */
+static inline int
+nm_ring_empty(struct netmap_ring *ring)
+{
+ return (ring->cur == ring->tail);
+}
+
#endif /* _NET_NETMAP_H_ */
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
index fcb5cb3..9c3a4c1 100644
--- a/sys/net/netmap_user.h
+++ b/sys/net/netmap_user.h
@@ -1,40 +1,34 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
- *
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
- *
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the
- * distribution.
- *
- * 3. Neither the name of the authors nor the names of their contributors
- * may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
/*
* $FreeBSD$
*
- * This header contains the macros used to manipulate netmap structures
- * and packets in userspace. See netmap(4) for more information.
+ * Functions and macros to manipulate netmap structures and packets
+ * in userspace. See netmap(4) for more information.
*
* The address of the struct netmap_if, say nifp, is computed from the
* value returned from ioctl(.., NIOCREG, ...) and the mmap region:
@@ -49,22 +43,44 @@
* we can access ring->nr_cur, ring->nr_avail, ring->nr_flags
*
* ring->slot[i] gives us the i-th slot (we can access
- * directly plen, flags, bufindex)
+ * directly len, flags, buf_idx)
*
* char *buf = NETMAP_BUF(ring, x) returns a pointer to
* the buffer numbered x
*
- * Since rings are circular, we have macros to compute the next index
- * i = NETMAP_RING_NEXT(ring, i);
+ * All ring indexes (head, cur, tail) should always move forward.
+ * To compute the next index in a circular ring you can use
+ * i = nm_ring_next(ring, i);
+ *
+ * To ease porting apps from pcap to netmap we supply a few functions
+ * that can be called to open, close, read and write on netmap in a way
+ * similar to libpcap. Note that the read/write functions depend on
+ * an ioctl()/select()/poll() being issued to refill rings or push
+ * packets out.
+ *
+ * In order to use these, include #define NETMAP_WITH_LIBS
+ * in the source file that invokes these functions.
*/
#ifndef _NET_NETMAP_USER_H_
#define _NET_NETMAP_USER_H_
+#include <stdint.h>
+#include <sys/socket.h> /* apple needs sockaddr */
+#include <net/if.h> /* IFNAMSIZ */
+
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif /* likely and unlikely */
+
+#include <net/netmap.h>
+
+/* helper macro */
#define _NETMAP_OFFSET(type, ptr, offset) \
((type)(void *)((char *)(ptr) + (offset)))
-#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o)
+#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs)
#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
nifp, (nifp)->ring_ofs[index] )
@@ -77,19 +93,585 @@
#define NETMAP_BUF_IDX(ring, buf) \
( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
- (ring)->nr_buf_size )
+ (ring)->nr_buf_size )
+
+
+/*
+ * Index of the slot that follows slot i in ring r; the slot after
+ * the last one (num_slots - 1) is slot 0.
+ */
+static inline uint32_t
+nm_ring_next(struct netmap_ring *r, uint32_t i)
+{
+	uint32_t next = i + 1;
+
+	/* wrapping is the uncommon case */
+	if (unlikely(next == r->num_slots))
+		return 0;
+	return next;
+}
+
+
+/*
+ * Tell whether the tx ring still has transmissions in progress:
+ * returns 1 if so, 0 otherwise.
+ * When everything is complete ring->head = ring->tail + 1
+ * (modulo ring size), so the test is whether the slot after tail
+ * differs from head.
+ */
+static inline int
+nm_tx_pending(struct netmap_ring *r)
+{
+	uint32_t after_tail = nm_ring_next(r, r->tail);
+
+	return after_tail != r->head;
+}
+
-#define NETMAP_RING_NEXT(r, i) \
- ((i)+1 == (r)->num_slots ? 0 : (i) + 1 )
+/*
+ * Number of slots between ring->cur and ring->tail, walking forward
+ * in the circular ring (0 when cur == tail).
+ */
+static inline uint32_t
+nm_ring_space(struct netmap_ring *ring)
+{
+	int space = ring->tail - ring->cur;
+
+	/* tail is behind cur: account for the wrap-around */
+	return (space < 0) ? space + ring->num_slots : space;
+}
-#define NETMAP_RING_FIRST_RESERVED(r) \
- ( (r)->cur < (r)->reserved ? \
- (r)->cur + (r)->num_slots - (r)->reserved : \
- (r)->cur - (r)->reserved )
+#ifdef NETMAP_WITH_LIBS
/*
- * Return 1 if the given tx ring is empty.
+ * Support for simple I/O libraries.
+ * Include other system headers required for compiling this.
*/
-#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1)
+
+#ifndef HAVE_NETMAP_WITH_LIBS
+#define HAVE_NETMAP_WITH_LIBS
+
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <string.h> /* memset */
+#include <sys/ioctl.h>
+#include <sys/errno.h> /* EINVAL */
+#include <fcntl.h> /* O_RDWR */
+#include <unistd.h> /* close() */
+#include <signal.h>
+#include <stdlib.h>
+
+#ifndef ND /* debug macros */
+/* debug support */
+
+/* ND(): takes the same arguments as D() and expands to an empty statement */
+#define ND(_fmt, ...) do {} while(0)
+
+/*
+ * D(): printf-like debug message on stderr, prefixed by a timestamp
+ * (seconds modulo 1000 and microseconds), the name of the calling
+ * function and the line number; a newline is appended.
+ * NOTE(review): fprintf()/stderr need <stdio.h>, which is not among the
+ * headers included in the visible part of this file -- confirm that
+ * the includer provides it.
+ */
+#define D(_fmt, ...)						\
+	do {							\
+		struct timeval t0;				\
+		gettimeofday(&t0, NULL);			\
+		fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n",	\
+		    (int)(t0.tv_sec % 1000), (int)t0.tv_usec,	\
+		    __FUNCTION__, __LINE__, ##__VA_ARGS__);	\
+	} while (0)
+
+/*
+ * Rate limited version of "D", lps indicates how many per second.
+ * Each expansion has its own static counter, which restarts whenever
+ * the wall-clock second changes; messages beyond the first lps in a
+ * given second are dropped.
+ * lps is parenthesized so that any expression can be passed, and the
+ * remembered second is a time_t so that tv_sec is not truncated.
+ */
+#define RD(lps, format, ...)					\
+	do {							\
+		static time_t t0;				\
+		static int __cnt;				\
+		struct timeval __xxts;				\
+		gettimeofday(&__xxts, NULL);			\
+		if (t0 != __xxts.tv_sec) {			\
+			t0 = __xxts.tv_sec;			\
+			__cnt = 0;				\
+		}						\
+		if (__cnt++ < (lps)) {				\
+			D(format, ##__VA_ARGS__);		\
+		}						\
+	} while (0)
+#endif
+
+/*
+ * Per-packet header handed to the user by nm_dispatch() and
+ * nm_nextpkt(). Those functions copy ts from the ring's ts field and
+ * set both caplen and len to the length stored in the slot.
+ */
+struct nm_pkthdr { /* same as pcap_pkthdr */
+ struct timeval ts;
+ uint32_t caplen;
+ uint32_t len;
+};
+
+/*
+ * Statistics counters, embedded in struct nm_desc as field 'st'.
+ * None of the functions visible in this file updates them.
+ */
+struct nm_stat { /* same as pcap_stat */
+ u_int ps_recv;
+ u_int ps_drop;
+ u_int ps_ifdrop;
+#ifdef WIN32
+ u_int bs_capt;
+#endif /* WIN32 */
+};
+
+#define NM_ERRBUF_SIZE 512
+
+/*
+ * Descriptor returned by nm_open() and consumed by nm_close(),
+ * nm_inject(), nm_dispatch() and nm_nextpkt().
+ * The const-qualified members are written once by nm_open() (through
+ * a cast) and are meant to be read-only afterwards.
+ */
+struct nm_desc {
+ struct nm_desc *self; /* point to self if netmap. */
+ int fd; /* descriptor from open("/dev/netmap") */
+ void *mem; /* shared region: mmap()ed by nm_open() or inherited from the parent */
+ int memsize; /* size of the region pointed to by mem */
+ int done_mmap; /* set if mem is the result of mmap */
+ struct netmap_if * const nifp; /* netmap_if inside mem, at req.nr_offset */
+ /* first/last: range of rings bound by nm_open(); cur: ring where the next scan starts */
+ uint16_t first_tx_ring, last_tx_ring, cur_tx_ring;
+ uint16_t first_rx_ring, last_rx_ring, cur_rx_ring;
+ struct nmreq req; /* also contains the nr_name = ifname */
+ struct nm_pkthdr hdr; /* header passed to the nm_dispatch() callback */
+
+ /*
+ * The memory contains netmap_if, rings and then buffers.
+ * Given a pointer (e.g. to nm_inject) we can compare with
+ * mem/buf_start/buf_end to tell if it is a buffer or
+ * some other descriptor in our region.
+ * We also store a pointer to some ring as it helps in the
+ * translation from buffer indexes to addresses.
+ */
+ struct netmap_ring * const some_ring;
+ void * const buf_start; /* address of buffer 0 */
+ void * const buf_end; /* mem + memsize */
+ /* parameters from pcap_open_live */
+ int snaplen;
+ int promisc;
+ int to_ms;
+ char *errbuf;
+
+ /* save flags so we can restore them on close */
+ uint32_t if_flags;
+ uint32_t if_reqcap;
+ uint32_t if_curcap;
+
+ struct nm_stat st;
+ char msg[NM_ERRBUF_SIZE];
+};
+
+/*
+ * when the descriptor is open correctly, d->self == d
+ * Eventually we should also use some magic number.
+ *
+ * P2NMD(p)		cast any pointer to struct nm_desc *
+ * IS_NETMAP_DESC(d)	true if d is not NULL and d->self points back to d;
+ *			note that d is evaluated more than once
+ * NETMAP_FD(d)		the file descriptor stored in the descriptor
+ */
+#define P2NMD(p) ((struct nm_desc *)(p))
+#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d))
+#define NETMAP_FD(d) (P2NMD(d)->fd)
+
+
+/*
+ * Slightly optimized packet copy, for non overlapping buffers.
+ *
+ * Lengths of 1024 bytes or more are handed to memcpy() as they are.
+ * Shorter lengths are copied in chunks of 64 bytes (eight 64-bit
+ * words per iteration), so l is in effect rounded up to the next
+ * multiple of 64: both buffers must have room for the rounded size.
+ * Nothing is copied when l <= 0.
+ */
+static inline void
+nm_pkt_copy(const void *_src, void *_dst, int l)
+{
+	const uint64_t *from = (const uint64_t *)_src;
+	uint64_t *to = (uint64_t *)_dst;
+
+	if (unlikely(l >= 1024)) {
+		memcpy(to, from, l);
+		return;
+	}
+	while (likely(l > 0)) {
+		/* one 64-byte chunk */
+		to[0] = from[0];
+		to[1] = from[1];
+		to[2] = from[2];
+		to[3] = from[3];
+		to[4] = from[4];
+		to[5] = from[5];
+		to[6] = from[6];
+		to[7] = from[7];
+		from += 8;
+		to += 8;
+		l -= 64;
+	}
+}
+
+
+/*
+ * The callback, invoked on each received packet. Same as libpcap
+ *
+ * nm_dispatch() calls it with, in order: the opaque user pointer it
+ * was given, the packet header stored in the descriptor, and a
+ * pointer to the packet data inside the netmap buffer.
+ */
+typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d);
+
+/*
+ *--- the pcap-like API ---
+ *
+ * nm_open() opens a file descriptor, binds to a port and maps memory.
+ *
+ * ifname (netmap:foo or vale:foo) is the port name
+ * a suffix can indicate the following:
+ * ^ bind the host (sw) ring pair
+ * * bind host and NIC ring pairs (transparent)
+ * -NN bind individual NIC ring pair
+ * {NN bind master side of pipe NN
+ * }NN bind slave side of pipe NN
+ *
+ * req provides the initial values of nmreq before parsing ifname.
+ * Remember that the ifname parsing will override the ring
+ * number in nr_ringid, and nr_flags;
+ * flags special functions, normally 0
+ * indicates which fields of *arg are significant
+ * arg special functions, normally NULL
+ * if passed a struct nm_desc with mem != NULL,
+ * use that memory instead of mmap.
+ */
+
+static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req,
+ uint64_t flags, const struct nm_desc *arg);
+
+/*
+ * nm_open can import some fields from the parent descriptor.
+ * These flags control which ones.
+ * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL,
+ * which set the initial value for these flags.
+ * Note that the 16 low bits of the flags are reserved for data
+ * that may go into the nmreq.
+ */
+enum {
+ NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */
+ NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */
+ NM_OPEN_ARG1 = 0x100000, /* req.nr_arg1 */
+ NM_OPEN_ARG2 = 0x200000, /* req.nr_arg2 */
+ NM_OPEN_ARG3 = 0x400000, /* req.nr_arg3 */
+ NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */
+};
+
+
+/*
+ * nm_close() closes and restores the port to its previous state
+ */
+
+static int nm_close(struct nm_desc *);
+
+/*
+ * nm_inject() is the same as pcap_inject()
+ * nm_dispatch() is the same as pcap_dispatch()
+ * nm_nextpkt() is the same as pcap_next()
+ */
+
+static int nm_inject(struct nm_desc *, const void *, size_t);
+static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *);
+static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *);
+
+
+/*
+ * Try to open, return descriptor if successful, NULL otherwise.
+ * An invalid netmap name (one that starts with neither "netmap:" nor
+ * "vale") will return errno = 0; an allocation failure sets
+ * errno = ENOMEM; any other failure logs a message through D() and
+ * sets errno = EINVAL.
+ *
+ * ifname	port name, optionally followed by one of the suffixes
+ *		-NN, *, ^, {NN, }NN selecting the rings to bind.
+ * req		if not NULL, initial content of the nmreq. nr_version,
+ *		the ring number in nr_ringid, nr_flags and nr_name are
+ *		overwritten here.
+ * new_flags	NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL are copied into
+ *		nr_ringid; the NM_OPEN_* bits are described below.
+ * arg		optional pointer to a pre-filled nm_desc (the parent)
+ *		to add special parameters.
+ *
+ * When arg is a valid descriptor and new_flags is not 0:
+ *	NM_OPEN_ARG1		use req.nr_arg1 from arg (otherwise 4)
+ *	NM_OPEN_ARG2		use req.nr_arg2 from arg (otherwise 0)
+ *	NM_OPEN_ARG3		use req.nr_arg3 from arg (otherwise 0)
+ *	NM_OPEN_RING_CFG	use ring config from arg
+ *	NM_OPEN_IFNAME		use nr_name, nr_ringid, nr_flags from arg
+ *
+ * The memory of arg is reused, instead of calling mmap(), whenever arg
+ * is a valid descriptor with mem != NULL and its nr_arg2 (memory
+ * block) matches the one returned by NIOCREGIF.
+ * NOTE(review): NM_OPEN_NO_MMAP itself is never tested in this
+ * function -- confirm whether that is intended.
+ */
+static struct nm_desc *
+nm_open(const char *ifname, const struct nmreq *req,
+	uint64_t new_flags, const struct nm_desc *arg)
+{
+	struct nm_desc *d = NULL;
+	const struct nm_desc *parent = arg;
+	u_int namelen;
+	uint32_t nr_ringid = 0, nr_flags;
+	uint32_t ring_no;	/* ring number alone, without poll flags */
+	const char *port = NULL;
+	const char *errmsg = NULL;
+
+	if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) {
+		errno = 0; /* name not recognised, not an error */
+		return NULL;
+	}
+	if (ifname[0] == 'n')
+		ifname += 7;
+	/* scan for a separator */
+	for (port = ifname; *port && !strchr("-*^{}", *port); port++)
+		;
+	namelen = port - ifname;
+	if (namelen >= sizeof(d->req.nr_name)) {
+		errmsg = "name too long";
+		goto fail;
+	}
+	switch (*port) {
+	default:  /* '\0', no suffix */
+		nr_flags = NR_REG_ALL_NIC;
+		break;
+	case '-': /* one NIC */
+		nr_flags = NR_REG_ONE_NIC;
+		nr_ringid = atoi(port + 1);
+		break;
+	case '*': /* NIC and SW, ignore port */
+		nr_flags = NR_REG_NIC_SW;
+		if (port[1]) {
+			errmsg = "invalid port for nic+sw";
+			goto fail;
+		}
+		break;
+	case '^': /* only sw ring */
+		nr_flags = NR_REG_SW;
+		if (port[1]) {
+			errmsg = "invalid port for sw ring";
+			goto fail;
+		}
+		break;
+	case '{':
+		nr_flags = NR_REG_PIPE_MASTER;
+		nr_ringid = atoi(port + 1);
+		break;
+	case '}':
+		nr_flags = NR_REG_PIPE_SLAVE;
+		nr_ringid = atoi(port + 1);
+		break;
+	}
+
+	if (nr_ringid >= NETMAP_RING_MASK) {
+		errmsg = "invalid ringid";
+		goto fail;
+	}
+	/*
+	 * Remember the plain ring number: nr_ringid is about to receive
+	 * the poll flags and must not be used as a ring index afterwards.
+	 */
+	ring_no = nr_ringid;
+	/* add the *XPOLL flags */
+	nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL);
+
+	d = (struct nm_desc *)calloc(1, sizeof(*d));
+	if (d == NULL) {
+		errmsg = "nm_desc alloc failure";
+		errno = ENOMEM;
+		return NULL;
+	}
+	d->self = d;	/* set this early so nm_close() works */
+	d->fd = open("/dev/netmap", O_RDWR);
+	if (d->fd < 0) {
+		errmsg = "cannot open /dev/netmap";
+		goto fail;
+	}
+
+	if (req)
+		d->req = *req;
+	d->req.nr_version = NETMAP_API;
+	d->req.nr_ringid &= ~NETMAP_RING_MASK;
+
+	/* these fields are overridden by ifname and flags processing */
+	d->req.nr_ringid |= nr_ringid;
+	d->req.nr_flags = nr_flags;
+	memcpy(d->req.nr_name, ifname, namelen);
+	d->req.nr_name[namelen] = '\0';
+	/* optionally import info from parent */
+	if (IS_NETMAP_DESC(parent) && new_flags) {
+		if (new_flags & NM_OPEN_ARG1)
+			D("overriding ARG1 %d", parent->req.nr_arg1);
+		d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ?
+			parent->req.nr_arg1 : 4;
+		if (new_flags & NM_OPEN_ARG2)
+			D("overriding ARG2 %d", parent->req.nr_arg2);
+		d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ?
+			parent->req.nr_arg2 : 0;
+		if (new_flags & NM_OPEN_ARG3)
+			D("overriding ARG3 %d", parent->req.nr_arg3);
+		d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ?
+			parent->req.nr_arg3 : 0;
+		if (new_flags & NM_OPEN_RING_CFG) {
+			D("overriding RING_CFG");
+			d->req.nr_tx_slots = parent->req.nr_tx_slots;
+			d->req.nr_rx_slots = parent->req.nr_rx_slots;
+			d->req.nr_tx_rings = parent->req.nr_tx_rings;
+			d->req.nr_rx_rings = parent->req.nr_rx_rings;
+		}
+		if (new_flags & NM_OPEN_IFNAME) {
+			D("overriding ifname %s ringid 0x%x flags 0x%x",
+				parent->req.nr_name, parent->req.nr_ringid,
+				parent->req.nr_flags);
+			memcpy(d->req.nr_name, parent->req.nr_name,
+				sizeof(d->req.nr_name));
+			d->req.nr_ringid = parent->req.nr_ringid;
+			d->req.nr_flags = parent->req.nr_flags;
+		}
+	}
+	if (ioctl(d->fd, NIOCREGIF, &d->req)) {
+		errmsg = "NIOCREGIF failed";
+		goto fail;
+	}
+
+	if (IS_NETMAP_DESC(parent) && parent->mem &&
+	    parent->req.nr_arg2 == d->req.nr_arg2) {
+		/* do not mmap, inherit from parent */
+		d->memsize = parent->memsize;
+		d->mem = parent->mem;
+	} else {
+		d->memsize = d->req.nr_memsize;
+		d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED,
+				d->fd, 0);
+		/* mmap() reports failure with MAP_FAILED, not with NULL */
+		if (d->mem == MAP_FAILED) {
+			d->mem = NULL;	/* nothing to unmap */
+			errmsg = "mmap failed";
+			goto fail;
+		}
+		d->done_mmap = 1;
+	}
+	{
+		struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset);
+		/* rx ring 0: any ring will do to translate buffer indexes */
+		struct netmap_ring *r = NETMAP_RXRING(nifp, 0);
+
+		/* one-time initialization of the const members */
+		*(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp;
+		*(struct netmap_ring **)(uintptr_t)&d->some_ring = r;
+		*(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0);
+		*(void **)(uintptr_t)&d->buf_end =
+			(char *)d->mem + d->memsize;
+	}
+
+	/*
+	 * NOTE(review): the ring range below follows the suffix parsed
+	 * from ifname even when NM_OPEN_IFNAME replaced nr_flags and
+	 * nr_ringid with the parent's values -- confirm this is intended.
+	 */
+	if (nr_flags == NR_REG_SW) { /* host stack */
+		d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings;
+		d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings;
+	} else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */
+		d->first_tx_ring = 0;
+		d->first_rx_ring = 0;
+		d->last_tx_ring = d->req.nr_tx_rings - 1;
+		d->last_rx_ring = d->req.nr_rx_rings - 1;
+	} else if (nr_flags == NR_REG_NIC_SW) {
+		d->first_tx_ring = 0;
+		d->first_rx_ring = 0;
+		d->last_tx_ring = d->req.nr_tx_rings;
+		d->last_rx_ring = d->req.nr_rx_rings;
+	} else if (nr_flags == NR_REG_ONE_NIC) {
+		/* XXX check validity */
+		d->first_tx_ring = d->last_tx_ring =
+		d->first_rx_ring = d->last_rx_ring = ring_no;
+	} else { /* pipes */
+		d->first_tx_ring = d->last_tx_ring = 0;
+		d->first_rx_ring = d->last_rx_ring = 0;
+	}
+
+#ifdef DEBUG_NETMAP_USER
+    { /* debugging code */
+	int i;
+
+	D("%s tx %d .. %d %d rx %d .. %d %d", ifname,
+		d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings,
+		d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings);
+	for (i = 0; i <= d->req.nr_tx_rings; i++) {
+		struct netmap_ring *r = NETMAP_TXRING(d->nifp, i);
+		D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail);
+	}
+	for (i = 0; i <= d->req.nr_rx_rings; i++) {
+		struct netmap_ring *r = NETMAP_RXRING(d->nifp, i);
+		D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail);
+	}
+    }
+#endif /* debugging */
+
+	d->cur_tx_ring = d->first_tx_ring;
+	d->cur_rx_ring = d->first_rx_ring;
+	return d;
+
+fail:
+	/* d may still be NULL here; nm_close() rejects that harmlessly */
+	nm_close(d);
+	if (errmsg)
+		D("%s %s", errmsg, ifname);
+	errno = EINVAL;
+	return NULL;
+}
+
+
+/*
+ * Release a descriptor obtained from nm_open(): unmap the memory
+ * region if this descriptor mmap()ed it, close the file descriptor,
+ * then wipe and free the descriptor itself.
+ * Returns 0 on success, EINVAL if d is NULL or is not a valid
+ * descriptor (d->self != d).
+ */
+static int
+nm_close(struct nm_desc *d)
+{
+	/*
+	 * ugly trick to avoid unused warnings
+	 */
+	static void *__xxzt[] __attribute__ ((unused)) =
+		{ (void *)nm_open, (void *)nm_inject,
+		  (void *)nm_dispatch, (void *)nm_nextpkt } ;
+
+	if (!IS_NETMAP_DESC(d))
+		return EINVAL;
+
+	if (d->done_mmap) {
+		/* the region is ours, not inherited from a parent */
+		if (d->mem)
+			munmap(d->mem, d->memsize);
+	}
+	if (d->fd != -1)
+		close(d->fd);
+	/* clearing 'self' too makes a stale pointer fail the validity test */
+	memset(d, 0, sizeof(*d));
+	free(d);
+	return 0;
+}
+
+
+/*
+ * Same prototype as pcap_inject(), only need to cast.
+ *
+ * Copy the size bytes at buf into the current slot of a tx ring and
+ * advance head and cur of that ring past it. The bound tx rings are
+ * scanned circularly starting from d->cur_tx_ring, each one exactly
+ * once, and rings for which nm_ring_empty() is true are skipped.
+ * The ring that took the packet becomes the new d->cur_tx_ring.
+ *
+ * Returns size on success, 0 if the packet is larger than a netmap
+ * buffer or no ring could take it.
+ */
+static int
+nm_inject(struct nm_desc *d, const void *buf, size_t size)
+{
+	u_int c, n = d->last_tx_ring - d->first_tx_ring + 1;
+
+	for (c = 0; c < n ; c++) {
+		/* compute current ring to use */
+		struct netmap_ring *ring;
+		uint32_t i, idx;
+		uint32_t ri = d->cur_tx_ring + c;
+
+		/*
+		 * Wrap around by the number of rings: resetting to the
+		 * first ring would skip the rings below cur_tx_ring and
+		 * visit the first one repeatedly.
+		 */
+		if (ri > d->last_tx_ring)
+			ri -= n;
+		ring = NETMAP_TXRING(d->nifp, ri);
+		if (nm_ring_empty(ring)) {
+			continue;
+		}
+		/* never write past the end of the netmap buffer */
+		if (size > ring->nr_buf_size)
+			continue;
+		i = ring->cur;
+		idx = ring->slot[i].buf_idx;
+		ring->slot[i].len = size;
+		nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size);
+		d->cur_tx_ring = ri;
+		ring->head = ring->cur = nm_ring_next(ring, i);
+		return size;
+	}
+	return 0; /* fail */
+}
+
+
+/*
+ * Same prototype as pcap_dispatch(), only need to cast.
+ *
+ * Invoke cb(arg, header, data) on the packets available in the bound
+ * rx rings, releasing each slot (head and cur are advanced) after the
+ * callback returns. The rings are scanned circularly starting from
+ * d->cur_rx_ring, each one exactly once; the last ring visited becomes
+ * the new d->cur_rx_ring.
+ *
+ * cnt	maximum number of packets to process; 0 and -1 mean no limit.
+ *
+ * Returns the number of packets passed to the callback.
+ */
+static int
+nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg)
+{
+	int n = d->last_rx_ring - d->first_rx_ring + 1;
+	int c, got = 0, ri = d->cur_rx_ring;
+
+	if (cnt == 0)
+		cnt = -1;
+	/* cnt == -1 means infinite, but rings have a finite amount
+	 * of buffers and the int is large enough that we never wrap,
+	 * so we can omit checking for -1
+	 */
+	for (c=0; c < n && cnt != got; c++) {
+		/* compute current ring to use */
+		struct netmap_ring *ring;
+
+		ri = d->cur_rx_ring + c;
+		/*
+		 * Wrap around by the number of rings: resetting to the
+		 * first ring would skip the rings below cur_rx_ring and
+		 * visit the first one repeatedly.
+		 */
+		if (ri > d->last_rx_ring)
+			ri -= n;
+		ring = NETMAP_RXRING(d->nifp, ri);
+		for ( ; !nm_ring_empty(ring) && cnt != got; got++) {
+			u_int i = ring->cur;
+			u_int idx = ring->slot[i].buf_idx;
+			u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
+
+			// __builtin_prefetch(buf);
+			d->hdr.len = d->hdr.caplen = ring->slot[i].len;
+			d->hdr.ts = ring->ts;
+			cb(arg, &d->hdr, buf);
+			ring->head = ring->cur = nm_ring_next(ring, i);
+		}
+	}
+	d->cur_rx_ring = ri;
+	return got;
+}
+
+/*
+ * Return a pointer to the next received packet and fill *hdr with its
+ * timestamp and length, or return NULL if none of the bound rx rings
+ * has a packet. The scan starts from d->cur_rx_ring and goes once
+ * around the bound rings; the ring that supplied the packet becomes
+ * the new d->cur_rx_ring.
+ * The slot is released (head is advanced together with cur) before
+ * returning.
+ */
+static u_char *
+nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr)
+{
+	const int start = d->cur_rx_ring;
+	int ri = start;
+
+	for (;;) {
+		/* compute current ring to use */
+		struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri);
+		u_int slot_idx, buf_idx;
+		u_char *pkt;
+
+		if (nm_ring_empty(ring)) {
+			/* move to the next ring, stop after a full turn */
+			if (++ri > d->last_rx_ring)
+				ri = d->first_rx_ring;
+			if (ri == start)
+				break;
+			continue;
+		}
+
+		slot_idx = ring->cur;
+		buf_idx = ring->slot[slot_idx].buf_idx;
+		pkt = (u_char *)NETMAP_BUF(ring, buf_idx);
+
+		// __builtin_prefetch(pkt);
+		hdr->ts = ring->ts;
+		hdr->len = hdr->caplen = ring->slot[slot_idx].len;
+		ring->cur = nm_ring_next(ring, slot_idx);
+		/* we could postpone advancing head if we want
+		 * to hold the buffer. This can be supported in
+		 * the future.
+		 */
+		ring->head = ring->cur;
+		d->cur_rx_ring = ri;
+		return pkt;
+	}
+	return NULL; /* nothing found */
+}
+
+#endif /* !HAVE_NETMAP_WITH_LIBS */
+
+#endif /* NETMAP_WITH_LIBS */
#endif /* _NET_NETMAP_USER_H_ */
OpenPOWER on IntegriCloud