summaryrefslogtreecommitdiffstats
path: root/sys/net/netmap_user.h
diff options
context:
space:
mode:
authorluigi <luigi@FreeBSD.org>2014-01-06 12:53:15 +0000
committerluigi <luigi@FreeBSD.org>2014-01-06 12:53:15 +0000
commit41068e3dad4b13c5f9eb42f4dc310e9abf19de43 (patch)
tree529a5d218d5f4d073c5ad30a4b484d1b412ea226 /sys/net/netmap_user.h
parentb421f931ee549ea10e588f82e7cc1df4c24e2c2a (diff)
downloadFreeBSD-src-41068e3dad4b13c5f9eb42f4dc310e9abf19de43.zip
FreeBSD-src-41068e3dad4b13c5f9eb42f4dc310e9abf19de43.tar.gz
It is 2014 and we have a new version of netmap.
Most relevant features: - netmap emulation on any NIC, even those without native netmap support. On the ixgbe we have measured about 4Mpps/core/queue in this mode, which is still a lot more than with sockets/bpf. - seamless interconnection of VALE switch, NICs and host stack. If you disable accelerations on your NIC (say em0) ifconfig em0 -txcsum -txcsum you can use the VALE switch to connect the NIC and the host stack: vale-ctl -h valeXX:em0 allowing sharing the NIC with other netmap clients. - THE USER API HAS SLIGHTLY CHANGED (head/cur/tail pointers instead of pointers/count as before). This was unavoidable to support, in the future, multiple threads operating on the same rings. Netmap clients require very small source code changes to compile again. On the plus side, the new API should be easier to understand and the internals are a lot simpler. The manual page has been updated extensively to reflect the current features and give some examples. This is the result of work of several people including Giuseppe Lettieri, Vincenzo Maffione, Michio Honda and myself, and has been financially supported by EU projects CHANGE and OPENLAB, from NetApp University Research Fund, NEC, and of course the Universita` di Pisa.
Diffstat (limited to 'sys/net/netmap_user.h')
-rw-r--r--sys/net/netmap_user.h189
1 files changed, 153 insertions, 36 deletions
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
index 3f28583..bd6fe0d 100644
--- a/sys/net/netmap_user.h
+++ b/sys/net/netmap_user.h
@@ -1,6 +1,5 @@
/*
- * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
- * Copyright (C) 2013 Universita` di Pisa
+ * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -28,8 +27,8 @@
/*
* $FreeBSD$
*
- * This header contains the macros used to manipulate netmap structures
- * and packets in userspace. See netmap(4) for more information.
+ * Functions and macros to manipulate netmap structures and packets
+ * in userspace. See netmap(4) for more information.
*
* The address of the struct netmap_if, say nifp, is computed from the
* value returned from ioctl(.., NIOCREG, ...) and the mmap region:
@@ -44,17 +43,20 @@
* we can access ring->nr_cur, ring->nr_avail, ring->nr_flags
*
* ring->slot[i] gives us the i-th slot (we can access
- * directly plen, flags, bufindex)
+ * directly len, flags, buf_idx)
*
* char *buf = NETMAP_BUF(ring, x) returns a pointer to
* the buffer numbered x
*
- * Since rings are circular, we have macros to compute the next index
- * i = NETMAP_RING_NEXT(ring, i);
+ * All ring indexes (head, cur, tail) should always move forward.
+ * To compute the next index in a circular ring you can use
+ * i = nm_ring_next(ring, i);
*
* To ease porting apps from pcap to netmap we supply a few fuctions
- * that can be called to open, close and read from netmap in a way
- * similar to libpcap.
+ * that can be called to open, close, read and write on netmap in a way
+ * similar to libpcap. Note that the read/write function depend on
+ * an ioctl()/select()/poll() being issued to refill rings or push
+ * packets out.
*
* In order to use these, include #define NETMAP_WITH_LIBS
* in the source file that invokes these functions.
@@ -65,12 +67,19 @@
#include <stdint.h>
#include <net/if.h> /* IFNAMSIZ */
+
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif /* likely and unlikely */
+
#include <net/netmap.h>
+/* helper macro */
#define _NETMAP_OFFSET(type, ptr, offset) \
((type)(void *)((char *)(ptr) + (offset)))
-#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o)
+#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs)
#define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \
nifp, (nifp)->ring_ofs[index] )
@@ -85,18 +94,34 @@
( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \
(ring)->nr_buf_size )
-#define NETMAP_RING_NEXT(r, i) \
- ((i)+1 == (r)->num_slots ? 0 : (i) + 1 )
-#define NETMAP_RING_FIRST_RESERVED(r) \
- ( (r)->cur < (r)->reserved ? \
- (r)->cur + (r)->num_slots - (r)->reserved : \
- (r)->cur - (r)->reserved )
+static inline uint32_t
+nm_ring_next(struct netmap_ring *r, uint32_t i)
+{
+ return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1);
+}
+
/*
- * Return 1 if the given tx ring is empty.
+ * Return 1 if we have pending transmissions in the tx ring.
+ * When everything is complete ring->cur = ring->tail + 1 (modulo ring size)
*/
-#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1)
+static inline int
+nm_tx_pending(struct netmap_ring *r)
+{
+ return nm_ring_next(r, r->tail) != r->cur;
+}
+
+
+static inline uint32_t
+nm_ring_space(struct netmap_ring *ring)
+{
+ int ret = ring->tail - ring->cur;
+ if (ret < 0)
+ ret += ring->num_slots;
+ return ret;
+}
+
#ifdef NETMAP_WITH_LIBS
/*
@@ -113,7 +138,12 @@
#include <sys/ioctl.h>
#include <sys/errno.h> /* EINVAL */
#include <fcntl.h> /* O_RDWR */
-#include <malloc.h>
+#include <unistd.h> /* close() */
+#ifdef __FreeBSD__
+#include <stdlib.h>
+#else
+#include <malloc.h> /* on FreeBSD it is stdlib.h */
+#endif
struct nm_hdr_t { /* same as pcap_pkthdr */
struct timeval ts;
@@ -139,30 +169,73 @@ struct nm_desc_t {
#define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d))
#define NETMAP_FD(d) (P2NMD(d)->fd)
+
+/*
+ * this is a slightly optimized copy routine which rounds
+ * to multiple of 64 bytes and is often faster than dealing
+ * with other odd sizes. We assume there is enough room
+ * in the source and destination buffers.
+ *
+ * XXX only for multiples of 64 bytes, non overlapped.
+ */
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+ const uint64_t *src = _src;
+ uint64_t *dst = _dst;
+ if (unlikely(l >= 1024)) {
+ memcpy(dst, src, l);
+ return;
+ }
+ for (; likely(l > 0); l-=64) {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
+}
+
+
/*
* The callback, invoked on each received packet. Same as libpcap
*/
typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d);
/*
- * The open routine accepts an ifname (netmap:foo or vale:foo) and
- * optionally a second (string) argument indicating the ring number
+ *--- the pcap-like API ---
+ *
+ * nm_open() opens a file descriptor, binds to a port and maps memory.
+ *
+ * ifname (netmap:foo or vale:foo) is the port name
+ * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc.
+ * ring_no only used if NETMAP_HW_RING is specified, is interpreted
+ * as a string or integer indicating the ring number
+ * ring_flags is stored in all ring flags (e.g. for transparent mode)
* to open. If successful, t opens the fd and maps the memory.
*/
+
static struct nm_desc_t *nm_open(const char *ifname,
const char *ring_no, int flags, int ring_flags);
/*
- * nm_dispatch() is the same as pcap_dispatch()
- * nm_next() is the same as pcap_next()
+ * nm_close() closes and restores the port to its previous state
*/
-static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
-static u_char *nm_next(struct nm_desc_t *, struct nm_hdr_t *);
+
+static int nm_close(struct nm_desc_t *);
/*
- * unmap memory, close file descriptor and free the descriptor.
+ * nm_inject() is the same as pcap_inject()
+ * nm_dispatch() is the same as pcap_dispatch()
+ * nm_nextpkt() is the same as pcap_next()
*/
-static int nm_close(struct nm_desc_t *);
+
+static int nm_inject(struct nm_desc_t *, const void *, size_t);
+static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *);
+static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *);
/*
@@ -240,6 +313,12 @@ fail:
static int
nm_close(struct nm_desc_t *d)
{
+ /*
+ * ugly trick to avoid unused warnings
+ */
+ static void *__xxzt[] __attribute__ ((unused)) =
+ { nm_open, nm_inject, nm_dispatch, nm_nextpkt } ;
+
if (d == NULL || d->self != d)
return EINVAL;
if (d->mem)
@@ -253,9 +332,45 @@ nm_close(struct nm_desc_t *d)
/*
+ * Same prototype as pcap_inject(), only need to cast.
+ */
+static int
+nm_inject(struct nm_desc_t *d, const void *buf, size_t size)
+{
+ u_int c, n = d->last_ring - d->first_ring + 1;
+
+ if (0) fprintf(stderr, "%s rings %d %d %d\n", __FUNCTION__,
+ d->first_ring, d->cur_ring, d->last_ring);
+ for (c = 0; c < n ; c++) {
+ /* compute current ring to use */
+ struct netmap_ring *ring;
+ uint32_t i, idx;
+ uint32_t ri = d->cur_ring + c;
+
+ if (ri > d->last_ring)
+ ri = d->first_ring;
+ ring = NETMAP_TXRING(d->nifp, ri);
+ if (nm_ring_empty(ring)) {
+ if (0) fprintf(stderr, "%s ring %d cur %d tail %d\n",
+ __FUNCTION__,
+ ri, ring->cur, ring->tail);
+ continue;
+ }
+ i = ring->cur;
+ idx = ring->slot[i].buf_idx;
+ ring->slot[i].len = size;
+ pkt_copy(buf, NETMAP_BUF(ring, idx), size);
+ d->cur_ring = ri;
+ ring->head = ring->cur = nm_ring_next(ring, i);
+ return size;
+ }
+ return 0; /* fail */
+}
+
+
+/*
* Same prototype as pcap_dispatch(), only need to cast.
*/
-inline /* not really, but disable unused warnings */
static int
nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
{
@@ -276,7 +391,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
if (ri > d->last_ring)
ri = d->first_ring;
ring = NETMAP_RXRING(d->nifp, ri);
- for ( ; ring->avail > 0 && cnt != got; got++) {
+ for ( ; !nm_ring_empty(ring) && cnt != got; got++) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
@@ -285,24 +400,22 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg)
d->hdr.len = d->hdr.caplen = ring->slot[i].len;
d->hdr.ts = ring->ts;
cb(arg, &d->hdr, buf);
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
+ ring->head = ring->cur = nm_ring_next(ring, i);
}
}
d->cur_ring = ri;
return got;
}
-inline /* not really, but disable unused warnings */
static u_char *
-nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr)
+nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr)
{
int ri = d->cur_ring;
do {
/* compute current ring to use */
struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri);
- if (ring->avail > 0) {
+ if (!nm_ring_empty(ring)) {
u_int i = ring->cur;
u_int idx = ring->slot[i].buf_idx;
u_char *buf = (u_char *)NETMAP_BUF(ring, idx);
@@ -310,8 +423,12 @@ nm_next(struct nm_desc_t *d, struct nm_hdr_t *hdr)
// prefetch(buf);
hdr->ts = ring->ts;
hdr->len = hdr->caplen = ring->slot[i].len;
- ring->cur = NETMAP_RING_NEXT(ring, i);
- ring->avail--;
+ ring->cur = nm_ring_next(ring, i);
+ /* we could postpone advancing head if we want
+ * to hold the buffer. This can be supported in
+ * the future.
+ */
+ ring->head = ring->cur;
d->cur_ring = ri;
return buf;
}
OpenPOWER on IntegriCloud