/* Snapshot of sys/net/netmap.h (blob a5ee9b55edc96a4d0a31f85efb0c320523b029f6) */
/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $FreeBSD$
 *
 * Definitions of constants and the structures used by the netmap
 * framework, for the part visible to both kernel and userspace.
 * Detailed info on netmap is available with "man netmap" or at
 *
 *	http://info.iet.unipi.it/~luigi/netmap/
 *
 * This API is also used to communicate with the VALE software switch
 */

#ifndef _NET_NETMAP_H_
#define _NET_NETMAP_H_

#define	NETMAP_API	10		/* current API version */

/*
 * Some fields should be cache-aligned to reduce contention.
 * The alignment is architecture and OS dependent, but rather than
 * digging into OS headers to find the exact value we use an estimate
 * that should cover most architectures.
 */
#define NM_CACHE_ALIGN	128

/*
 * --- Netmap data structures ---
 *
 * The userspace data structures used by netmap are shown below.
 * They are allocated by the kernel and mmap()ed by userspace threads.
 * Pointers are implemented as memory offsets or indexes,
 * so that they can be easily dereferenced in kernel and userspace.

   KERNEL (opaque, obviously)

  ====================================================================
                                         |
   USERSPACE                             |      struct netmap_ring
                                         +---->+---------------+
                                             / | head,cur,tail |
   struct netmap_if (nifp, 1 per fd)        /  | buf_ofs       |
    +---------------+                      /   | other fields  |
    | ni_tx_rings   |                     /    +===============+
    | ni_rx_rings   |                    /     | buf_idx, len  | slot[0]
    |               |                   /      | flags, ptr    |
    |               |                  /       +---------------+
    +===============+                 /        | buf_idx, len  | slot[1]
    | txring_ofs[0] | (rel.to nifp)--'         | flags, ptr    |
    | txring_ofs[1] |                          +---------------+
  (tx+1+extra_tx entries)                     (num_slots entries)
    | txring_ofs[t] |                          | buf_idx, len  | slot[n-1]
    +---------------+                          | flags, ptr    |
    | rxring_ofs[0] |                          +---------------+
    | rxring_ofs[1] |
  (rx+1+extra_rx entries)
    | rxring_ofs[r] |
    +---------------+

 * For each "interface" (NIC, host stack, VALE switch port) attached to a
 * file descriptor, the mmap()ed region contains a (logically readonly)
 * struct netmap_if pointing to struct netmap_ring's.
 * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
 * pair attached to the host stack (this pair is unused for VALE ports).
 *
 * All physical/host stack ports share the same memory region,
 * so that zero-copy can be implemented between them.
 * VALE switch ports instead have separate memory regions.
 *
 * The netmap_ring is the userspace-visible replica of the NIC ring.
 * Each slot has the index of a buffer (MTU-sized and residing in the
 * mmapped region), its length and some flags. An extra 64-bit pointer
 * is provided for user-supplied buffers in the tx path.
 *
 * In user space, the buffer address is computed as
 *	(char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
 */

/*
 * struct netmap_slot is a buffer descriptor: one entry of the slot[]
 * array that trails struct netmap_ring. It names a buffer by index
 * and carries the length and the NS_* flags for that buffer.
 */
struct netmap_slot {
	uint32_t buf_idx;	/* buffer index; set NS_BUF_CHANGED when modified */
	uint16_t len;		/* length for this slot */
	uint16_t flags;		/* NS_* flags; on VALE the high 8 bits hold a port (tx) or fragment count (rx) */
	uint64_t ptr;		/* userspace buffer address, used with NS_INDIRECT */
};

/*
 * The following flags control how the slot is used
 */

#define	NS_BUF_CHANGED	0x0001	/* buf_idx changed */
	/*
	 * must be set whenever buf_idx is changed (as it might be
	 * necessary to recompute the physical address and mapping)
	 */

#define	NS_REPORT	0x0002	/* ask the hardware to report results */
	/*
	 * Request notification when the slot is used by the hardware.
	 * Normally transmit completions are handled lazily and
	 * may be unreported. This flag lets us know when a slot
	 * has been sent (e.g. to terminate the sender).
	 */

#define	NS_FORWARD	0x0004	/* pass packet 'forward' */
	/*
	 * (Only for physical ports, rx rings with NR_FORWARD set).
	 * Slots released to the kernel (i.e. before ring->head) with
	 * this flag set are passed to the peer ring (host/NIC),
	 * thus restoring the host-NIC connection for these slots.
	 * This supports efficient traffic monitoring or firewalling.
	 */

#define	NS_NO_LEARN	0x0008	/* disable bridge learning */
 	/*
	 * On a VALE switch, do not 'learn' the source port for
	 * this buffer.
	 */

#define	NS_INDIRECT	0x0010	/* userspace buffer */
 	/*
	 * (VALE tx rings only) data is in a userspace buffer,
	 * whose address is in the 'ptr' field in the slot.
	 */

#define	NS_MOREFRAG	0x0020	/* packet has more fragments */
 	/*
	 * (VALE ports only)
	 * Set on all but the last slot of a multi-segment packet.
	 * The 'len' field refers to the individual fragment.
	 */

#define	NS_PORT_SHIFT	8
#define	NS_PORT_MASK	(0xff << NS_PORT_SHIFT)
	/*
	 * The high 8 bits of the flags field, if not zero, indicate
	 * the destination port for the VALE switch, overriding
	 * the lookup table.
	 */

#define	NS_RFRAGS(_slot)	( ((_slot)->flags >> 8) & 0xff)
	/*
	 * (VALE rx rings only) the high 8 bits
	 * are the number of fragments.
	 * These are the same bits selected by NS_PORT_MASK; _slot is
	 * a pointer to a slot and is evaluated once.
	 */


/*
 * struct netmap_ring
 *
 * Netmap representation of a TX or RX ring (also known as "queue").
 * This is a queue implemented as a fixed-size circular array.
 * At the software level the important fields are: head, cur, tail.
 *
 * In TX rings:
 *
 *	head	first slot available for transmission.
 *	cur	wakeup point. select() and poll() will unblock
 *		when 'tail' moves past 'cur'
 *	tail	(readonly) first slot reserved to the kernel
 *
 *	[head .. tail-1] can be used for new packets to send;
 *	'head' and 'cur' must be incremented as slots are filled
 *	    with new packets to be sent;
 *	'cur' can be moved further ahead if we need more space
 *	for new transmissions.
 *
 * In RX rings:
 *
 *	head	first valid received packet
 *	cur	wakeup point. select() and poll() will unblock
 *		when 'tail' moves past 'cur'
 *	tail	(readonly) first slot reserved to the kernel
 *
 *	[head .. tail-1] contain received packets;
 *	'head' and 'cur' must be incremented as slots are consumed
 *		and can be returned to the kernel;
 *	'cur' can be moved further ahead if we want to wait for
 *		new packets without returning the previous ones.
 *
 * DATA OWNERSHIP/LOCKING:
 *	The netmap_ring, and all slots and buffers in the range
 *	[head .. tail-1] are owned by the user program;
 *	the kernel only accesses them during a netmap system call
 *	and in the user thread context.
 *
 *	Other slots and buffers are reserved for use by the kernel
 */
struct netmap_ring {
	/*
	 * buf_ofs is meant to be used through macros.
	 * It contains the offset of the buffer region from this
	 * descriptor.
	 */
	const int64_t	buf_ofs;
	const uint32_t	num_slots;	/* number of slots in the ring. */
	const uint32_t	nr_buf_size;	/* NOTE(review): presumably the size of each buffer -- confirm */
	const uint16_t	ringid;		/* NOTE(review): presumably the index of this ring -- confirm */
	const uint16_t	dir;		/* 0: tx, 1: rx */

	/*
	 * Ring pointers. Fields marked (u) are advanced by the user
	 * program; fields marked (k) are updated by the kernel and
	 * are readonly for the user program.
	 */
	uint32_t        head;		/* (u) first user slot */
	uint32_t        cur;		/* (u) wakeup point */
	uint32_t	tail;		/* (k) first kernel slot */

	uint32_t	flags;		/* NR_* ring flags (NR_TIMESTAMP, NR_FORWARD) */

	struct timeval	ts;		/* (k) time of last *sync() */

	/* opaque room for a mutex or similar object */
	uint8_t		sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));

	/* the slots follow. This struct has variable size */
	struct netmap_slot slot[0];	/* array of slots (num_slots entries). */
};


/*
 * RING FLAGS
 */
#define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
	/*
	 * updates the 'ts' field on each netmap syscall. This
	 * saves a separate gettimeofday(), and is not much worse than
	 * software timestamps generated in the interrupt handler.
	 */

#define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
 	/*
	 * Enables the NS_FORWARD slot flag for the ring.
	 */


/*
 * Netmap representation of an interface and its queue(s).
 * This is initialized by the kernel when binding a file
 * descriptor to a port, and should be considered as readonly
 * by user programs. The kernel never uses it.
 *
 * There is one netmap_if for each file descriptor on which we want
 * to select/poll.
 * select/poll operates on one or all ring pairs depending on the value
 * of nr_ringid passed on the ioctl.
 */
struct netmap_if {
	char		ni_name[IFNAMSIZ]; /* name of the interface. */
	const uint32_t	ni_version;	/* API version, currently unused */
	const uint32_t	ni_flags;	/* properties */
#define	NI_PRIV_MEM	0x1		/* private memory region */

	/*
	 * The number of packet rings available in netmap mode.
	 * Physical NICs can have different numbers of tx and rx rings.
	 * Physical NICs also have a 'host' ring pair.
	 * Additionally, clients can request additional ring pairs to
	 * be used for internal communication.
	 */
	const uint32_t	ni_tx_rings;	/* number of HW tx rings */
	const uint32_t	ni_rx_rings;	/* number of HW rx rings */

	const uint32_t	ni_extra_tx_rings;	/* number of extra tx rings */
	const uint32_t	ni_extra_rx_rings;	/* number of extra rx rings */
	/*
	 * The following array contains the offset of each netmap ring
	 * from this structure, in the following order:
	 * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
	 * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
	 *
	 * The area is filled up by the kernel on NIOCREGIF,
	 * and then only read by userspace code.
	 */
	const ssize_t	ring_ofs[0];
};


#ifndef NIOCREGIF
/*
 * ioctl names and related fields
 *
 * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
 *	whose identity is set in NIOCREGIF through nr_ringid.
 *	These are non blocking and take no argument.
 *
 * NIOCGINFO takes a struct ifreq, the interface name is the input,
 *	the outputs are number of queues and number of descriptor
 *	for each queue (useful to set number of threads etc.).
 *	The info returned is only advisory and may change before
 *	the interface is bound to a file descriptor.
 *
 * NIOCREGIF takes an interface name within a struct nmreq,
 *	and activates netmap mode on the interface (if possible).
 *
 * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
 * can pass it down to other NIC-related ioctls.
 *
 * The actual argument (struct nmreq) has a number of options to request
 * different functions.
 *
 * nr_name	(in)
 *	The name of the port (em0, valeXXX:YYY, etc.)
 *	limited to IFNAMSIZ for backward compatibility.
 *
 * nr_version	(in/out)
 *	Must match NETMAP_API as used in the kernel, error otherwise.
 *	Always returns the desired value on output.
 *
 * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
 *	On input, non-zero values may be used to reconfigure the port
 *	according to the requested values, but this is not guaranteed.
 *	On output the actual values in use are reported.
 *
 * nr_ringid (in)
 *	Indicates how rings should be bound to the file descriptors.
 *	0 (default)			binds all physical rings
 *	NETMAP_HW_RING | ring number	binds a single ring pair
 *	NETMAP_SW_RING			binds only the host tx/rx rings
 *
 *	NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
 *		packets on tx rings only if POLLOUT is set.
 *		The default is to push any pending packet.
 *
 *	NETMAP_PRIV_MEM is set on return for ports that use private
 *		memory regions and cannot use buffer swapping.
 *
 * nr_cmd (in)	if non-zero indicates a special command:
 *	NETMAP_BDG_ATTACH	 and nr_name = vale*:ifname
 *		attaches the NIC to the switch; nr_ringid specifies
 *		which rings to use. Used by vale-ctl -a ...
 *	    nr_arg1 = NETMAP_BDG_HOST also attaches the host port
 *		as in vale-ctl -h ...
 *
 *	NETMAP_BDG_DETACH	and nr_name = vale*:ifname
 *		disconnects a previously attached NIC.
 *		Used by vale-ctl -d ...
 *
 *	NETMAP_BDG_LIST
 *		list the configuration of VALE switches.
 *
 *	NETMAP_BDG_OFFSET	XXX ?
 *		Set the offset of data in packets. Used with VALE
 *		switches where the clients use the vhost header.
 *
 * nr_arg1, nr_arg2 (in/out)		command specific
 *
 */


/*
 * struct nmreq overlays a struct ifreq.
 *
 * It is the argument of the NIOCGINFO and NIOCREGIF ioctls.
 * The #defines interleaved with the fields name the flags and
 * values used with the field declared just before them.
 */
struct nmreq {
	char		nr_name[IFNAMSIZ];	/* port name (em0, valeXXX:YYY, etc.) */
	uint32_t	nr_version;	/* API version */
	uint32_t	nr_offset;	/* nifp offset in the shared region */
	uint32_t	nr_memsize;	/* size of the shared region */
	uint32_t	nr_tx_slots;	/* slots in tx rings */
	uint32_t	nr_rx_slots;	/* slots in rx rings */
	uint16_t	nr_tx_rings;	/* number of tx rings */
	uint16_t	nr_rx_rings;	/* number of rx rings */
	uint16_t	nr_ringid;	/* ring(s) we care about */
#define NETMAP_PRIV_MEM	0x8000		/* rings use private memory */
#define NETMAP_HW_RING	0x4000		/* low bits indicate one hw ring */
#define NETMAP_SW_RING	0x2000		/* process the sw ring */
#define NETMAP_NO_TX_POLL	0x1000	/* no automatic txsync on poll */
#define NETMAP_RING_MASK 0xfff		/* the ring number */

	uint16_t	nr_cmd;		/* if non-zero, a special command: */
#define NETMAP_BDG_ATTACH	1	/* attach the NIC */
#define NETMAP_BDG_DETACH	2	/* detach the NIC */
#define NETMAP_BDG_LOOKUP_REG	3	/* register lookup function */
#define NETMAP_BDG_LIST		4	/* get bridge's info */
#define NETMAP_BDG_OFFSET       5       /* set the port offset */

	uint16_t	nr_arg1;	/* command specific argument */
#define NETMAP_BDG_HOST		1	/* attach the host stack on ATTACH */
	/* NOTE(review): presumably the largest offset accepted by NETMAP_BDG_OFFSET -- confirm */
#define NETMAP_BDG_MAX_OFFSET	12

	uint16_t	nr_arg2;	/* command specific argument */
	uint32_t	spare2[3];	/* spares, to ease compatibility with other versions */
};


/*
 * FreeBSD uses the size value embedded in the _IOWR to determine
 * how much to copy in/out. So we need it to match the actual
 * data structure we pass. We put some spares in the structure
 * to ease compatibility with other versions
 */
#define NIOCGINFO	_IOWR('i', 145, struct nmreq) /* return IF info */
#define NIOCREGIF	_IOWR('i', 146, struct nmreq) /* interface register */
#define NIOCTXSYNC	_IO('i', 148) /* sync tx queues, takes no argument */
#define NIOCRXSYNC	_IO('i', 149) /* sync rx queues, takes no argument */
#endif /* !NIOCREGIF */


/*
 * Helper functions for kernel and userspace
 */

/*
 * Tell whether the ring is empty from the caller's point of view:
 * returns non-zero when 'cur' has reached 'tail' (the first slot
 * reserved to the kernel), and 0 while slots are left in between.
 */
static inline int
nm_ring_empty(struct netmap_ring *ring)
{
	if (ring->cur != ring->tail)
		return (0);

	return (1);
}

#endif /* _NET_NETMAP_H_ */
/* OpenPOWER on IntegriCloud */