summaryrefslogtreecommitdiffstats
path: root/sys/netinet/ip_dummynet.h
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet/ip_dummynet.h')
-rw-r--r--sys/netinet/ip_dummynet.h467
1 files changed, 178 insertions, 289 deletions
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index 3a193e9..0bbc326 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
* Portions Copyright (c) 2000 Akamba Corp.
* All rights reserved
*
@@ -31,257 +31,124 @@
#define _IP_DUMMYNET_H
/*
- * Definition of dummynet data structures. In the structures, I decided
- * not to use the macros in <sys/queue.h> in the hope of making the code
- * easier to port to other architectures. The type of lists and queue we
- * use here is pretty simple anyways.
- */
-
-/*
- * We start with a heap, which is used in the scheduler to decide when
- * to transmit packets etc.
- *
- * The key for the heap is used for two different values:
+ * Definition of the kernel-userland API for dummynet.
*
- * 1. timer ticks- max 10K/second, so 32 bits are enough;
+ * Setsockopt() and getsockopt() pass a batch of objects, each
+ * of them starting with a "struct dn_id" which should fully identify
+ * the object and its relation with others in the sequence.
+ * The first object in each request should have
+ * type= DN_CMD_*, id = DN_API_VERSION.
+ * For other objects, type and subtype specify the object, len indicates
+ * the total length including the header, and 'id' identifies the specific
+ * object.
*
- * 2. virtual times. These increase in steps of len/x, where len is the
- * packet length, and x is either the weight of the flow, or the
- * sum of all weights.
- * If we limit to max 1000 flows and a max weight of 100, then
- * x needs 17 bits. The packet size is 16 bits, so we can easily
- * overflow if we do not allow errors.
- * So we use a key "dn_key" which is 64 bits. Some macros are used to
- * compare key values and handle wraparounds.
- * MAX64 returns the largest of two key values.
- * MY_M is used as a shift count when doing fixed point arithmetic
- * (a better name would be useful...).
+ * Most objects are numbered with an identifier in the range 1..65535.
+ * DN_MAX_ID indicates the first value outside the range.
*/
-typedef u_int64_t dn_key ; /* sorting key */
-#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
-#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
-#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0)
-#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0)
-#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
-#define MY_M 16 /* number of left shift to obtain a larger precision */
-/*
- * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
- * virtual time wraps every 15 days.
- */
+#define DN_API_VERSION 12500000
+#define DN_MAX_ID 0x10000
+struct dn_id {
+ uint16_t len; /* total obj len including this header */
+ uint8_t type;
+ uint8_t subtype;
+ uint32_t id; /* generic id */
+};
/*
- * The maximum hash table size for queues. This value must be a power
- * of 2.
- */
-#define DN_MAX_HASH_SIZE 65536
-
-/*
- * A heap entry is made of a key and a pointer to the actual
- * object stored in the heap.
- * The heap is an array of dn_heap_entry entries, dynamically allocated.
- * Current size is "size", with "elements" actually in use.
- * The heap normally supports only ordered insert and extract from the top.
- * If we want to extract an object from the middle of the heap, we
- * have to know where the object itself is located in the heap (or we
- * need to scan the whole array). To this purpose, an object has a
- * field (int) which contains the index of the object itself into the
- * heap. When the object is moved, the field must also be updated.
- * The offset of the index in the object is stored in the 'offset'
- * field in the heap descriptor. The assumption is that this offset
- * is non-zero if we want to support extract from the middle.
+ * These values are in the type field of struct dn_id.
+ * To preserve the ABI, never rearrange the list or delete
+ * entries with the exception of DN_LAST
*/
-struct dn_heap_entry {
- dn_key key ; /* sorting key. Topmost element is smallest one */
- void *object ; /* object pointer */
+enum {
+ DN_NONE = 0,
+ DN_LINK = 1,
+ DN_FS,
+ DN_SCH,
+ DN_SCH_I,
+ DN_QUEUE,
+ DN_DELAY_LINE,
+ DN_PROFILE,
+ DN_FLOW, /* struct dn_flow */
+ DN_TEXT, /* opaque text is the object */
+
+ DN_CMD_CONFIG = 0x80, /* objects follow */
+ DN_CMD_DELETE, /* subtype + list of entries */
+ DN_CMD_GET, /* subtype + list of entries */
+ DN_CMD_FLUSH,
+ /* for compatibility with FreeBSD 7.2/8 */
+ DN_COMPAT_PIPE,
+ DN_COMPAT_QUEUE,
+ DN_GET_COMPAT,
+
+ /* special commands for emulation of sysctl variables */
+ DN_SYSCTL_GET,
+ DN_SYSCTL_SET,
+
+ DN_LAST,
} ;
-struct dn_heap {
- int size ;
- int elements ;
- int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
- struct dn_heap_entry *p ; /* really an array of "size" entries */
+enum { /* subtype for schedulers, flowset and the like */
+ DN_SCHED_UNKNOWN = 0,
+ DN_SCHED_FIFO = 1,
+ DN_SCHED_WF2QP = 2,
+ /* others are in individual modules */
} ;
-#ifdef _KERNEL
-/*
- * Packets processed by dummynet have an mbuf tag associated with
- * them that carries their dummynet state. This is used within
- * the dummynet code as well as outside when checking for special
- * processing requirements.
- * Note that the first part is the reinject info and is common to
- * other forms of packet reinjection.
- */
-struct dn_pkt_tag {
- struct ipfw_rule_ref rule; /* matching rule */
-
- /* second part, dummynet specific */
- int dn_dir; /* action when packet comes out. */
- /* see ip_fw_private.h */
-
- dn_key output_time; /* when the pkt is due for delivery */
- struct ifnet *ifp; /* interface, for ip_output */
- struct _ip6dn_args ip6opt; /* XXX ipv6 options */
+enum { /* user flags */
+ DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */
+ DN_NOERROR = 0x0002, /* do not report errors */
+ DN_QHT_HASH = 0x0004, /* qht is a hash table */
+ DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */
+ DN_HAS_PROFILE = 0x0010, /* a link has a profile */
+ DN_IS_RED = 0x0020,
+ DN_IS_GENTLE_RED= 0x0040,
+ DN_PIPE_CMD = 0x1000, /* pipe config... */
};
-#endif /* _KERNEL */
/*
- * Overall structure of dummynet (with WF2Q+):
-
-In dummynet, packets are selected with the firewall rules, and passed
-to two different objects: PIPE or QUEUE.
-
-A QUEUE is just a queue with configurable size and queue management
-policy. It is also associated with a mask (to discriminate among
-different flows), a weight (used to give different shares of the
-bandwidth to different flows) and a "pipe", which essentially
-supplies the transmit clock for all queues associated with that
-pipe.
-
-A PIPE emulates a fixed-bandwidth link, whose bandwidth is
-configurable. The "clock" for a pipe can come from either an
-internal timer, or from the transmit interrupt of an interface.
-A pipe is also associated with one (or more, if masks are used)
-queue, where all packets for that pipe are stored.
-
-The bandwidth available on the pipe is shared by the queues
-associated with that pipe (only one in case the packet is sent
-to a PIPE) according to the WF2Q+ scheduling algorithm and the
-configured weights.
-
-In general, incoming packets are stored in the appropriate queue,
-which is then placed into one of a few heaps managed by a scheduler
-to decide when the packet should be extracted.
-The scheduler (a function called dummynet()) is run at every timer
-tick, and grabs queues from the head of the heaps when they are
-ready for processing.
-
-There are three data structures definining a pipe and associated queues:
-
- + dn_pipe, which contains the main configuration parameters related
- to delay and bandwidth;
- + dn_flow_set, which contains WF2Q+ configuration, flow
- masks, plr and RED configuration;
- + dn_flow_queue, which is the per-flow queue (containing the packets)
-
-Multiple dn_flow_set can be linked to the same pipe, and multiple
-dn_flow_queue can be linked to the same dn_flow_set.
-All data structures are linked in a linear list which is used for
-housekeeping purposes.
-
-During configuration, we create and initialize the dn_flow_set
-and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
-
-At runtime: packets are sent to the appropriate dn_flow_set (either
-WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
-which in turn dispatches them to the appropriate dn_flow_queue
-(created dynamically according to the masks).
-
-The transmit clock for fixed rate flows (ready_event()) selects the
-dn_flow_queue to be used to transmit the next packet. For WF2Q,
-wfq_ready_event() extract a pipe which in turn selects the right
-flow using a number of heaps defined into the pipe itself.
-
- *
+ * link template.
*/
+struct dn_link {
+ struct dn_id oid;
-/*
- * per flow queue. This contains the flow identifier, the queue
- * of packets, counters, and parameters used to support both RED and
- * WF2Q+.
- *
- * A dn_flow_queue is created and initialized whenever a packet for
- * a new flow arrives.
- */
-struct dn_flow_queue {
- struct dn_flow_queue *next ;
- struct ipfw_flow_id id ;
-
- struct mbuf *head, *tail ; /* queue of packets */
- u_int len ;
- u_int len_bytes ;
-
- /*
- * When we emulate MAC overheads, or channel unavailability due
- * to other traffic on a shared medium, we augment the packet at
- * the head of the queue with an 'extra_bits' field representsing
- * the additional delay the packet will be subject to:
- * extra_bits = bw*unavailable_time.
- * With large bandwidth and large delays, extra_bits (and also numbytes)
- * can become very large, so better play safe and use 64 bit
- */
- uint64_t numbytes ; /* credit for transmission (dynamic queues) */
- int64_t extra_bits; /* extra bits simulating unavailable channel */
-
- u_int64_t tot_pkts ; /* statistics counters */
- u_int64_t tot_bytes ;
- u_int32_t drops ;
-
- int hash_slot ; /* debugging/diagnostic */
-
- /* RED parameters */
- int avg ; /* average queue length est. (scaled) */
- int count ; /* arrivals since last RED drop */
- int random ; /* random value (scaled) */
- dn_key idle_time; /* start of queue idle time */
-
- /* WF2Q+ support */
- struct dn_flow_set *fs ; /* parent flow set */
- int heap_pos ; /* position (index) of struct in heap */
- dn_key sched_time ; /* current time when queue enters ready_heap */
-
- dn_key S,F ; /* start time, finish time */
/*
- * Setting F < S means the timestamp is invalid. We only need
- * to test this when the queue is empty.
+ * Userland sets bw and delay in bits/s and milliseconds.
+ * The kernel converts this back and forth to bits/tick and ticks.
+ * XXX what about burst ?
*/
+ int32_t link_nr;
+ int bandwidth; /* bit/s or bits/tick. */
+ int delay; /* ms and ticks */
+ uint64_t burst; /* scaled. bits*Hz XXX */
} ;
/*
- * flow_set descriptor. Contains the "template" parameters for the
- * queue configuration, and pointers to the hash table of dn_flow_queue's.
- *
- * The hash table is an array of lists -- we identify the slot by
- * hashing the flow-id, then scan the list looking for a match.
- * The size of the hash table (buckets) is configurable on a per-queue
- * basis.
- *
- * A dn_flow_set is created whenever a new queue or pipe is created (in the
- * latter case, the structure is located inside the struct dn_pipe).
+ * A flowset, which is a template for flows. Contains parameters
+ * from the command line: id, target scheduler, queue sizes, plr,
+ * flow masks, buckets for the flow hash, and possibly scheduler-
+ * specific parameters (weight, quantum and so on).
*/
-struct dn_flow_set {
- SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
-
- u_short fs_nr ; /* flow_set number */
- u_short flags_fs;
-#define DN_HAVE_FLOW_MASK 0x0001
-#define DN_IS_RED 0x0002
-#define DN_IS_GENTLE_RED 0x0004
-#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
-#define DN_NOERROR 0x0010 /* do not report ENOBUFS on drops */
-#define DN_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
-#define DN_IS_PIPE 0x4000
-#define DN_IS_QUEUE 0x8000
-
- struct dn_pipe *pipe ; /* pointer to parent pipe */
- u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
-
- int weight ; /* WFQ queue weight */
+struct dn_fs {
+ struct dn_id oid;
+ uint32_t fs_nr; /* the flowset number */
+ uint32_t flags; /* userland flags */
int qsize ; /* queue size in slots or bytes */
- int plr ; /* pkt loss rate (2^31-1 means 100%) */
+ int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */
+ uint32_t buckets; /* buckets used for the queue hash table */
struct ipfw_flow_id flow_mask ;
-
- /* hash table of queues onto this flow_set */
- int rq_size ; /* number of slots */
- int rq_elements ; /* active elements */
- struct dn_flow_queue **rq; /* array of rq_size entries */
-
- u_int32_t last_expired ; /* do not expire too frequently */
- int backlogged ; /* #active queues for this flowset */
-
- /* RED parameters */
+ uint32_t sched_nr; /* the scheduler we attach to */
+ /* generic scheduler parameters. Leave them at -1 if unset.
+ * Now we use 0: weight, 1: lmax, 2: priority
+ */
+ int par[4];
+
+ /* RED/GRED parameters.
+ * weight and probabilities are in the range 0..1 represented
+ * in fixed point arithmetic with SCALE_RED decimal bits.
+ */
#define SCALE_RED 16
#define SCALE(x) ( (x) << SCALE_RED )
#define SCALE_VAL(x) ( (x) >> SCALE_RED )
@@ -290,85 +157,107 @@ struct dn_flow_set {
int max_th ; /* maximum threshold for queue (scaled) */
int min_th ; /* minimum threshold for queue (scaled) */
int max_p ; /* maximum value for p_b (scaled) */
- u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
- u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
- u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
- u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
- u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
- u_int lookup_depth ; /* depth of lookup table */
- int lookup_step ; /* granularity inside the lookup table */
- int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
- int avg_pkt_size ; /* medium packet size */
- int max_pkt_size ; /* max packet size */
+
};
-SLIST_HEAD(dn_flow_set_head, dn_flow_set);
/*
- * Pipe descriptor. Contains global parameters, delay-line queue,
- * and the flow_set used for fixed-rate queues.
- *
- * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
- * not_eligible_heap, for queues whose start time is higher
- * than the virtual time. Sorted by start time.
- * scheduler_heap, for queues eligible for scheduling. Sorted by
- * finish time.
- * idle_heap, all flows that are idle and can be removed. We
- * do that on each tick so we do not slow down too much
- * operations during forwarding.
- *
+ * dn_flow collects flow_id and stats for queues and scheduler
+ * instances, and is used to pass these info to userland.
+ * oid.type/oid.subtype describe the object, oid.id is number
+ * of the parent object.
*/
-struct dn_pipe { /* a pipe */
- SLIST_ENTRY(dn_pipe) next; /* linked list in a hash slot */
-
- int pipe_nr ; /* number */
- int bandwidth; /* really, bytes/tick. */
- int delay ; /* really, ticks */
-
- struct mbuf *head, *tail ; /* packets in delay line */
-
- /* WF2Q+ */
- struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
- struct dn_heap not_eligible_heap; /* top extract- key Start time */
- struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
-
- dn_key V ; /* virtual time */
- int sum; /* sum of weights of all active sessions */
-
- /* Same as in dn_flow_queue, numbytes can become large */
- int64_t numbytes; /* bits I can transmit (more or less). */
- uint64_t burst; /* burst size, scaled: bits * hz */
+struct dn_flow {
+ struct dn_id oid;
+ struct ipfw_flow_id fid;
+ uint64_t tot_pkts; /* statistics counters */
+ uint64_t tot_bytes;
+ uint32_t length; /* Queue lenght, in packets */
+ uint32_t len_bytes; /* Queue lenght, in bytes */
+ uint32_t drops;
+};
- dn_key sched_time ; /* time pipe was scheduled in ready_heap */
- dn_key idle_time; /* start of pipe idle time */
/*
- * When the tx clock come from an interface (if_name[0] != '\0'), its name
- * is stored below, whereas the ifp is filled when the rule is configured.
+ * Scheduler template, mostly indicating the name, number,
+ * sched_mask and buckets.
*/
- char if_name[IFNAMSIZ];
- struct ifnet *ifp ;
- int ready ; /* set if ifp != NULL and we got a signal from it */
+struct dn_sch {
+ struct dn_id oid;
+ uint32_t sched_nr; /* N, scheduler number */
+ uint32_t buckets; /* number of buckets for the instances */
+ uint32_t flags; /* have_mask, ... */
+
+ char name[16]; /* null terminated */
+ /* mask to select the appropriate scheduler instance */
+ struct ipfw_flow_id sched_mask; /* M */
+};
- struct dn_flow_set fs ; /* used with fixed-rate flows */
+/* A delay profile is attached to a link.
+ * Note that a profile, as any other object, cannot be longer than 2^16
+ */
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_profile {
+ struct dn_id oid;
/* fields to simulate a delay profile */
-
#define ED_MAX_NAME_LEN 32
char name[ED_MAX_NAME_LEN];
+ int link_nr;
int loss_level;
- int samples_no;
- int *samples;
+ int bandwidth; // XXX use link bandwidth?
+ int samples_no; /* actual length of samples[] */
+ int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
};
-/* dn_pipe_max is used to pass pipe configuration from userland onto
- * kernel space and back
- */
-#define ED_MAX_SAMPLES_NO 1024
-struct dn_pipe_max {
- struct dn_pipe pipe;
- int samples[ED_MAX_SAMPLES_NO];
-};
-SLIST_HEAD(dn_pipe_head, dn_pipe);
+
+/*
+ * Overall structure of dummynet
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE (bad name).
+
+A QUEUE defines a classifier, which groups packets into flows
+according to a 'mask', puts them into independent queues (one
+per flow) with configurable size and queue management policy,
+and passes flows to a scheduler:
+
+ (flow_mask|sched_mask) sched_mask
+ +---------+ weight Wx +-------------+
+ | |->-[flow]-->--| |-+
+ -->--| QUEUE x | ... | | |
+ | |->-[flow]-->--| SCHEDuler N | |
+ +---------+ | | |
+ ... | +--[LINK N]-->--
+ +---------+ weight Wy | | +--[LINK N]-->--
+ | |->-[flow]-->--| | |
+ -->--| QUEUE y | ... | | |
+ | |->-[flow]-->--| | |
+ +---------+ +-------------+ |
+ +-------------+
+
+Many QUEUE objects can connect to the same scheduler, each
+QUEUE object can have its own set of parameters.
+
+In turn, the SCHEDuler 'forks' multiple instances according
+to a 'sched_mask', each instance manages its own set of queues
+and transmits on a private instance of a configurable LINK.
+
+A PIPE is a simplified version of the above, where there
+is no flow_mask, and each scheduler instance handles a single queue.
+
+The following data structures (visible from userland) describe
+the objects used by dummynet:
+
+ + dn_link, contains the main configuration parameters related
+ to delay and bandwidth;
+ + dn_profile describes a delay profile;
+ + dn_flow describes the flow status (flow id, statistics)
+
+ + dn_sch describes a scheduler
+ + dn_fs describes a flowset (msk, weight, queue parameters)
+
+ *
+ */
#endif /* _IP_DUMMYNET_H */
OpenPOWER on IntegriCloud